diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c
index 8d065f135..904f0c9e7 100644
--- a/frame/compat/bla_scal.c
+++ b/frame/compat/bla_scal.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNCSCAL
-#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
+#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
 \
 void PASTEF772S(chx,cha,blasname) \
      ( \
@@ -50,44 +50,42 @@ void PASTEF772S(chx,cha,blasname) \
      ) \
 { \
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
-	dim_t    n0; \
-	ftype_x* x0; \
-	inc_t    incx0; \
-	ftype_x  alpha_cast; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
 \
-	if (*n == 0 || alpha == NULL) { \
+	dim_t n0 = (dim_t)(*n); \
+	ftype_x *x0 = x; \
+	inc_t incx0 = (inc_t)(*incx); \
+\
+	if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
+	{ \
 		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
 		return ; \
 	} \
-\
-	/* Convert/typecast negative values of n to zero. */ \
-	bli_convert_blas_dim1( *n, n0 ); \
-\
-	/* If the input increments are negative, adjust the pointers so we can
-	   use positive increments instead. */ \
-	bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
 \
 	/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
 	   that is, we just always sub-optimally implement those cases
 	   by casting alpha to ctype_x (potentially the complex domain) and
 	   using the homogeneous datatype instance according to that type. */ \
+	ftype_x  alpha_cast; \
 	PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
 \
 	/* Call BLIS interface. */ \
+	/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
 	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
-	  n0, \
+	  -n0, \
 	  &alpha_cast, \
 	  x0, incx0, \
 	  NULL, \
 	  NULL  \
 	); \
 \
-  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
 }\
diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c
index 837b3f62a..7dad24a3c 100644
--- a/frame/compat/bla_scal_amd.c
+++ b/frame/compat/bla_scal_amd.c
@@ -50,13 +50,16 @@
 
   1. When alpha == NaN - Propogate the NaN to the vector
   2. When alpha == 0 - Perform the SCALV operation completely and don't use setv.
+     As SCALV kernels are used in many other BLAS APIs where we want setv to be
+     used in this scenario, here we call the kernels with n=-n to signify that
+     setv should not be used.
 */
 
 //
 // Define BLAS-to-BLIS interfaces.
 //
 #undef  GENTFUNCSCAL
-#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
+#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
 \
 void PASTEF772S(chx,cha,blasname) \
      ( \
@@ -66,55 +69,42 @@ void PASTEF772S(chx,cha,blasname) \
      ) \
 { \
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
-	dim_t    n0; \
-	ftype_x* x0; \
-	inc_t    incx0; \
-	ftype_x  alpha_cast; \
 \
 	/* Initialize BLIS. */ \
 	bli_init_auto(); \
 \
-	if (*n == 0 || alpha == NULL) { \
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
-		return ; \
-	} \
+	dim_t n0 = (dim_t)(*n); \
+	ftype_x *x0 = x; \
+	inc_t incx0 = (inc_t)(*incx); \
 \
-	/* Convert/typecast negative values of n to zero. */ \
-	bli_convert_blas_dim1( *n, n0 ); \
-\
-	/* If the input increments are less than or equal to zero, return. */ \
-	if ( (*incx) <= 0 ) { \
+	if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
+	{ \
 		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
 		return ; \
-	} else { \
-		incx0 = ( inc_t )(*incx); \
-		x0 = (x); \
 	} \
 \
 	/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
 	   that is, we just always sub-optimally implement those cases
 	   by casting alpha to ctype_x (potentially the complex domain) and
 	   using the homogeneous datatype instance according to that type. */ \
+	ftype_x  alpha_cast; \
 	PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
-\
-	/* If alpha is a unit scalar, return early. */ \
-	if ( PASTEMAC(c, eq1)(alpha_cast) ) { \
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
-		return ; \
-	} \
 \
 	/* Call BLIS interface. */ \
+	/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
 	PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
 	( \
 	  BLIS_NO_CONJUGATE, \
-	  n0, \
+	  -n0, \
 	  &alpha_cast, \
 	  x0, incx0, \
 	  NULL, \
 	  NULL  \
 	); \
 \
-  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
 }\
@@ -139,82 +129,72 @@ void sscal_blis_impl
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
     AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
-    dim_t  n0;
-    float* x0;
-    inc_t  incx0;
+
     /* Initialize BLIS. */
     //bli_init_auto();
 
-    if ((*n) <= 0 || alpha == NULL || bli_seq1(*alpha))
+    dim_t n0 = (dim_t)(*n);
+    float *x0 = x;
+    inc_t incx0 = (inc_t)(*incx);
+
+    /*
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
+    */
+    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha))
     {
-      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-      return;
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
+        return;
     }
 
-    /* Convert/typecast negative values of n to zero. */
-    if ( *n < 0 ) n0 = ( dim_t )0;
-    else          n0 = ( dim_t )(*n);
-
-    /* If the input increments are less than or equal to zero, return. */
-    if ( (*incx) <= 0 )
-    {
-      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-      return ;
-    }
-    else
-    {
-      x0    = (x);
-      incx0 = ( inc_t )(*incx);
-    }
+    // Definition of function pointer
+    sscalv_ker_ft scalv_ker_ptr;
 
     cntx_t *cntx = NULL;
 
     // Query the architecture ID
     arch_t id = bli_arch_query_id();
 
-    /*
-      Function pointer declaration for the function
-      that will be used by this API
-    */
-    sscalv_ker_ft scalv_ker_ptr; // DSCALV
-
     // Pick the kernel based on the architecture ID
     switch (id)
     {
-      case BLIS_ARCH_ZEN5:
-      case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN5:
+        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-        scalv_ker_ptr = bli_sscalv_zen_int_avx512;
-
-        break;
+          scalv_ker_ptr = bli_sscalv_zen_int_avx512;
+          break;
 #endif
-      case BLIS_ARCH_ZEN:
-      case BLIS_ARCH_ZEN2:
-      case BLIS_ARCH_ZEN3:
-        scalv_ker_ptr = bli_sscalv_zen_int10;
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:
+          scalv_ker_ptr = bli_sscalv_zen_int10;
+          break;
 
-        break;
-      default:
+        default:
 
-        // For non-Zen architectures, query the context
-        cntx = bli_gks_query_cntx();
+          // For non-Zen architectures, query the context
+          cntx = bli_gks_query_cntx();
 
-        // Query the context for the kernel function pointers for sscalv
-        scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
+          // Query the context for the kernel function pointers for sscalv
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
     }
 
+    // Invoke the function based on the kernel function pointer
+    // Pass size as negative to stipulate don't use SETV when alpha=0
     scalv_ker_ptr
     (
       BLIS_NO_CONJUGATE,
-      n0,
+      -n0,
       (float *)alpha,
       x0, incx0,
       cntx
     );
 
-    /* Finalize BLIS. */
-    //    bli_finalize_auto();
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 }
 #ifdef BLIS_ENABLE_BLAS
 void sscal_
@@ -224,7 +204,7 @@ void sscal_
        float*   x, const f77_int* incx
      )
 {
-  sscal_blis_impl( n, alpha, x, incx );
+    sscal_blis_impl( n, alpha, x, incx );
 }
 #endif
 void dscal_blis_impl
@@ -236,65 +216,54 @@ void dscal_blis_impl
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
     AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
-    dim_t  n_elem;
-#ifdef BLIS_ENABLE_OPENMP
-    dim_t  ST_THRESH = 30000;
-#endif
-    double* x0;
-    inc_t  incx0;
 
-    /* Initialize BLIS  */
+    /* Initialize BLIS. */
     //bli_init_auto();
 
-    /* Convert typecast negative values of n to zero. */
-    if ( *n < 0 ) n_elem = ( dim_t )0;
-    else          n_elem = ( dim_t )(*n);
+    dim_t n0 = (dim_t)(*n);
+    double *x0 = x;
+    inc_t incx0 = (inc_t)(*incx);
 
     /*
       Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
       Return early when alpha pointer is NULL - BLIS exception
     */
-    if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha))
+    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
     {
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
         return;
     }
 
-    /* If the input increments are less than or equal to zero, return. */
-    if ( (*incx) <= 0 )
-    {
-      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-      return ;
-    }
-    else
-    {
-      x0    = (x);
-      incx0 = ( inc_t )(*incx);
-    }
-
-     // Definition of function pointer
+    // Definition of function pointer
     dscalv_ker_ft scalv_ker_ptr;
 
     cntx_t *cntx = NULL;
 
+#ifdef BLIS_ENABLE_OPENMP
+    dim_t ST_THRESH = 30000;
+#endif
+
     // Query the architecture ID
-    arch_t arch_id_local = bli_arch_query_id();
+    arch_t id = bli_arch_query_id();
 
     // Pick the kernel based on the architecture ID
-    switch (arch_id_local)
+    switch (id)
     {
-      case BLIS_ARCH_ZEN5:
-      case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN5:
+        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-        scalv_ker_ptr = bli_dscalv_zen_int_avx512;
+          // AVX512 Kernel
+          scalv_ker_ptr = bli_dscalv_zen_int_avx512;
   #ifdef BLIS_ENABLE_OPENMP
-        ST_THRESH = 30000;
+          ST_THRESH = 30000;
   #endif
-        break;
+          break;
 #endif
-      case BLIS_ARCH_ZEN:
-      case BLIS_ARCH_ZEN2:
-      case BLIS_ARCH_ZEN3:
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:
 
           // AVX2 Kernel
           scalv_ker_ptr = bli_dscalv_zen_int10;
@@ -303,9 +272,9 @@ void dscal_blis_impl
 #endif
           break;
 
-      default:
+        default:
 
-          // Query the context
+          // For non-Zen architectures, query the context
           cntx = bli_gks_query_cntx();
 
           // Query the function pointer using the context
@@ -315,25 +284,28 @@ void dscal_blis_impl
 
 #ifdef BLIS_ENABLE_OPENMP
     /*
-      If the optimial number of threads is 1, the OpenMP and
-      'bli_nthreads_l1'overheads are avoided by calling the
+      If the optimal number of threads is 1, the OpenMP and
+      'bli_nthreads_l1' overheads are avoided by calling the
       function directly. This ensures that performance of dscalv
       does not drop for single  thread when OpenMP is enabled.
     */
-    if (n_elem <= ST_THRESH)
+    if (n0 <= ST_THRESH)
     {
 #endif
+        // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
         scalv_ker_ptr
         (
           BLIS_NO_CONJUGATE,
-          n_elem,
+          -n0,
           (double *)alpha,
           x0, incx0,
           cntx
         );
 
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
         return;
 #ifdef BLIS_ENABLE_OPENMP
     }
@@ -354,14 +326,14 @@ void dscal_blis_impl
       BLIS_SCALV_KER,
       BLIS_DOUBLE,
       BLIS_DOUBLE,
-      arch_id_local,
-      n_elem,
+      id,
+      n0,
       &nt
     );
 
     _Pragma("omp parallel num_threads(nt)")
     {
-        dim_t start, end, length; 
+        dim_t start, end, length;
         thrinfo_t thrinfo_vec;
 
         // The block size is the minimum factor, whose multiple will ensure that only
@@ -383,7 +355,7 @@ void dscal_blis_impl
         bli_thread_range_sub
         (
           &thrinfo_vec,
-          n_elem,
+          n0,
           block_size,
           FALSE,
           &start,
@@ -396,22 +368,21 @@ void dscal_blis_impl
         double *x_thread_local = x0 + (start * incx0);
 
         // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
         scalv_ker_ptr
         (
           BLIS_NO_CONJUGATE,
-          length,
+          -length,
           (double *)alpha,
           x_thread_local, incx0,
           cntx
         );
     }
 
-    /* Finalize BLIS. */
-    // bli_finalize_auto();
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 #endif
-
 }
 #ifdef BLIS_ENABLE_BLAS
 void dscal_
@@ -421,7 +392,7 @@ void dscal_
        double*   x, const f77_int* incx
      )
 {
-  dscal_blis_impl( n, alpha, x, incx );
+    dscal_blis_impl( n, alpha, x, incx );
 }
 #endif
 void zdscal_blis_impl
@@ -433,19 +404,23 @@ void zdscal_blis_impl
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
     AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *) alpha, *n, *incx );
-    dim_t  n_elem = (dim_t)(*n);
-    dcomplex* x0 = x;
-    inc_t  incx0 = (inc_t)(*incx);
+
     /* Initialize BLIS. */
     //bli_init_auto();
 
+    dim_t  n0 = (dim_t)(*n);
+    dcomplex* x0 = x;
+    inc_t  incx0 = (inc_t)(*incx);
+
     /*
-        When n is zero or the alpha pointer passed is null
-        or the incx is zero or alpha is 1, return early.
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
     */
-    if ((n_elem <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
+    if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
     {
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
         return;
     }
 
@@ -458,30 +433,34 @@ void zdscal_blis_impl
 
     cntx_t *cntx = NULL;
 
+#ifdef BLIS_ENABLE_OPENMP
+    dim_t ST_THRESH = 10000;
+#endif
+
     // Query the architecture ID
-    arch_t arch_id_local = bli_arch_query_id();
+    arch_t id = bli_arch_query_id();
 
     // Pick the kernel based on the architecture ID
-    switch (arch_id_local)
+    switch (id)
     {
-      case BLIS_ARCH_ZEN5:
-      case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN5:
+        case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
           // AVX512 Kernel
           scalv_ker_ptr = bli_zdscalv_zen_int_avx512;
           break;
 #endif
-      case BLIS_ARCH_ZEN:
-      case BLIS_ARCH_ZEN2:
-      case BLIS_ARCH_ZEN3:
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:
 
           // AVX2 Kernel
           scalv_ker_ptr = bli_zdscalv_zen_int10;
           break;
 
-      default:
+        default:
 
-          // Query the context
+          // For non-Zen architectures, query the context
           cntx = bli_gks_query_cntx();
 
           // Query the function pointer using the context
@@ -489,6 +468,32 @@ void zdscal_blis_impl
     }
 
 #ifdef BLIS_ENABLE_OPENMP
+    /*
+      If the optimal number of threads is 1, the OpenMP and
+      'bli_nthreads_l1' overheads are avoided by calling the
+      function directly. This ensures that performance of dscalv
+      does not drop for single  thread when OpenMP is enabled.
+    */
+    if (n0 <= ST_THRESH)
+    {
+#endif
+        // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
+        scalv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          -n0,
+          (dcomplex *)&alpha_cast,
+          x0, incx0,
+          cntx
+        );
+
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
+        return;
+#ifdef BLIS_ENABLE_OPENMP
+    }
 
     /*
       Initializing the number of thread to one
@@ -506,33 +511,11 @@ void zdscal_blis_impl
       BLIS_SCALV_KER,
       BLIS_DCOMPLEX,
       BLIS_DOUBLE,
-      arch_id_local,
-      n_elem,
+      id,
+      n0,
       &nt
     );
 
-    /*
-      If the number of optimum threads is 1, the OpenMP overhead
-      is avoided by calling the function directly
-    */
-    if (nt == 1)
-    {
-#endif
-        scalv_ker_ptr
-        (
-          BLIS_NO_CONJUGATE,
-          n_elem,
-          (dcomplex *)&alpha_cast,
-          x0, incx0,
-          cntx
-        );
-
-        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-
-        return;
-#ifdef BLIS_ENABLE_OPENMP
-    }
-
     _Pragma("omp parallel num_threads(nt)")
     {
         dim_t start, length;
@@ -549,7 +532,7 @@ void zdscal_blis_impl
         */
         bli_thread_vector_partition
         (
-          n_elem,
+          n0,
           nt_use,
           &start, &length,
           thread_id
@@ -559,18 +542,21 @@ void zdscal_blis_impl
         dcomplex *x_thread_local = x0 + (start * incx0);
 
         // Invoke the function based on the kernel function pointer
+        // Pass size as negative to stipulate don't use SETV when alpha=0
         scalv_ker_ptr
         (
           BLIS_NO_CONJUGATE,
-          length,
+          -length,
           (dcomplex *)&alpha_cast,
           x_thread_local, incx0,
           cntx
         );
     }
-#endif
 
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
+#endif
 }
 #ifdef BLIS_ENABLE_BLAS
 void zdscal_
@@ -594,22 +580,27 @@ void cscal_blis_impl
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
     AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', (void *)alpha, *n, *incx);
 
+    /* Initialize BLIS. */
+    //bli_init_auto();
+
     dim_t n0 = (dim_t)(*n);
     scomplex *x0 = x;
     inc_t incx0 = (inc_t)(*incx);
 
     /*
-        When n is zero or the alpha pointer passed is null
-        or the incx is zero or alpha is 1, return early.
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
     */
     if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha))
     {
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
         return;
     }
 
     // Definition of function pointer
-    cscalv_ker_ft scalv_fun_ptr;
+    cscalv_ker_ft scalv_ker_ptr;
 
     cntx_t* cntx = NULL;
 
@@ -622,40 +613,42 @@ void cscal_blis_impl
         case BLIS_ARCH_ZEN5:
         case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
-            // AVX512 Kernel
-            scalv_fun_ptr = bli_cscalv_zen_int_avx512;
-            break;
+          // AVX512 Kernel
+          scalv_ker_ptr = bli_cscalv_zen_int_avx512;
+          break;
 #endif
         case BLIS_ARCH_ZEN:
         case BLIS_ARCH_ZEN2:
         case BLIS_ARCH_ZEN3:
 
-            //   AVX2 Kernel
-            scalv_fun_ptr = bli_cscalv_zen_int;
-            break;
+          // AVX2 Kernel
+          scalv_ker_ptr = bli_cscalv_zen_int;
+          break;
 
         default:
 
-          // Query the context
+          // For non-Zen architectures, query the context
           cntx = bli_gks_query_cntx();
 
           // Query the function pointer using the context
-          scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
     }
 
-    // Call the function based on the function pointer assigned above
-    scalv_fun_ptr
+    // Invoke the function based on the kernel function pointer
+    // Pass size as negative to stipulate don't use SETV when alpha=0
+    scalv_ker_ptr
     (
       BLIS_NO_CONJUGATE,
-      n0,
+      -n0,
       (scomplex*) alpha,
       x0, incx0,
       cntx
     );
 
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 }
-
 #ifdef BLIS_ENABLE_BLAS
 void cscal_
      (
@@ -678,22 +671,27 @@ void zscal_blis_impl
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
     AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx);
 
+    /* Initialize BLIS. */
+    //bli_init_auto();
+
     dim_t n0 = (dim_t)(*n);
     dcomplex *x0 = x;
     inc_t incx0 = (inc_t)(*incx);
 
     /*
-        When n is zero or the alpha pointer passed is null
-        or the incx is zero or alpha is 1, return early.
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
     */
     if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha))
     {
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        //bli_finalize_auto();
         return;
     }
 
     // Definition of function pointer
-    zscalv_ker_ft scalv_fun_ptr;
+    zscalv_ker_ft scalv_ker_ptr;
 
     cntx_t* cntx = NULL;
 
@@ -707,7 +705,7 @@ void zscal_blis_impl
         case BLIS_ARCH_ZEN4:
 #if defined(BLIS_KERNELS_ZEN4)
           // AVX512 Kernel
-          scalv_fun_ptr = bli_zscalv_zen_int_avx512;
+          scalv_ker_ptr = bli_zscalv_zen_int_avx512;
           break;
 #endif
         case BLIS_ARCH_ZEN:
@@ -715,29 +713,32 @@ void zscal_blis_impl
         case BLIS_ARCH_ZEN3:
 
           // AVX2 Kernel
-          scalv_fun_ptr = bli_zscalv_zen_int;
+          scalv_ker_ptr = bli_zscalv_zen_int;
           break;
 
         default:
 
-          // Query the context
+          // For non-Zen architectures, query the context
           cntx = bli_gks_query_cntx();
 
           // Query the function pointer using the context
-          scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
     }
 
-    // Call the function based on the function pointer assigned above
-    scalv_fun_ptr
+    // Invoke the function based on the kernel function pointer
+    // Pass size as negative to stipulate don't use SETV when alpha=0
+    scalv_ker_ptr
     (
       BLIS_NO_CONJUGATE,
-      n0,
+      -n0,
       (dcomplex*) alpha,
       x0, incx0,
       cntx
     );
 
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    /* Finalize BLIS. */
+    //bli_finalize_auto();
 }
 #ifdef BLIS_ENABLE_BLAS
 void zscal_
@@ -751,4 +752,4 @@ void zscal_
 }
 #endif
 
-GENTFUNCSCAL( scomplex, float, c, s, scal, scalv )
+GENTFUNCSCAL( scomplex, float, c, s, s, scal, scalv )
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index fa3ea5201..940f0f2e8 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -174,12 +174,12 @@ GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname )
 
 #define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
 \
-GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
-GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )
+GENTFUNCSCAL( float,    float,    s,  , s, blasname, blisname ) \
+GENTFUNCSCAL( double,   double,   d,  , d, blasname, blisname ) \
+GENTFUNCSCAL( scomplex, scomplex, c,  , c, blasname, blisname ) \
+GENTFUNCSCAL( dcomplex, dcomplex, z,  , z, blasname, blisname ) \
+GENTFUNCSCAL( scomplex, float,    c, s, s, blasname, blisname ) \
+GENTFUNCSCAL( dcomplex, double,   z, d, d, blasname, blisname )
 
 // --GEMMT specific kernels ----------------------------------------------------
 
diff --git a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp
index ee77340d5..f259892eb 100644
--- a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp
@@ -36,10 +36,10 @@
 #include "test_scalv.h"
 
 class cscalvGeneric :
-        public ::testing::TestWithParam<std::tuple<char,
-                                                   gtint_t,
-                                                   gtint_t,
-                                                   scomplex>> {};
+        public ::testing::TestWithParam<std::tuple<char,            // conj_alpha
+                                                   gtint_t,         // n
+                                                   gtint_t,         // incx
+                                                   scomplex>> {};   // alpha
 
 
 // Tests using random integers as vector elements.
@@ -78,42 +78,140 @@ TEST_P( cscalvGeneric, API )
     test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
 }
 
-// Black box testing for generic and main use of cscal.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        Blackbox,
+        unitPositiveIncrementSmall,
         cscalvGeneric,
         ::testing::Combine(
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),                                                               // n: use x, c: use conj(x)
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(1)),                                   // stride size for x
-            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0})      // alpha
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
         ),
         (::scalvGenericPrint<scomplex, scomplex>())
     );
 
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );
 
-// Test for non-unit increments.
-// Only test very few cases as sanity check.
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{-5.1, -7.3},
+                                scomplex{ 1.0,  1.0},
+                                scomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        cscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                scomplex{ 0.0,  0.0}
+            )
+        ),
+        (::scalvGenericPrint<scomplex, scomplex>())
+    );
+#endif
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
 // We can modify the values using implementantion details.
 INSTANTIATE_TEST_SUITE_P(
-        NonUnitPositiveIncrements,
+        conjalpha,
         cscalvGeneric,
         ::testing::Combine(
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),                                                               // n: use x, c: use conj(x)
-            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(2), gtint_t(11)),                      //(gtint_t(-5), gtint_t(-17)) // stride size for x
-            ::testing::Values(scomplex{4.0, 3.1})                            // alpha
+            ::testing::Values('c'),                                          // c: use conjugate
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(scomplex{ 7.3,  5.1})                          // alpha
         ),
         (::scalvGenericPrint<scomplex, scomplex>())
     );
+#endif
 
 #ifndef TEST_BLIS_TYPED
 // Test for negative increments.
@@ -126,7 +224,7 @@ INSTANTIATE_TEST_SUITE_P(
             ::testing::Values('n'),                                          // n: use x, c: use conj(x)
             ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
             ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
-            ::testing::Values(scomplex{4.0, 3.1})                            // alpha
+            ::testing::Values(scomplex{ 7.3,  5.1})                          // alpha
         ),
         (::scalvGenericPrint<scomplex, scomplex>())
     );
diff --git a/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp
new file mode 100644
index 000000000..d09afe843
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp
@@ -0,0 +1,219 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scalv.h"
+
+class csscalvGeneric :
+        public ::testing::TestWithParam<std::tuple<char,        // conj_alpha
+                                                   gtint_t,     // n
+                                                   gtint_t,     // incx
+                                                   float>> {}; // alpha
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csscalvGeneric);
+
+// Tests using random integers as vector elements.
+TEST_P( csscalvGeneric, API )
+{
+    using T = scomplex;
+    using U = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether alpha or conj(alpha) will be used:
+    char conj_alpha = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // alpha
+    U alpha = std::get<3>(GetParam());
+
+    // Set the threshold for the errors:
+    // Check gtestsuite scalv.h or netlib source code for reminder of the
+    // functionality from which we estimate operation count per element
+    // of output, and hence the multipler for epsilon.
+    // No adjustment applied yet for complex data.
+    double thresh;
+    if (n == 0)
+        thresh = 0.0;
+    else if (alpha == testinghelpers::ZERO<U>() || alpha == testinghelpers::ONE<U>())
+        thresh = 0.0;
+    else
+        thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_scalv<T, U>( conj_alpha, n, incx, alpha, thresh );
+}
+
+// bli_csscal not present in BLIS
+#ifndef TEST_BLIS_TYPED
+
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementSmall,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        csscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        csscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(3)                                             // alpha
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+#endif // not TEST_BLIS_TYPED
+
+
+
+
+
+
diff --git a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp
index 1ca853db2..0f5f2f203 100644
--- a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp
@@ -79,20 +79,41 @@ TEST_P( dscalvGeneric, API )
 
 // Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        unitPositiveIncrement,
+        unitPositiveIncrementSmall,
         dscalvGeneric,
         ::testing::Combine(
             // conj(alpha): uses n (no_conjugate) since it is real.
             ::testing::Values('n'),
             // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
             // incx: stride of x vector.
             ::testing::Values(
                                 gtint_t(1)
             ),
             // alpha: value of scalar.
             ::testing::Values(
-                                double( 0.0),
                                 double( 7.0),
                                 double(-3.0)
             )
@@ -101,21 +122,20 @@ INSTANTIATE_TEST_SUITE_P(
     );
 
 INSTANTIATE_TEST_SUITE_P(
-        nonUnitPositiveIncrement,
+        nonUnitPositiveIncrementSmall,
         dscalvGeneric,
         ::testing::Combine(
             // conj(alpha): uses n (no_conjugate) since it is real.
             ::testing::Values('n'),
             // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
             // incx: stride of x vector.
             ::testing::Values(
                                 gtint_t(2),
-                                gtint_t(3)
+                                gtint_t(41)
             ),
             // alpha: value of scalar.
             ::testing::Values(
-                                double( 0.0),
                                 double( 7.0),
                                 double(-3.0)
             )
@@ -123,6 +143,54 @@ INSTANTIATE_TEST_SUITE_P(
         (::scalvGenericPrint<double, double>())
     );
 
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+#endif
+
 #ifdef TEST_BLIS_TYPED
 // Test when conjugate of x is used as an argument. This option is BLIS-api specific.
 // Only test very few cases as sanity check since conj(x) = x for real types.
@@ -140,6 +208,23 @@ INSTANTIATE_TEST_SUITE_P(
     );
 #endif
 
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        dscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(3)                                             // alpha
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+#endif
+
 #if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
 INSTANTIATE_TEST_SUITE_P(
         AOCLDynamic,
@@ -151,6 +236,7 @@ INSTANTIATE_TEST_SUITE_P(
             ::testing::Values(
                                gtint_t(   30000),     // nt_ideal = 1
                                gtint_t(  100000),     // nt_ideal = 2
+                               gtint_t(  486919),     // nt_ideal = 8
                                gtint_t(  500000),     // nt_ideal = 8
                                gtint_t( 2500000),     // nt_ideal = 12
                                gtint_t( 4000000),     // nt_ideal = 16
@@ -160,7 +246,8 @@ INSTANTIATE_TEST_SUITE_P(
             ),
             // incx: stride of x vector.
             ::testing::Values(
-                                gtint_t(1)
+                                gtint_t(1),
+                                gtint_t(3)
             ),
             // alpha: value of scalar.
             ::testing::Values(
@@ -169,4 +256,34 @@ INSTANTIATE_TEST_SUITE_P(
         ),
         (::scalvGenericPrint<double, double>())
     );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        AOCLDynamicAlphaZero,
+        dscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(
+                               gtint_t(  89),     // nt_ideal = 8
+                               gtint_t(  486919),     // nt_ideal = 8
+                               gtint_t(25000000)      // nt_ideal = max_available
+            ),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(3)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
 #endif
+
+#endif // BLIS_ENABLE_OPENMP && AOCL_DYNAMIC
diff --git a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp
deleted file mode 100644
index 8bf16f8dc..000000000
--- a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2023 - 2024, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <gtest/gtest.h>
-#include "test_scalv.h"
-
-template <typename T>
-class scalv_EIC : public ::testing::Test {};
-typedef ::testing::Types<float, double> TypeParam;
-TYPED_TEST_SUITE(scalv_EIC, TypeParam);
-
-TYPED_TEST(scalv_EIC, zero_alpha_x_fp)
-{
-    using T = TypeParam;
-    gtint_t n = 10, incx = 1;
-    std::vector<T> x(n);
-    // Initialize x with random numbers.
-    testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
-    std::vector<T> x_ref(x);
-    T alpha = T{0};
-
-    testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
-    //----------------------------------------------------------
-    //                  Call BLIS function.
-    //----------------------------------------------------------
-    scalv<T>('n', n, alpha, x.data(), incx);
-
-    //----------------------------------------------------------
-    //              Compute component-wise error.
-    //----------------------------------------------------------
-    // Set the threshold for the errors:
-    // Check gtestsuite scalv.h or netlib source code for reminder of the
-    // functionality from which we estimate operation count per element
-    // of output, and hence the multipler for epsilon.
-    double thresh;
-    if (n == 0)
-        thresh = 0.0;
-    else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
-        thresh = 0.0;
-    else
-        thresh = testinghelpers::getEpsilon<T>();
-
-    //----------------------------------------------------------
-    //     Call generic test body using those parameters
-    //----------------------------------------------------------
-    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
-}
-
-TYPED_TEST(scalv_EIC, zero_alpha_x_inf)
-{
-    using T = TypeParam;
-    gtint_t n = 10, incx = 1;
-    std::vector<T> x(n);
-    // Initialize x with random numbers.
-    testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
-    x[3] = 1.0/0.0;
-    std::vector<T> x_ref(x);
-    T alpha = T{0};
-    testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
-
-    //----------------------------------------------------------
-    //                  Call BLIS function.
-    //----------------------------------------------------------
-    scalv<T>('n', n, alpha, x.data(), incx);
-
-    //----------------------------------------------------------
-    //              Compute component-wise error.
-    //----------------------------------------------------------
-    // Set the threshold for the errors:
-    // Check gtestsuite scalv.h or netlib source code for reminder of the
-    // functionality from which we estimate operation count per element
-    // of output, and hence the multipler for epsilon.
-    // No adjustment applied yet for complex data.
-    double thresh;
-    if (n == 0)
-        thresh = 0.0;
-    else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
-        thresh = 0.0;
-    else
-        thresh = testinghelpers::getEpsilon<T>();
-
-    //----------------------------------------------------------
-    //     Call generic test body using those parameters
-    //----------------------------------------------------------
-    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
-}
diff --git a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp
index 12187bcd4..c45bb8337 100644
--- a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp
@@ -63,7 +63,7 @@ TEST_P( sscalvGeneric, API )
     // Check gtestsuite scalv.h or netlib source code for reminder of the
     // functionality from which we estimate operation count per element
     // of output, and hence the multipler for epsilon.
-    double thresh;
+    float thresh;
     if (n == 0)
         thresh = 0.0;
     else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
@@ -77,19 +77,120 @@ TEST_P( sscalvGeneric, API )
     test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
 }
 
-// Black box testing for generic and main use of sscal.
+// Black box testing for generic use of sscal.
 INSTANTIATE_TEST_SUITE_P(
-        Blackbox,
+        unitPositiveIncrementSmall,
         sscalvGeneric,
         ::testing::Combine(
-            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(1)),                                   // stride size for x
-            ::testing::Values(float(3.0), float(-5.0))                       // alpha
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
         ),
         (::scalvGenericPrint<float, float>())
     );
 
+// Black box testing for generic use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        unitPositiveIncrementLarge,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(17), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 7.0),
+                                float(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        sscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                float( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<float, float>())
+    );
+#endif
+
 #ifdef TEST_BLIS_TYPED
 // Test when conjugate of x is used as an argument. This option is BLIS-api specific.
 // Only test very few cases as sanity check since conj(x) = x for real types.
@@ -101,28 +202,12 @@ INSTANTIATE_TEST_SUITE_P(
             ::testing::Values('c'),                                          // c: use conjugate
             ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
             ::testing::Values(gtint_t(1)),                                   // stride size for x
-            ::testing::Values(float(9.0))                                    // alpha
+            ::testing::Values(float(-3.0))                                  // alpha
         ),
         (::scalvGenericPrint<float, float>())
     );
 #endif
 
-// Test for non-unit increments.
-// Only test very few cases as sanity check.
-// We can modify the values using implementantion details.
-INSTANTIATE_TEST_SUITE_P(
-        NonUnitPositiveIncrements,
-        sscalvGeneric,
-        ::testing::Combine(
-            ::testing::Values('n'),                                          // n: use x
-            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
-            ::testing::Values(gtint_t(2), gtint_t(11)),                      //(gtint_t(-5), gtint_t(-17)) // stride size for x
-            ::testing::Values(float(2.0))                                    // alpha
-        ),
-        (::scalvGenericPrint<float, float>())
-    );
-
-
 #ifndef TEST_BLIS_TYPED
 // Test for negative increments.
 // Only test very few cases as sanity check.
diff --git a/gtestsuite/testsuite/level1/scalv/test_scalv.h b/gtestsuite/testsuite/level1/scalv/test_scalv.h
index c045f6fcc..e4663da97 100644
--- a/gtestsuite/testsuite/level1/scalv/test_scalv.h
+++ b/gtestsuite/testsuite/level1/scalv/test_scalv.h
@@ -48,6 +48,8 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
     //        Initialize vector with random numbers.
     //----------------------------------------------------------
     std::vector<T> x = testinghelpers::get_random_vector<T>( -10, 10, n, incx );
+    if (alpha == testinghelpers::ZERO<U>())
+        testinghelpers::set_vector( n, incx, x.data(), testinghelpers::aocl_extreme<T>() );
 
     //----------------------------------------------------------
     //    Call reference implementation to get ref results.
@@ -64,7 +66,7 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
     //----------------------------------------------------------
     //              Compute component-wise error.
     //----------------------------------------------------------
-    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh );
+    computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
 }
 
 /**
diff --git a/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp
index 59d875bda..8e2545597 100644
--- a/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp
@@ -82,59 +82,187 @@ TEST_P( zdscalvGeneric, API )
 
 // bli_zdscal not present in BLIS
 #ifndef TEST_BLIS_TYPED
-// Black box testing for zdscal.
-// Tests with unit-positive increment.
+
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        unitPositiveIncrement,
+        unitPositiveIncrementSmall,
         zdscalvGeneric,
         ::testing::Combine(
             // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-                            , 'c'       // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
             // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
             // incx: stride of x vector.
-            ::testing::Values(gtint_t(1)),
+            ::testing::Values(
+                                gtint_t(1)
+            ),
             // alpha: value of scalar.
             ::testing::Values(
-                                double(-5.1),
-                                double( 0.0),
-                                double( 7.3)
+                                double( 7.0),
+                                double(-3.0)
             )
         ),
-        (::scalvGenericPrint<dcomplex, double>())
+        (::scalvGenericPrint<double, double>())
     );
 
-
-// Tests for non-unit increments.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        nonUnitPositiveIncrement,
+        unitPositiveIncrementLarge,
         zdscalvGeneric,
         ::testing::Combine(
             // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-                            , 'c'       // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
             // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
             // incx: stride of x vector.
             ::testing::Values(
                                 gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 7.0),
+                                double(-3.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zdscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(3)                                             // alpha
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+
+#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
+INSTANTIATE_TEST_SUITE_P(
+        AOCLDynamic,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(
+                               gtint_t(   10000),     // nt_ideal = 1
+                               gtint_t(   20000),     // nt_ideal = 4
+                               gtint_t(  486919),     // nt_ideal = 8
+                               gtint_t( 1000000),     // nt_ideal = 8
+                               gtint_t( 2500000),     // nt_ideal = 12
+                               gtint_t( 5000000),     // nt_ideal = 32
+                               gtint_t( 7000000)      // nt_ideal = max_available
+            ),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
                                 gtint_t(3)
             ),
             // alpha: value of scalar.
             ::testing::Values(
-                                double(-5.1),
-                                double( 0.0),
-                                double( 7.3)
+                                double( 7.0)
             )
         ),
-        (::scalvGenericPrint<dcomplex, double>())
+        (::scalvGenericPrint<double, double>())
     );
 
+INSTANTIATE_TEST_SUITE_P(
+        AOCLDynamicAlphaZero,
+        zdscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(
+                               gtint_t(  486919),     // nt_ideal = 8
+                               gtint_t( 7000000)      // nt_ideal = max_available
+            ),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(3)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                double( 0.0)
+            )
+        ),
+        (::scalvGenericPrint<double, double>())
+    );
+#endif
+
 #endif // not TEST_BLIS_TYPED
diff --git a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp
index bf7182d83..20635564b 100644
--- a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp
+++ b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp
@@ -78,26 +78,22 @@ TEST_P( zscalvGeneric, API )
     test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
 }
 
-// Black box testing for zscal.
-// Tests with unit-positive increment.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        unitPositiveIncrement,
+        unitPositiveIncrementSmall,
         zscalvGeneric,
         ::testing::Combine(
             // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
             // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
             // incx: stride of x vector.
-            ::testing::Values(gtint_t(1)),
+            ::testing::Values(
+                                gtint_t(1)
+            ),
             // alpha: value of scalar.
             ::testing::Values(
                                 dcomplex{-5.1, -7.3},
-                                dcomplex{ 0.0,  0.0},
                                 dcomplex{ 1.0,  1.0},
                                 dcomplex{ 7.3,  5.1}
             )
@@ -105,32 +101,131 @@ INSTANTIATE_TEST_SUITE_P(
         (::scalvGenericPrint<dcomplex, dcomplex>())
     );
 
-
-// Test for non-unit increments.
+// Black box testing for generic use of dscal.
 INSTANTIATE_TEST_SUITE_P(
-        nonUnitPositiveIncrement,
+        unitPositiveIncrementLarge,
         zscalvGeneric,
         ::testing::Combine(
             // conj(alpha): uses n (no_conjugate) since it is real.
-            ::testing::Values('n'
-#ifdef TEST_BLIS_TYPED
-            , 'c'                                                            // this option is BLIS-api specific.
-#endif
-            ),
+            ::testing::Values('n'),
             // m: size of vector.
-            ::testing::Range(gtint_t(10), gtint_t(101), 10),
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                dcomplex{-5.1, -7.3},
+                                dcomplex{ 1.0,  1.0},
+                                dcomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementSmall,
+        zscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(9), 1),
             // incx: stride of x vector.
             ::testing::Values(
                                 gtint_t(2),
-                                gtint_t(3)
+                                gtint_t(41)
             ),
             // alpha: value of scalar.
             ::testing::Values(
                                 dcomplex{-5.1, -7.3},
-                                dcomplex{ 0.0,  0.0},
                                 dcomplex{ 1.0,  1.0},
                                 dcomplex{ 7.3,  5.1}
             )
         ),
         (::scalvGenericPrint<dcomplex, dcomplex>())
     );
+
+INSTANTIATE_TEST_SUITE_P(
+        nonUnitPositiveIncrementLarge,
+        zscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                dcomplex{-5.1, -7.3},
+                                dcomplex{ 1.0,  1.0},
+                                dcomplex{ 7.3,  5.1}
+            )
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+
+#ifndef TEST_BLIS_TYPED
+// alpha=0 testing only for BLAS and CBLAS as
+// BLIS uses setv and won't propagate Inf and NaNs
+INSTANTIATE_TEST_SUITE_P(
+        alphaZero,
+        zscalvGeneric,
+        ::testing::Combine(
+            // conj(alpha): uses n (no_conjugate) since it is real.
+            ::testing::Values('n'),
+            // m: size of vector.
+            ::testing::Range(gtint_t(1), gtint_t(101), 1),
+            // incx: stride of x vector.
+            ::testing::Values(
+                                gtint_t(1),
+                                gtint_t(2),
+                                gtint_t(41)
+            ),
+            // alpha: value of scalar.
+            ::testing::Values(
+                                dcomplex{ 0.0,  0.0}
+            )
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+#endif
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        conjalpha,
+        zscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conjugate
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(dcomplex{ 7.3,  5.1})                          // alpha
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+#endif
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementantion details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zscalvGeneric,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(dcomplex{ 7.3,  5.1})                          // alpha
+        ),
+        (::scalvGenericPrint<dcomplex, dcomplex>())
+    );
+#endif
diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c
index fa337c247..34d6b161c 100644
--- a/kernels/zen/1/bli_scalv_zen_int.c
+++ b/kernels/zen/1/bli_scalv_zen_int.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2017 - 2024, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
@@ -80,9 +80,11 @@ void bli_sscalv_zen_int
 	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
 
 	// If alpha is zero, use setv (in case y contains NaN or Inf).
-	if ( PASTEMAC(s,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
 	{
 		float*       zero = bli_s0;
+		if (cntx == NULL) cntx = bli_gks_query_cntx();
 		ssetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
 
 		f
@@ -96,10 +98,12 @@ void bli_sscalv_zen_int
 		return;
 	}
 
+	dim_t n0 = bli_abs(n);
+
 	// Use the unrolling factor and the number of elements per register
 	// to compute the number of vectorized and leftover iterations.
-	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
-	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+	n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );
 
 	// If there is anything that would interfere with our use of contiguous
 	// vector loads/stores, override n_viter and n_left to use scalar code
@@ -107,7 +111,7 @@ void bli_sscalv_zen_int
 	if ( incx != 1 )
 	{
 		n_viter = 0;
-		n_left  = n;
+		n_left  = n0;
 	}
 
 	// Initialize local pointers.
@@ -178,10 +182,11 @@ void bli_dscalv_zen_int
 	// If the vector dimension is zero, or if alpha is unit, return early.
 	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
 
-	// If alpha is zero, use setv (in case y contains NaN or Inf).
-	if ( PASTEMAC(d,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
 	{
 		double*      zero = bli_d0;
+		if (cntx == NULL) cntx = bli_gks_query_cntx();
 		dsetv_ker_ft f    = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
 
 		f
@@ -195,10 +200,12 @@ void bli_dscalv_zen_int
 		return;
 	}
 
+	dim_t n0 = bli_abs(n);
+
 	// Use the unrolling factor and the number of elements per register
 	// to compute the number of vectorized and leftover iterations.
-	n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
-	n_left  = ( n ) % ( n_elem_per_reg * n_iter_unroll );
+	n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
+	n_left  = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );
 
 	// If there is anything that would interfere with our use of contiguous
 	// vector loads/stores, override n_viter and n_left to use scalar code
@@ -206,7 +213,7 @@ void bli_dscalv_zen_int
 	if ( incx != 1 )
 	{
 		n_viter = 0;
-		n_left  = n;
+		n_left  = n0;
 	}
 
 	// Initialize local pointers.
diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c
index 463ab9ae0..ab5e46af0 100644
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -60,8 +60,8 @@ void bli_sscalv_zen_int10
 	// If the vector dimension is zero, or if alpha is unit, return early.
 	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
 
-	// If alpha is zero, use setv.
-	if ( PASTEMAC(s,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
 	{
 		float* zero = bli_s0;
 		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
@@ -78,6 +78,8 @@ void bli_sscalv_zen_int10
 		return;
 	}
 
+	dim_t n0 = bli_abs(n);
+
 	// Initialize local pointers.
 	x0 = x;
 
@@ -88,11 +90,11 @@ void bli_sscalv_zen_int10
 		dim_t option;
 
 		// Unroll and the loop used is picked based on the input size.
-		if( n < 300)
+		if( n0 < 300)
 		{
 			option = 2;
 		}
-		else if( n < 500)
+		else if( n0 < 500)
 		{
 			option = 1;
 		}
@@ -105,7 +107,7 @@ void bli_sscalv_zen_int10
 		{
 			case 0:
 
-				for ( ; (i + 127) < n; i += 128 )
+				for ( ; (i + 127) < n0; i += 128 )
 				{
 					//Load the input values
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -175,7 +177,7 @@ void bli_sscalv_zen_int10
 	
 			case 1 :
 
-				for ( ; (i + 95) < n; i += 96 )
+				for ( ; (i + 95) < n0; i += 96 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -227,7 +229,7 @@ void bli_sscalv_zen_int10
 
 			case 2:
 		
-				for ( ; (i + 47) < n; i += 48 )
+				for ( ; (i + 47) < n0; i += 48 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -256,7 +258,7 @@ void bli_sscalv_zen_int10
 					x0 += 6*n_elem_per_reg;
 				}
 
-				for ( ; (i + 23) < n; i += 24 )
+				for ( ; (i + 23) < n0; i += 24 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -273,7 +275,7 @@ void bli_sscalv_zen_int10
 					x0 += 3*n_elem_per_reg;
 				}
 
-				for ( ; (i + 7) < n; i += 8 )
+				for ( ; (i + 7) < n0; i += 8 )
 				{
 					xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 
@@ -284,7 +286,7 @@ void bli_sscalv_zen_int10
 					x0 += 1*n_elem_per_reg;
 				}
 
-				for ( ; (i + 0) < n; i += 1 )
+				for ( ; (i + 0) < n0; i += 1 )
 				{
 					*x0 *= *alpha;
 
@@ -296,7 +298,7 @@ void bli_sscalv_zen_int10
 	{
 		const float alphac = *alpha;
 
-		for ( ; i < n; ++i )
+		for ( ; i < n0; ++i )
 		{
 			*x0 *= alphac;
 
@@ -329,8 +331,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 	// If the vector dimension is zero, or if alpha is unit, return early.
 	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
 
-	// If alpha is zero, use setv.
-	if ( PASTEMAC(d,eq0)( *alpha ) )
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
 	{
 		double* zero = bli_d0;
 		if ( cntx == NULL ) cntx = bli_gks_query_cntx();
@@ -348,6 +350,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 		return;
 	}
 
+	dim_t n0 = bli_abs(n);
+
 	// Initialize local pointers.
 	x0 = x;
 
@@ -358,11 +362,11 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 		dim_t option;		
 
 		// Unroll and the loop used is picked based on the input size.
-		if(n < 200)
+		if(n0 < 200)
 		{
 			option = 2;
 		}
-		else if(n < 500)
+		else if(n0 < 500)
 		{
 			option = 1;
 		}
@@ -375,7 +379,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 		{
 			case 0:
 
-				for (; (i + 63) < n; i += 64 )
+				for (; (i + 63) < n0; i += 64 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -440,7 +444,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 					x0 += 16*n_elem_per_reg;
 				}
 
-				for (; (i + 47) < n; i += 48 )
+				for (; (i + 47) < n0; i += 48 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -492,7 +496,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 
 			case 1:
 
-				for (; (i + 31) < n; i += 32 )
+				for (; (i + 31) < n0; i += 32 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -529,7 +533,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 				
 			case 2:
 
-				for ( ; (i + 11) < n; i += 12 )
+				for ( ; (i + 11) < n0; i += 12 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 					xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -546,7 +550,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 					x0 += 3*n_elem_per_reg;
 				}
 
-				for ( ; (i + 3) < n; i += 4 )
+				for ( ; (i + 3) < n0; i += 4 )
 				{
 					xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
 
@@ -557,7 +561,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 					x0 += 1*n_elem_per_reg;
 				}
 
-				for ( ; (i + 0) < n; i += 1 )
+				for ( ; (i + 0) < n0; i += 1 )
 				{
 					*x0 *= *alpha;
 
@@ -569,7 +573,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
 	{
 		const double alphac = *alpha;
 
-		for ( ; i < n; ++i )
+		for ( ; i < n0; ++i )
 		{
 			*x0 *= alphac;
 
@@ -587,6 +591,30 @@ void bli_zdscalv_zen_int10
        cntx_t* restrict cntx
      )
 {
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha )) return;
+
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
+	{
+		// Expert interface of setv is invoked when alpha is zero
+		dcomplex *zero = bli_z0;
+
+		/* When alpha is zero all the element in x are set to zero */
+		PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
+		(
+			BLIS_NO_CONJUGATE,
+			n,
+			zero,
+			x, incx,
+			cntx,
+			NULL);
+
+		return;
+	}
+
+	dim_t n0 = bli_abs(n);
+
 	dim_t i = 0;
 	const dim_t n_elem_per_reg = 4;    // number of elements per register
 
@@ -607,7 +635,7 @@ void bli_zdscalv_zen_int10
 
 		alphav = _mm256_broadcast_sd( &alphac );
 
-		for ( ; ( i + 29 ) < n; i += 30 )
+		for ( ; ( i + 29 ) < n0; i += 30 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -660,7 +688,7 @@ void bli_zdscalv_zen_int10
 			x0 += 15 * n_elem_per_reg;
 		}
 
-		for ( ; ( i + 23 ) < n; i += 24 )
+		for ( ; ( i + 23 ) < n0; i += 24 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -704,7 +732,7 @@ void bli_zdscalv_zen_int10
 			x0 += 12 * n_elem_per_reg;
 		}
 
-		for ( ; ( i + 15 ) < n; i += 16 )
+		for ( ; ( i + 15 ) < n0; i += 16 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -736,7 +764,7 @@ void bli_zdscalv_zen_int10
 			x0 += 8 * n_elem_per_reg;
 		}
 
-		for ( ; ( i + 7 ) < n; i += 8 )
+		for ( ; ( i + 7 ) < n0; i += 8 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -756,7 +784,7 @@ void bli_zdscalv_zen_int10
 			x0 += 4 * n_elem_per_reg;
 		}
 
-		for ( ; ( i + 3 ) < n; i += 4 )
+		for ( ; ( i + 3 ) < n0; i += 4 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 			xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -770,7 +798,7 @@ void bli_zdscalv_zen_int10
 			x0 += 2 * n_elem_per_reg;
 		}
 
-		for ( ; ( i + 1 ) < n; i += 2 )
+		for ( ; ( i + 1 ) < n0; i += 2 )
 		{
 			xv[0] = _mm256_loadu_pd( x0 );
 
@@ -795,7 +823,7 @@ void bli_zdscalv_zen_int10
 
 	alpha_reg = _mm_set1_pd((*alpha).real);
 
-	for (; i < n; ++i)
+	for (; i < n0; ++i)
 	{
 		x_vec = _mm_loadu_pd(x0);
 
@@ -816,24 +844,14 @@ void bli_cscalv_zen_int
 	cntx_t* restrict cntx
 	)
 {
-	/*
-		Undefined behaviour
-		-------------------
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;
 
-		1. This layer is not BLAS complaint and the kernel results in
-		undefined behaviour when n <= 0 and incx <= 1. The expectation
-		is that the application/higher-layer invoking this layer should
-		the arg checks.
-	*/
-	// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
-	//	return;
-
-	// To Do: This call to SETV needs to be removed for BLAS compliance
-	// Currently removing this is resulting in ZHERK failures
-	if (PASTEMAC(c, eq0)(*alpha))
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
 	{
 		// Expert interface of setv is invoked when alpha is zero
-		scomplex *zero = PASTEMAC(c, 0);
+		scomplex *zero = bli_c0;
 
 		/* When alpha is zero all the element in x are set to zero */
 		PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
@@ -848,6 +866,8 @@ void bli_cscalv_zen_int
 		return;
 	}
 
+	dim_t n0 = bli_abs(n);
+
 	dim_t i = 0;
 	scomplex alpha_conj;
 	float *x0 = (float *)x;
@@ -897,7 +917,7 @@ void bli_cscalv_zen_int
 			and then store
 		*/
 
-		for (; (i + 15) < n; i += 16)
+		for (; (i + 15) < n0; i += 16)
 		{
 			x_vec_ymm[0] = _mm256_loadu_ps(x0);
 			x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
@@ -927,7 +947,7 @@ void bli_cscalv_zen_int
 			x0 += 4 * n_elem_per_reg;
 		}
 
-		for (; (i + 7) < n; i += 8)
+		for (; (i + 7) < n0; i += 8)
 		{
 			x_vec_ymm[0] = _mm256_loadu_ps(x0);
 			x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
@@ -947,7 +967,7 @@ void bli_cscalv_zen_int
 			x0 += 2 * n_elem_per_reg;
 		}
 
-		for (; (i + 3) < n; i += 4)
+		for (; (i + 3) < n0; i += 4)
 		{
 			x_vec_ymm[0] = _mm256_loadu_ps(x0);
 
@@ -969,7 +989,7 @@ void bli_cscalv_zen_int
 		_mm256_zeroupper();
 	}
 
-	for (; i < n; i++)
+	for (; i < n0; i++)
 	{
 		float x_real, x_imag;
 		x_real = real * (*x0) - imag * (*(x0 + 1));
@@ -991,24 +1011,14 @@ void bli_zscalv_zen_int
 	cntx_t* restrict cntx
 	)
 {
-	/*
-		Undefined behaviour
-		-------------------
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
 
-		1. This layer is not BLAS complaint and the kernel results in
-		undefined behaviour when n <= 0 and incx <= 1. The expectation
-		is that the application/higher-layer invoking this layer should
-		the arg checks.
-	*/
-	// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
-	//	return;
-
-	// To Do: This call to SETV needs to be removed for BLAS compliance
-	// Currently removing this is resulting in ZHERK failures
-	if (PASTEMAC(z, eq0)(*alpha))
+	// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+	if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
 	{
 		// Expert interface of setv is invoked when alpha is zero
-		dcomplex *zero = PASTEMAC(z, 0);
+		dcomplex *zero = bli_z0;
 
 		/* When alpha is zero all the element in x are set to zero */
 		PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
@@ -1023,6 +1033,8 @@ void bli_zscalv_zen_int
 		return;
 	}
 
+	dim_t n0 = bli_abs(n);
+
 	dim_t i = 0;
 	dcomplex alpha_conj;
 	double *x0 = (double *)x;
@@ -1033,8 +1045,8 @@ void bli_zscalv_zen_int
 	double real = alpha_conj.real;
 	double imag = alpha_conj.imag;
 
-	/*When incx is 1 and n >= 2 it is possible to use AVX2 instructions*/
-	if (incx == 1 && n >= 2)
+	/*When incx is 1 and n0 >= 2 it is possible to use AVX2 instructions*/
+	if (incx == 1 && n0 >= 2)
 	{
 		dim_t const n_elem_per_reg = 4;
 
@@ -1072,7 +1084,7 @@ void bli_zscalv_zen_int
 			and then store
 		*/
 
-		for (; (i + 7) < n; i += 8)
+		for (; (i + 7) < n0; i += 8)
 		{
 			x_vec_ymm[0] = _mm256_loadu_pd(x0);
 			x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
@@ -1106,7 +1118,7 @@ void bli_zscalv_zen_int
 			x0 += 4 * n_elem_per_reg;
 		}
 
-		for (; (i + 3) < n; i += 4)
+		for (; (i + 3) < n0; i += 4)
 		{
 			x_vec_ymm[0] = _mm256_loadu_pd(x0);
 			x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
@@ -1126,7 +1138,7 @@ void bli_zscalv_zen_int
 			x0 += 2 * n_elem_per_reg;
 		}
 
-		for (; (i + 1) < n; i += 2)
+		for (; (i + 1) < n0; i += 2)
 		{
 			x_vec_ymm[0] = _mm256_loadu_pd(x0);
 
@@ -1155,7 +1167,7 @@ void bli_zscalv_zen_int
 	alpha_real_xmm = _mm_set1_pd(real);
 	alpha_imag_xmm = _mm_set1_pd(imag);
 
-	for (; i < n; i++)
+	for (; i < n0; i++)
 	{
 		x_vec_xmm = _mm_loadu_pd(x0);
 
diff --git a/kernels/zen4/1/bli_scalv_zen_int_avx512.c b/kernels/zen4/1/bli_scalv_zen_int_avx512.c
index 4d9a05794..a2143a524 100644
--- a/kernels/zen4/1/bli_scalv_zen_int_avx512.c
+++ b/kernels/zen4/1/bli_scalv_zen_int_avx512.c
@@ -61,13 +61,14 @@
     Deviation from BLAS
     --------------------
 
-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.
 
     Undefined behaviour
     -------------------
 
-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_sscalv_zen_int_avx512
         (
@@ -78,6 +79,30 @@ void bli_sscalv_zen_int_avx512
           cntx_t *restrict cntx
         )
 {
+    // If the vector dimension is zero, or if alpha is unit, return early.
+    if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
+
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
+    {
+        float *zero = bli_s0;
+        if (cntx == NULL) cntx = bli_gks_query_cntx();
+        ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SETV_KER, cntx);
+
+        f
+        (
+          BLIS_NO_CONJUGATE,
+          n,
+          zero,
+          x, incx,
+          cntx
+        );
+
+        return;
+    }
+
+    dim_t n0 = bli_abs(n);
+
     dim_t i = 0;
     float *restrict x0 = x;
 
@@ -89,7 +114,7 @@ void bli_sscalv_zen_int_avx512
         __m512 xv[8], alphav;
         alphav = _mm512_set1_ps(*alpha);
 
-        for (i = 0; (i + 127) < n; i += 128)
+        for (i = 0; (i + 127) < n0; i += 128)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -125,7 +150,7 @@ void bli_sscalv_zen_int_avx512
             x0 += 8 * n_elem_per_reg;
         }
 
-        for (; (i + 63) < n; i += 64)
+        for (; (i + 63) < n0; i += 64)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -147,7 +172,7 @@ void bli_sscalv_zen_int_avx512
             x0 += 4 * n_elem_per_reg;
         }
 
-        for (; (i + 31) < n; i += 32)
+        for (; (i + 31) < n0; i += 32)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -163,7 +188,7 @@ void bli_sscalv_zen_int_avx512
             x0 += 2 * n_elem_per_reg;
         }
 
-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -176,7 +201,7 @@ void bli_sscalv_zen_int_avx512
             x0 += n_elem_per_reg;
         }
 
-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
         {
             // Loading the input values
             __m256 x_vec = _mm256_loadu_ps(x0);
@@ -198,7 +223,7 @@ void bli_sscalv_zen_int_avx512
         */
         _mm256_zeroupper();
 
-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
         {
             // Loading the input values
             __m128 x_vec = _mm_loadu_ps(x0);
@@ -215,7 +240,7 @@ void bli_sscalv_zen_int_avx512
 
     const float alphac = *alpha;
 
-    for (; i < n; ++i)
+    for (; i < n0; ++i)
     {
         *x0 *= alphac;
 
@@ -252,13 +277,14 @@ void bli_sscalv_zen_int_avx512
     Deviation from BLAS
     --------------------
 
-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.
 
     Undefined behaviour
     -------------------
 
-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
         (
@@ -270,11 +296,10 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
         )
 {
     // If the vector dimension is zero, or if alpha is unit, return early.
-    if (bli_zero_dim1(n) || PASTEMAC(d, eq1)(*alpha))
-        return;
+    if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
 
-    // If alpha is zero, use setv.
-    if (PASTEMAC(d, eq0)(*alpha))
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
     {
         double *zero = bli_d0;
         if (cntx == NULL) cntx = bli_gks_query_cntx();
@@ -292,6 +317,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
         return;
     }
 
+    dim_t n0 = bli_abs(n);
+
     dim_t i = 0;
     double *restrict x0;
 
@@ -307,7 +334,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
         alphav = _mm512_set1_pd(*alpha);
         __m512d xv[8];
 
-        for (i = 0; (i + 63) < n; i += 64)
+        for (i = 0; (i + 63) < n0; i += 64)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -343,7 +370,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
             x0 += 8 * n_elem_per_reg;
         }
 
-        for (; (i + 31) < n; i += 32)
+        for (; (i + 31) < n0; i += 32)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -365,7 +392,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
             x0 += 4 * n_elem_per_reg;
         }
 
-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -381,7 +408,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
             x0 += 2 * n_elem_per_reg;
         }
 
-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
         {
             // Loading the input values
             xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -394,7 +421,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
             x0 += n_elem_per_reg;
         }
 
-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
         {
             // Loading the input values
             __m256d x_vec = _mm256_loadu_pd(x0);
@@ -416,7 +443,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
        */
         _mm256_zeroupper();
 
-        for (; (i + 1) < n; i += 2)
+        for (; (i + 1) < n0; i += 2)
         {
             // Loading the input values
             __m128d x_vec = _mm_loadu_pd(x0);
@@ -433,7 +460,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
 
     const double alphac = *alpha;
 
-    for (; i < n; ++i)
+    for (; i < n0; ++i)
     {
         *x0 *= alphac;
 
@@ -468,13 +495,14 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
     Deviation from BLAS
     --------------------
 
-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.
 
     Undefined behaviour
     -------------------
 
-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_zdscalv_zen_int_avx512
      (
@@ -491,6 +519,31 @@ void bli_zdscalv_zen_int_avx512
         alpha is passed as double complex to adhere
         to function pointer definition in BLIS
     */
+
+    // If the vector dimension is zero, or if alpha is unit, return early.
+    if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
+
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
+    {
+        // Expert interface of setv is invoked when alpha is zero
+        dcomplex *zero = bli_z0;
+
+        /* When alpha is zero all the element in x are set to zero */
+        PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
+        (
+            BLIS_NO_CONJUGATE,
+            n,
+            zero,
+            x, incx,
+            cntx,
+            NULL);
+
+        return;
+    }
+
+    dim_t n0 = bli_abs(n);
+
     const double alphac = (*alpha).real;
 
     dim_t i = 0;
@@ -504,7 +557,7 @@ void bli_zdscalv_zen_int_avx512
 
         alphav = _mm512_set1_pd(alphac);
 
-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
         {
             xv[0] = _mm512_loadu_pd(x0);
             xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
@@ -524,7 +577,7 @@ void bli_zdscalv_zen_int_avx512
             x0 += 4 * n_elem_per_reg;
         }
 
-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
         {
             xv[0] = _mm512_loadu_pd(x0);
             xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
@@ -538,7 +591,7 @@ void bli_zdscalv_zen_int_avx512
             x0 += 2 * n_elem_per_reg;
         }
 
-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
         {
             xv[0] = _mm512_loadu_pd(x0);
 
@@ -549,7 +602,7 @@ void bli_zdscalv_zen_int_avx512
             x0 += n_elem_per_reg;
         }
 
-        for (; (i + 1) < n; i += 2)
+        for (; (i + 1) < n0; i += 2)
         {
             __m256d xv = _mm256_loadu_pd(x0);
 
@@ -576,7 +629,7 @@ void bli_zdscalv_zen_int_avx512
 
     alpha_reg = _mm_set1_pd((*alpha).real);
 
-    for (; i < n; ++i)
+    for (; i < n0; ++i)
     {
         x_vec = _mm_loadu_pd(x0);
 
@@ -674,8 +727,8 @@ void bli_zdscalv_zen_int_avx512
     Undefined behaviour
     -------------------
 
-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_cscalv_zen_int_avx512
      (
@@ -689,14 +742,11 @@ void bli_cscalv_zen_int_avx512
     // If the vector dimension is zero, or if alpha is unit, return early.
     if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;
 
-    /**
-     * @note Currently this kernel is not BLAS compliant. For BLAS compliance,
-     * the below call to SETV needs to be removed.
-     */
-    if ( PASTEMAC(c,eq0)(*alpha) )
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
     {
         // Expert interface of setv is invoked when alpha is zero
-        scomplex *zero = PASTEMAC(c,0);
+        scomplex *zero = bli_c0;
 
         /* When alpha is zero all the element in x are set to zero */
         PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
@@ -712,6 +762,8 @@ void bli_cscalv_zen_int_avx512
         return;
     }
 
+    dim_t n0 = bli_abs(n);
+
     dim_t i = 0;
     scomplex alpha_conj;
     float* restrict x0 = (float*) x;
@@ -760,7 +812,7 @@ void bli_cscalv_zen_int_avx512
          */
 
         // Processing 96 scomplex elements (192 floats) per iteration
-        for ( ; (i + 95) < n; i += 96 )
+        for ( ; (i + 95) < n0; i += 96 )
         {
             __m512 xv[12], inter[12];
 
@@ -776,7 +828,7 @@ void bli_cscalv_zen_int_avx512
         }
 
         // Processing 64 scomplex elements (128 floats) per iteration
-        for ( ; (i + 63) < n; i += 64 )
+        for ( ; (i + 63) < n0; i += 64 )
         {
             __m512 xv[8], inter[8];
 
@@ -790,7 +842,7 @@ void bli_cscalv_zen_int_avx512
         }
 
         // Processing 32 scomplex elements (64 floats) per iteration
-        for ( ; (i + 31) < n; i += 32 )
+        for ( ; (i + 31) < n0; i += 32 )
         {
             __m512 xv[4], inter[4];
 
@@ -802,7 +854,7 @@ void bli_cscalv_zen_int_avx512
         }
 
         // Processing 16 scomplex elements (32 floats) per iteration
-        for ( ; (i + 15) < n; i += 16 )
+        for ( ; (i + 15) < n0; i += 16 )
         {
             __m512 xv[2], inter[2];
 
@@ -842,7 +894,7 @@ void bli_cscalv_zen_int_avx512
         }
 
         // Processing 8 scomplex elements (16 floats) per iteration
-        for ( ; (i + 7) < n; i += 8 )
+        for ( ; (i + 7) < n0; i += 8 )
         {
             __m512 xv[1], inter[1];
 
@@ -877,21 +929,23 @@ void bli_cscalv_zen_int_avx512
         }
 
         // Processing remaining elements, if any.
-        if ( i < n ) {
+        if ( i < n0 )
+        {
             // Setting the mask bit based on remaining elements.
             // Since each scomplex element corresponds to 2 floats,
-            // we need to load and store 2*(n-i) elements.
+            // we need to load and store 2*(n0-i) elements.
 
-            __mmask16 mask = ( 1 << ( 2 * ( n - i ) ) ) - 1;
+            __mmask16 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;
+
+            __m512 xv, temp;
 
-            __m512 xv, inter;
             xv = _mm512_maskz_loadu_ps( mask, x0 );
 
-            inter = _mm512_permute_ps( xv, 0xB1 );
+            temp = _mm512_permute_ps( xv, 0xB1 );
 
-            inter = _mm512_mul_ps( alphaIv, inter );
+            temp = _mm512_mul_ps( alphaIv, temp );
 
-            xv = _mm512_fmaddsub_ps( alphaRv, xv, inter );
+            xv = _mm512_fmaddsub_ps( alphaRv, xv, temp );
 
             _mm512_mask_storeu_ps( x0, mask, xv );
         }
@@ -902,7 +956,7 @@ void bli_cscalv_zen_int_avx512
         const float alphaI = alpha_conj.imag;
 
         float x0R, x0I;
-        for (; i < n; ++i)
+        for (; i < n0; ++i)
         {
             x0R = *(x0);
             x0I = *(x0 + 1);
@@ -942,13 +996,14 @@ void bli_cscalv_zen_int_avx512
     Deviation from BLAS
     --------------------
 
-    None
+    Setv is used when alpha=0 unless a negative value of n is supplied.
+    This only occurs in calls from BLAS and CBLAS scal APIs.
 
     Undefined behaviour
     -------------------
 
-    1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
-       is that these are standard BLAS exceptions and should be handled in a higher layer.
+    None
+
 */
 void bli_zscalv_zen_int_avx512
      (
@@ -960,17 +1015,13 @@ void bli_zscalv_zen_int_avx512
      )
 {
     // If the vector dimension is zero, or if alpha is unit, return early.
-    if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
-        return;
+    if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
 
-    /**
-     * @note Currently this kernel is not BLAS compliant. For BLAS compliance,
-     * the below call to SETV needs to be removed.
-    */
-    if (PASTEMAC(z, eq0)(*alpha))
+    // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
+    if (PASTEMAC(z,eq0)( *alpha ) && n > 0 )
     {
         // Expert interface of setv is invoked when alpha is zero
-        dcomplex *zero = PASTEMAC(z, 0);
+        dcomplex *zero = bli_z0;
 
         /* When alpha is zero all the element in x are set to zero */
         PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
@@ -985,6 +1036,8 @@ void bli_zscalv_zen_int_avx512
         return;
     }
 
+    dim_t n0 = bli_abs(n);
+
     dim_t i = 0;
     dcomplex alpha_conj;
     double *restrict x0 = (double *)x;
@@ -1022,7 +1075,7 @@ void bli_zscalv_zen_int_avx512
         */
 
         // Processing 48 dcomplex elements per iteration.
-        for (; (i + 47) < n; i += 48)
+        for (; (i + 47) < n0; i += 48)
         {
             __m512d xv[12], temp[12];
 
@@ -1116,7 +1169,7 @@ void bli_zscalv_zen_int_avx512
         }
 
         // Processing 32 dcomplex elements per iteration.
-        for (; (i + 31) < n; i += 32)
+        for (; (i + 31) < n0; i += 32)
         {
             __m512d xv[8], temp[8];
             xv[0] = _mm512_loadu_pd(x0);
@@ -1173,7 +1226,7 @@ void bli_zscalv_zen_int_avx512
         }
 
         // Processing 16 dcomplex elements per iteration.
-        for (; (i + 15) < n; i += 16)
+        for (; (i + 15) < n0; i += 16)
         {
             __m512d xv[4], temp[4];
             xv[0] = _mm512_loadu_pd(x0);
@@ -1205,7 +1258,7 @@ void bli_zscalv_zen_int_avx512
         }
 
         // Processing 8 dcomplex elements per iteration.
-        for (; (i + 7) < n; i += 8)
+        for (; (i + 7) < n0; i += 8)
         {
             __m512d xv[2], temp[2];
             xv[0] = _mm512_loadu_pd(x0);
@@ -1227,7 +1280,7 @@ void bli_zscalv_zen_int_avx512
         }
 
         // Processing 4 dcomplex elements per iteration.
-        for (; (i + 3) < n; i += 4)
+        for (; (i + 3) < n0; i += 4)
         {
             __m512d xv, temp;
             xv = _mm512_loadu_pd(x0);
@@ -1244,23 +1297,24 @@ void bli_zscalv_zen_int_avx512
         }
 
         // Processing the remainder elements.
-        if( i < n )
+        if( i < n0 )
         {
             // Setting the mask bit based on remaining elements
             // Since each dcomplex elements corresponds to 2 doubles
-            // we need to load and store 2*(m-i) elements.
-            __mmask8 mask = (1 << (2 * (n-i)) ) - 1;
+            // we need to load and store 2*(n0-i) elements.
+
+            __mmask8 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;
 
             __m512d xv, temp, zero;
             zero = _mm512_setzero_pd();
 
             xv = _mm512_mask_loadu_pd( zero, mask, x0 );
 
-            temp = _mm512_permute_pd(xv, 0x55);
+            temp = _mm512_permute_pd( xv, 0x55 );
 
-            temp = _mm512_mul_pd(alphaIv, temp);
+            temp = _mm512_mul_pd( alphaIv, temp );
 
-            xv = _mm512_fmaddsub_pd(alphaRv, xv, temp);
+            xv = _mm512_fmaddsub_pd( alphaRv, xv, temp );
 
             _mm512_mask_storeu_pd( x0, mask, xv );
         }
@@ -1272,7 +1326,7 @@ void bli_zscalv_zen_int_avx512
         alphaRv = _mm_loaddup_pd(&alphaR);
         alphaIv = _mm_loaddup_pd(&alphaI);
 
-        for (; i < n; ++i)
+        for (; i < n0; ++i)
         {
             x_vec = _mm_loadu_pd(x0);
 
diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c
index 4945b637b..29d55e626 100644
--- a/ref_kernels/1/bli_scalv_ref.c
+++ b/ref_kernels/1/bli_scalv_ref.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -52,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
 \
 	/* If alpha is zero, use setv. */ \
-	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	if ( PASTEMAC(ch,eq0)( *alpha ) && n > 0) \
 	{ \
 		ctype* zero = PASTEMAC(ch,0); \
 \
@@ -70,6 +71,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		); \
 		return; \
 	} \
+\
+        dim_t n0 = bli_abs(n); \
 \
 	ctype alpha_conj; \
 \
@@ -78,14 +81,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
 	if ( incx == 1 ) \
 	{ \
 		PRAGMA_SIMD \
-		for ( dim_t i = 0; i < n; ++i ) \
+		for ( dim_t i = 0; i < n0; ++i ) \
 		{ \
 			PASTEMAC(ch,scals)( alpha_conj, x[i] ); \
 		} \
 	} \
 	else \
 	{ \
-		for ( dim_t i = 0; i < n; ++i ) \
+		for ( dim_t i = 0; i < n0; ++i ) \
 		{ \
 			PASTEMAC(ch,scals)( alpha_conj, *x ); \
 \