diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c index 8d065f135..904f0c9e7 100644 --- a/frame/compat/bla_scal.c +++ b/frame/compat/bla_scal.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -40,7 +40,7 @@ // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCSCAL -#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \ +#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \ \ void PASTEF772S(chx,cha,blasname) \ ( \ @@ -50,44 +50,42 @@ void PASTEF772S(chx,cha,blasname) \ ) \ { \ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \ - dim_t n0; \ - ftype_x* x0; \ - inc_t incx0; \ - ftype_x alpha_cast; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ - if (*n == 0 || alpha == NULL) { \ + dim_t n0 = (dim_t)(*n); \ + ftype_x *x0 = x; \ + inc_t incx0 = (inc_t)(*incx); \ +\ + if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \ + { \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ return ; \ } \ -\ - /* Convert/typecast negative values of n to zero. */ \ - bli_convert_blas_dim1( *n, n0 ); \ -\ - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ \ - bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \ \ /* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS. that is, we just always sub-optimally implement those cases by casting alpha to ctype_x (potentially the complex domain) and using the homogeneous datatype instance according to that type. */ \ + ftype_x alpha_cast; \ PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \ \ /* Call BLIS interface. */ \ + /* Pass size as negative to stipulate don't use SETV when alpha=0 */ \ PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - n0, \ + -n0, \ &alpha_cast, \ x0, incx0, \ NULL, \ NULL \ ); \ \ - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ }\ diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c index 837b3f62a..7dad24a3c 100644 --- a/frame/compat/bla_scal_amd.c +++ b/frame/compat/bla_scal_amd.c @@ -50,13 +50,16 @@ 1. When alpha == NaN - Propogate the NaN to the vector 2. When alpha == 0 - Perform the SCALV operation completely and don't use setv. + As SCALV kernels are used in many other BLAS APIs where we want setv to be + used in this scenario, here we call the kernels with n=-n to signify that + setv should not be used. */ // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCSCAL -#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \ +#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \ \ void PASTEF772S(chx,cha,blasname) \ ( \ @@ -66,55 +69,42 @@ void PASTEF772S(chx,cha,blasname) \ ) \ { \ AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \ - dim_t n0; \ - ftype_x* x0; \ - inc_t incx0; \ - ftype_x alpha_cast; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ - if (*n == 0 || alpha == NULL) { \ - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ - return ; \ - } \ + dim_t n0 = (dim_t)(*n); \ + ftype_x *x0 = x; \ + inc_t incx0 = (inc_t)(*incx); \ \ - /* Convert/typecast negative values of n to zero. */ \ - bli_convert_blas_dim1( *n, n0 ); \ -\ - /* If the input increments are less than or equal to zero, return. */ \ - if ( (*incx) <= 0 ) { \ + if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \ + { \ AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ return ; \ - } else { \ - incx0 = ( inc_t )(*incx); \ - x0 = (x); \ } \ \ /* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS. that is, we just always sub-optimally implement those cases by casting alpha to ctype_x (potentially the complex domain) and using the homogeneous datatype instance according to that type. */ \ + ftype_x alpha_cast; \ PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \ -\ - /* If alpha is a unit scalar, return early. */ \ - if ( PASTEMAC(c, eq1)(alpha_cast) ) { \ - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \ - return ; \ - } \ \ /* Call BLIS interface. */ \ + /* Pass size as negative to stipulate don't use SETV when alpha=0 */ \ PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ - n0, \ + -n0, \ &alpha_cast, \ x0, incx0, \ NULL, \ NULL \ ); \ \ - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ }\ @@ -139,82 +129,72 @@ void sscal_blis_impl { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx ); - dim_t n0; - float* x0; - inc_t incx0; + /* Initialize BLIS. */ //bli_init_auto(); - if ((*n) <= 0 || alpha == NULL || bli_seq1(*alpha)) + dim_t n0 = (dim_t)(*n); + float *x0 = x; + inc_t incx0 = (inc_t)(*incx); + + /* + Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception + Return early when alpha pointer is NULL - BLIS exception + */ + if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha)) { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return; + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + //bli_finalize_auto(); + return; } - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); - - /* If the input increments are less than or equal to zero, return. */ - if ( (*incx) <= 0 ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return ; - } - else - { - x0 = (x); - incx0 = ( inc_t )(*incx); - } + // Definition of function pointer + sscalv_ker_ft scalv_ker_ptr; cntx_t *cntx = NULL; // Query the architecture ID arch_t id = bli_arch_query_id(); - /* - Function pointer declaration for the function - that will be used by this API - */ - sscalv_ker_ft scalv_ker_ptr; // DSCALV - // Pick the kernel based on the architecture ID switch (id) { - case BLIS_ARCH_ZEN5: - case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN5: + case BLIS_ARCH_ZEN4: #if defined(BLIS_KERNELS_ZEN4) - scalv_ker_ptr = bli_sscalv_zen_int_avx512; - - break; + scalv_ker_ptr = bli_sscalv_zen_int_avx512; + break; #endif - case BLIS_ARCH_ZEN: - case BLIS_ARCH_ZEN2: - case BLIS_ARCH_ZEN3: - scalv_ker_ptr = bli_sscalv_zen_int10; + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: + scalv_ker_ptr = bli_sscalv_zen_int10; + break; - break; - default: + default: - // For non-Zen architectures, query the context - cntx = bli_gks_query_cntx(); + // For non-Zen architectures, query the context + cntx = bli_gks_query_cntx(); - // Query the context for the kernel function pointers for sscalv - scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx); + // Query the context for the kernel function pointers for sscalv + scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx); } + // Invoke the function based on the kernel function pointer + // Pass size as negative to stipulate don't use SETV when alpha=0 scalv_ker_ptr ( BLIS_NO_CONJUGATE, - n0, + -n0, (float *)alpha, x0, incx0, cntx ); - /* Finalize BLIS. */ - // bli_finalize_auto(); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ + //bli_finalize_auto(); } #ifdef BLIS_ENABLE_BLAS void sscal_ @@ -224,7 +204,7 @@ void sscal_ float* x, const f77_int* incx ) { - sscal_blis_impl( n, alpha, x, incx ); + sscal_blis_impl( n, alpha, x, incx ); } #endif void dscal_blis_impl @@ -236,65 +216,54 @@ void dscal_blis_impl { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx ); - dim_t n_elem; -#ifdef BLIS_ENABLE_OPENMP - dim_t ST_THRESH = 30000; -#endif - double* x0; - inc_t incx0; - /* Initialize BLIS */ + /* Initialize BLIS. */ //bli_init_auto(); - /* Convert typecast negative values of n to zero. */ - if ( *n < 0 ) n_elem = ( dim_t )0; - else n_elem = ( dim_t )(*n); + dim_t n0 = (dim_t)(*n); + double *x0 = x; + inc_t incx0 = (inc_t)(*incx); /* Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception Return early when alpha pointer is NULL - BLIS exception */ - if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha)) + if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha)) { AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + //bli_finalize_auto(); return; } - /* If the input increments are less than or equal to zero, return. */ - if ( (*incx) <= 0 ) - { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); - return ; - } - else - { - x0 = (x); - incx0 = ( inc_t )(*incx); - } - - // Definition of function pointer + // Definition of function pointer dscalv_ker_ft scalv_ker_ptr; cntx_t *cntx = NULL; +#ifdef BLIS_ENABLE_OPENMP + dim_t ST_THRESH = 30000; +#endif + // Query the architecture ID - arch_t arch_id_local = bli_arch_query_id(); + arch_t id = bli_arch_query_id(); // Pick the kernel based on the architecture ID - switch (arch_id_local) + switch (id) { - case BLIS_ARCH_ZEN5: - case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN5: + case BLIS_ARCH_ZEN4: #if defined(BLIS_KERNELS_ZEN4) - scalv_ker_ptr = bli_dscalv_zen_int_avx512; + // AVX512 Kernel + scalv_ker_ptr = bli_dscalv_zen_int_avx512; #ifdef BLIS_ENABLE_OPENMP - ST_THRESH = 30000; + ST_THRESH = 30000; #endif - break; + break; #endif - case BLIS_ARCH_ZEN: - case BLIS_ARCH_ZEN2: - case BLIS_ARCH_ZEN3: + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: // AVX2 Kernel scalv_ker_ptr = bli_dscalv_zen_int10; @@ -303,9 +272,9 @@ void dscal_blis_impl #endif break; - default: + default: - // Query the context + // For non-Zen architectures, query the context cntx = bli_gks_query_cntx(); // Query the function pointer using the context @@ -315,25 +284,28 @@ void dscal_blis_impl #ifdef BLIS_ENABLE_OPENMP /* - If the optimial number of threads is 1, the OpenMP and - 'bli_nthreads_l1'overheads are avoided by calling the + If the optimal number of threads is 1, the OpenMP and + 'bli_nthreads_l1' overheads are avoided by calling the function directly. This ensures that performance of dscalv does not drop for single thread when OpenMP is enabled. */ - if (n_elem <= ST_THRESH) + if (n0 <= ST_THRESH) { #endif + // Invoke the function based on the kernel function pointer + // Pass size as negative to stipulate don't use SETV when alpha=0 scalv_ker_ptr ( BLIS_NO_CONJUGATE, - n_elem, + -n0, (double *)alpha, x0, incx0, cntx ); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - + /* Finalize BLIS. */ + //bli_finalize_auto(); return; #ifdef BLIS_ENABLE_OPENMP } @@ -354,14 +326,14 @@ void dscal_blis_impl BLIS_SCALV_KER, BLIS_DOUBLE, BLIS_DOUBLE, - arch_id_local, - n_elem, + id, + n0, &nt ); _Pragma("omp parallel num_threads(nt)") { - dim_t start, end, length; + dim_t start, end, length; thrinfo_t thrinfo_vec; // The block size is the minimum factor, whose multiple will ensure that only @@ -383,7 +355,7 @@ void dscal_blis_impl bli_thread_range_sub ( &thrinfo_vec, - n_elem, + n0, block_size, FALSE, &start, @@ -396,22 +368,21 @@ void dscal_blis_impl double *x_thread_local = x0 + (start * incx0); // Invoke the function based on the kernel function pointer + // Pass size as negative to stipulate don't use SETV when alpha=0 scalv_ker_ptr ( BLIS_NO_CONJUGATE, - length, + -length, (double *)alpha, x_thread_local, incx0, cntx ); } - /* Finalize BLIS. */ - // bli_finalize_auto(); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - + /* Finalize BLIS. */ + //bli_finalize_auto(); #endif - } #ifdef BLIS_ENABLE_BLAS void dscal_ @@ -421,7 +392,7 @@ void dscal_ double* x, const f77_int* incx ) { - dscal_blis_impl( n, alpha, x, incx ); + dscal_blis_impl( n, alpha, x, incx ); } #endif void zdscal_blis_impl @@ -433,19 +404,23 @@ void zdscal_blis_impl { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *) alpha, *n, *incx ); - dim_t n_elem = (dim_t)(*n); - dcomplex* x0 = x; - inc_t incx0 = (inc_t)(*incx); + /* Initialize BLIS. */ //bli_init_auto(); + dim_t n0 = (dim_t)(*n); + dcomplex* x0 = x; + inc_t incx0 = (inc_t)(*incx); + /* - When n is zero or the alpha pointer passed is null - or the incx is zero or alpha is 1, return early. + Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception + Return early when alpha pointer is NULL - BLIS exception */ - if ((n_elem <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha)) + if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha)) { AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + //bli_finalize_auto(); return; } @@ -458,30 +433,34 @@ void zdscal_blis_impl cntx_t *cntx = NULL; +#ifdef BLIS_ENABLE_OPENMP + dim_t ST_THRESH = 10000; +#endif + // Query the architecture ID - arch_t arch_id_local = bli_arch_query_id(); + arch_t id = bli_arch_query_id(); // Pick the kernel based on the architecture ID - switch (arch_id_local) + switch (id) { - case BLIS_ARCH_ZEN5: - case BLIS_ARCH_ZEN4: + case BLIS_ARCH_ZEN5: + case BLIS_ARCH_ZEN4: #if defined(BLIS_KERNELS_ZEN4) // AVX512 Kernel scalv_ker_ptr = bli_zdscalv_zen_int_avx512; break; #endif - case BLIS_ARCH_ZEN: - case BLIS_ARCH_ZEN2: - case BLIS_ARCH_ZEN3: + case BLIS_ARCH_ZEN: + case BLIS_ARCH_ZEN2: + case BLIS_ARCH_ZEN3: // AVX2 Kernel scalv_ker_ptr = bli_zdscalv_zen_int10; break; - default: + default: - // Query the context + // For non-Zen architectures, query the context cntx = bli_gks_query_cntx(); // Query the function pointer using the context @@ -489,6 +468,32 @@ void zdscal_blis_impl } #ifdef BLIS_ENABLE_OPENMP + /* + If the optimal number of threads is 1, the OpenMP and + 'bli_nthreads_l1' overheads are avoided by calling the + function directly. This ensures that performance of dscalv + does not drop for single thread when OpenMP is enabled. + */ + if (n0 <= ST_THRESH) + { +#endif + // Invoke the function based on the kernel function pointer + // Pass size as negative to stipulate don't use SETV when alpha=0 + scalv_ker_ptr + ( + BLIS_NO_CONJUGATE, + -n0, + (dcomplex *)&alpha_cast, + x0, incx0, + cntx + ); + + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ + //bli_finalize_auto(); + return; +#ifdef BLIS_ENABLE_OPENMP + } /* Initializing the number of thread to one @@ -506,33 +511,11 @@ void zdscal_blis_impl BLIS_SCALV_KER, BLIS_DCOMPLEX, BLIS_DOUBLE, - arch_id_local, - n_elem, + id, + n0, &nt ); - /* - If the number of optimum threads is 1, the OpenMP overhead - is avoided by calling the function directly - */ - if (nt == 1) - { -#endif - scalv_ker_ptr - ( - BLIS_NO_CONJUGATE, - n_elem, - (dcomplex *)&alpha_cast, - x0, incx0, - cntx - ); - - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) - - return; -#ifdef BLIS_ENABLE_OPENMP - } - _Pragma("omp parallel num_threads(nt)") { dim_t start, length; @@ -549,7 +532,7 @@ void zdscal_blis_impl */ bli_thread_vector_partition ( - n_elem, + n0, nt_use, &start, &length, thread_id @@ -559,18 +542,21 @@ void zdscal_blis_impl dcomplex *x_thread_local = x0 + (start * incx0); // Invoke the function based on the kernel function pointer + // Pass size as negative to stipulate don't use SETV when alpha=0 scalv_ker_ptr ( BLIS_NO_CONJUGATE, - length, + -length, (dcomplex *)&alpha_cast, x_thread_local, incx0, cntx ); } -#endif AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ + //bli_finalize_auto(); +#endif } #ifdef BLIS_ENABLE_BLAS void zdscal_ @@ -594,22 +580,27 @@ void cscal_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', (void *)alpha, *n, *incx); + /* Initialize BLIS. */ + //bli_init_auto(); + dim_t n0 = (dim_t)(*n); scomplex *x0 = x; inc_t incx0 = (inc_t)(*incx); /* - When n is zero or the alpha pointer passed is null - or the incx is zero or alpha is 1, return early. + Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception + Return early when alpha pointer is NULL - BLIS exception */ if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha)) { AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + //bli_finalize_auto(); return; } // Definition of function pointer - cscalv_ker_ft scalv_fun_ptr; + cscalv_ker_ft scalv_ker_ptr; cntx_t* cntx = NULL; @@ -622,40 +613,42 @@ void cscal_blis_impl case BLIS_ARCH_ZEN5: case BLIS_ARCH_ZEN4: #if defined(BLIS_KERNELS_ZEN4) - // AVX512 Kernel - scalv_fun_ptr = bli_cscalv_zen_int_avx512; - break; + // AVX512 Kernel + scalv_ker_ptr = bli_cscalv_zen_int_avx512; + break; #endif case BLIS_ARCH_ZEN: case BLIS_ARCH_ZEN2: case BLIS_ARCH_ZEN3: - // AVX2 Kernel - scalv_fun_ptr = bli_cscalv_zen_int; - break; + // AVX2 Kernel + scalv_ker_ptr = bli_cscalv_zen_int; + break; default: - // Query the context + // For non-Zen architectures, query the context cntx = bli_gks_query_cntx(); // Query the function pointer using the context - scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx); + scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx); } - // Call the function based on the function pointer assigned above - scalv_fun_ptr + // Invoke the function based on the kernel function pointer + // Pass size as negative to stipulate don't use SETV when alpha=0 + scalv_ker_ptr ( BLIS_NO_CONJUGATE, - n0, + -n0, (scomplex*) alpha, x0, incx0, cntx ); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ + //bli_finalize_auto(); } - #ifdef BLIS_ENABLE_BLAS void cscal_ ( @@ -678,22 +671,27 @@ void zscal_blis_impl AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx); + /* Initialize BLIS. */ + //bli_init_auto(); + dim_t n0 = (dim_t)(*n); dcomplex *x0 = x; inc_t incx0 = (inc_t)(*incx); /* - When n is zero or the alpha pointer passed is null - or the incx is zero or alpha is 1, return early. + Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception + Return early when alpha pointer is NULL - BLIS exception */ if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha)) { AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); + /* Finalize BLIS. */ + //bli_finalize_auto(); return; } // Definition of function pointer - zscalv_ker_ft scalv_fun_ptr; + zscalv_ker_ft scalv_ker_ptr; cntx_t* cntx = NULL; @@ -707,7 +705,7 @@ void zscal_blis_impl case BLIS_ARCH_ZEN4: #if defined(BLIS_KERNELS_ZEN4) // AVX512 Kernel - scalv_fun_ptr = bli_zscalv_zen_int_avx512; + scalv_ker_ptr = bli_zscalv_zen_int_avx512; break; #endif case BLIS_ARCH_ZEN: @@ -715,29 +713,32 @@ void zscal_blis_impl case BLIS_ARCH_ZEN3: // AVX2 Kernel - scalv_fun_ptr = bli_zscalv_zen_int; + scalv_ker_ptr = bli_zscalv_zen_int; break; default: - // Query the context + // For non-Zen architectures, query the context cntx = bli_gks_query_cntx(); // Query the function pointer using the context - scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx); + scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx); } - // Call the function based on the function pointer assigned above - scalv_fun_ptr + // Invoke the function based on the kernel function pointer + // Pass size as negative to stipulate don't use SETV when alpha=0 + scalv_ker_ptr ( BLIS_NO_CONJUGATE, - n0, + -n0, (dcomplex*) alpha, x0, incx0, cntx ); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) + /* Finalize BLIS. */ + //bli_finalize_auto(); } #ifdef BLIS_ENABLE_BLAS void zscal_ @@ -751,4 +752,4 @@ void zscal_ } #endif -GENTFUNCSCAL( scomplex, float, c, s, scal, scalv ) +GENTFUNCSCAL( scomplex, float, c, s, s, scal, scalv ) diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index fa3ea5201..940f0f2e8 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -174,12 +174,12 @@ GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) #define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \ \ -GENTFUNCSCAL( float, float, s, , blasname, blisname ) \ -GENTFUNCSCAL( double, double, d, , blasname, blisname ) \ -GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \ -GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \ -GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \ -GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname ) +GENTFUNCSCAL( float, float, s, , s, blasname, blisname ) \ +GENTFUNCSCAL( double, double, d, , d, blasname, blisname ) \ +GENTFUNCSCAL( scomplex, scomplex, c, , c, blasname, blisname ) \ +GENTFUNCSCAL( dcomplex, dcomplex, z, , z, blasname, blisname ) \ +GENTFUNCSCAL( scomplex, float, c, s, s, blasname, blisname ) \ +GENTFUNCSCAL( dcomplex, double, z, d, d, blasname, blisname ) // --GEMMT specific kernels ---------------------------------------------------- diff --git a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp index ee77340d5..f259892eb 100644 --- a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp @@ -36,10 +36,10 @@ #include "test_scalv.h" class cscalvGeneric : - public ::testing::TestWithParam> {}; + public ::testing::TestWithParam> {}; // alpha // Tests using random integers as vector elements. @@ -78,42 +78,140 @@ TEST_P( cscalvGeneric, API ) test_scalv( conj_alpha, n, incx, alpha, thresh ); } -// Black box testing for generic and main use of cscal. +// Black box testing for generic use of dscal. INSTANTIATE_TEST_SUITE_P( - Blackbox, + unitPositiveIncrementSmall, cscalvGeneric, ::testing::Combine( - ::testing::Values('n' -#ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. -#endif - ), // n: use x, c: use conj(x) - ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}) // alpha + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + scomplex{-5.1, -7.3}, + scomplex{ 1.0, 1.0}, + scomplex{ 7.3, 5.1} + ) ), (::scalvGenericPrint()) ); +// Black box testing for generic use of dscal. +INSTANTIATE_TEST_SUITE_P( + unitPositiveIncrementLarge, + cscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + scomplex{-5.1, -7.3}, + scomplex{ 1.0, 1.0}, + scomplex{ 7.3, 5.1} + ) + ), + (::scalvGenericPrint()) + ); -// Test for non-unit increments. -// Only test very few cases as sanity check. +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementSmall, + cscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(9), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + scomplex{-5.1, -7.3}, + scomplex{ 1.0, 1.0}, + scomplex{ 7.3, 5.1} + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementLarge, + cscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + scomplex{-5.1, -7.3}, + scomplex{ 1.0, 1.0}, + scomplex{ 7.3, 5.1} + ) + ), + (::scalvGenericPrint()) + ); + +#ifndef TEST_BLIS_TYPED +// alpha=0 testing only for BLAS and CBLAS as +// BLIS uses setv and won't propagate Inf and NaNs +INSTANTIATE_TEST_SUITE_P( + alphaZero, + cscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + scomplex{ 0.0, 0.0} + ) + ), + (::scalvGenericPrint()) + ); +#endif + +#ifdef TEST_BLIS_TYPED +// Test when conjugate of x is used as an argument. This option is BLIS-api specific. +// Only test very few cases as sanity check since conj(x) = x for real types. // We can modify the values using implementantion details. INSTANTIATE_TEST_SUITE_P( - NonUnitPositiveIncrements, + conjalpha, cscalvGeneric, ::testing::Combine( - ::testing::Values('n' -#ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. -#endif - ), // n: use x, c: use conj(x) - ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x - ::testing::Values(scomplex{4.0, 3.1}) // alpha + ::testing::Values('c'), // c: use conjugate + ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. + ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(scomplex{ 7.3, 5.1}) // alpha ), (::scalvGenericPrint()) ); +#endif #ifndef TEST_BLIS_TYPED // Test for negative increments. @@ -126,7 +224,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('n'), // n: use x, c: use conj(x) ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x - ::testing::Values(scomplex{4.0, 3.1}) // alpha + ::testing::Values(scomplex{ 7.3, 5.1}) // alpha ), (::scalvGenericPrint()) ); diff --git a/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp new file mode 100644 index 000000000..d09afe843 --- /dev/null +++ b/gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp @@ -0,0 +1,219 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include "test_scalv.h" + +class csscalvGeneric : + public ::testing::TestWithParam> {}; // alpha + +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csscalvGeneric); + +// Tests using random integers as vector elements. +TEST_P( csscalvGeneric, API ) +{ + using T = scomplex; + using U = float; + //---------------------------------------------------------- + // Initialize values from the parameters passed through + // test suite instantiation (INSTANTIATE_TEST_SUITE_P). + //---------------------------------------------------------- + // denotes whether alpha or conj(alpha) will be used: + char conj_alpha = std::get<0>(GetParam()); + // vector length: + gtint_t n = std::get<1>(GetParam()); + // stride size for x: + gtint_t incx = std::get<2>(GetParam()); + // alpha + U alpha = std::get<3>(GetParam()); + + // Set the threshold for the errors: + // Check gtestsuite scalv.h or netlib source code for reminder of the + // functionality from which we estimate operation count per element + // of output, and hence the multipler for epsilon. + // No adjustment applied yet for complex data. + double thresh; + if (n == 0) + thresh = 0.0; + else if (alpha == testinghelpers::ZERO() || alpha == testinghelpers::ONE()) + thresh = 0.0; + else + thresh = testinghelpers::getEpsilon(); + + //---------------------------------------------------------- + // Call generic test body using those parameters + //---------------------------------------------------------- + test_scalv( conj_alpha, n, incx, alpha, thresh ); +} + +// bli_csscal not present in BLIS +#ifndef TEST_BLIS_TYPED + +// Black box testing for generic use of dscal. +INSTANTIATE_TEST_SUITE_P( + unitPositiveIncrementSmall, + csscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +// Black box testing for generic use of dscal. +INSTANTIATE_TEST_SUITE_P( + unitPositiveIncrementLarge, + csscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementSmall, + csscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(9), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementLarge, + csscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +// alpha=0 testing only for BLAS and CBLAS as +// BLIS uses setv and won't propagate Inf and NaNs +INSTANTIATE_TEST_SUITE_P( + alphaZero, + csscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + double( 0.0) + ) + ), + (::scalvGenericPrint()) + ); + +// Test for negative increments. +// Only test very few cases as sanity check. +// We can modify the values using implementantion details. +INSTANTIATE_TEST_SUITE_P( + NegativeIncrements, + csscalvGeneric, + ::testing::Combine( + ::testing::Values('n'), // n: use x, c: use conj(x) + ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. + ::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x + ::testing::Values(3) // alpha + ), + (::scalvGenericPrint()) + ); + +#endif // not TEST_BLIS_TYPED + + + + + + diff --git a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp index 1ca853db2..0f5f2f203 100644 --- a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp @@ -79,20 +79,41 @@ TEST_P( dscalvGeneric, API ) // Black box testing for generic use of dscal. INSTANTIATE_TEST_SUITE_P( - unitPositiveIncrement, + unitPositiveIncrementSmall, dscalvGeneric, ::testing::Combine( // conj(alpha): uses n (no_conjugate) since it is real. ::testing::Values('n'), // m: size of vector. - ::testing::Range(gtint_t(10), gtint_t(101), 10), + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + double( 7.0), + double(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +// Black box testing for generic use of dscal. +INSTANTIATE_TEST_SUITE_P( + unitPositiveIncrementLarge, + dscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), // incx: stride of x vector. ::testing::Values( gtint_t(1) ), // alpha: value of scalar. ::testing::Values( - double( 0.0), double( 7.0), double(-3.0) ) @@ -101,21 +122,20 @@ INSTANTIATE_TEST_SUITE_P( ); INSTANTIATE_TEST_SUITE_P( - nonUnitPositiveIncrement, + nonUnitPositiveIncrementSmall, dscalvGeneric, ::testing::Combine( // conj(alpha): uses n (no_conjugate) since it is real. ::testing::Values('n'), // m: size of vector. - ::testing::Range(gtint_t(10), gtint_t(101), 10), + ::testing::Range(gtint_t(1), gtint_t(9), 1), // incx: stride of x vector. ::testing::Values( gtint_t(2), - gtint_t(3) + gtint_t(41) ), // alpha: value of scalar. ::testing::Values( - double( 0.0), double( 7.0), double(-3.0) ) @@ -123,6 +143,54 @@ INSTANTIATE_TEST_SUITE_P( (::scalvGenericPrint()) ); +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementLarge, + dscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + double( 7.0), + double(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +#ifndef TEST_BLIS_TYPED +// alpha=0 testing only for BLAS and CBLAS as +// BLIS uses setv and won't propagate Inf and NaNs +INSTANTIATE_TEST_SUITE_P( + alphaZero, + dscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + double( 0.0) + ) + ), + (::scalvGenericPrint()) + ); +#endif + #ifdef TEST_BLIS_TYPED // Test when conjugate of x is used as an argument. This option is BLIS-api specific. // Only test very few cases as sanity check since conj(x) = x for real types. @@ -140,6 +208,23 @@ INSTANTIATE_TEST_SUITE_P( ); #endif +#ifndef TEST_BLIS_TYPED +// Test for negative increments. +// Only test very few cases as sanity check. +// We can modify the values using implementantion details. +INSTANTIATE_TEST_SUITE_P( + NegativeIncrements, + dscalvGeneric, + ::testing::Combine( + ::testing::Values('n'), // n: use x, c: use conj(x) + ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. + ::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x + ::testing::Values(3) // alpha + ), + (::scalvGenericPrint()) + ); +#endif + #if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC) INSTANTIATE_TEST_SUITE_P( AOCLDynamic, @@ -151,6 +236,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( gtint_t( 30000), // nt_ideal = 1 gtint_t( 100000), // nt_ideal = 2 + gtint_t( 486919), // nt_ideal = 8 gtint_t( 500000), // nt_ideal = 8 gtint_t( 2500000), // nt_ideal = 12 gtint_t( 4000000), // nt_ideal = 16 @@ -160,7 +246,8 @@ INSTANTIATE_TEST_SUITE_P( ), // incx: stride of x vector. ::testing::Values( - gtint_t(1) + gtint_t(1), + gtint_t(3) ), // alpha: value of scalar. ::testing::Values( @@ -169,4 +256,34 @@ INSTANTIATE_TEST_SUITE_P( ), (::scalvGenericPrint()) ); + +#ifndef TEST_BLIS_TYPED +// alpha=0 testing only for BLAS and CBLAS as +// BLIS uses setv and won't propagate Inf and NaNs +INSTANTIATE_TEST_SUITE_P( + AOCLDynamicAlphaZero, + dscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values( + gtint_t( 89), // nt_ideal = 8 + gtint_t( 486919), // nt_ideal = 8 + gtint_t(25000000) // nt_ideal = max_available + ), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(3) + ), + // alpha: value of scalar. + ::testing::Values( + double( 0.0) + ) + ), + (::scalvGenericPrint()) + ); #endif + +#endif // BLIS_ENABLE_OPENMP && AOCL_DYNAMIC diff --git a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp deleted file mode 100644 index 8bf16f8dc..000000000 --- a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2023 - 2024, Advanced Micro Devices, Inc. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include -#include "test_scalv.h" - -template -class scalv_EIC : public ::testing::Test {}; -typedef ::testing::Types TypeParam; -TYPED_TEST_SUITE(scalv_EIC, TypeParam); - -TYPED_TEST(scalv_EIC, zero_alpha_x_fp) -{ - using T = TypeParam; - gtint_t n = 10, incx = 1; - std::vector x(n); - // Initialize x with random numbers. - testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data() ); - std::vector x_ref(x); - T alpha = T{0}; - - testinghelpers::ref_scalv('n', n, alpha, x_ref.data(), incx); - //---------------------------------------------------------- - // Call BLIS function. - //---------------------------------------------------------- - scalv('n', n, alpha, x.data(), incx); - - //---------------------------------------------------------- - // Compute component-wise error. - //---------------------------------------------------------- - // Set the threshold for the errors: - // Check gtestsuite scalv.h or netlib source code for reminder of the - // functionality from which we estimate operation count per element - // of output, and hence the multipler for epsilon. - double thresh; - if (n == 0) - thresh = 0.0; - else if (alpha == testinghelpers::ZERO() || alpha == testinghelpers::ONE()) - thresh = 0.0; - else - thresh = testinghelpers::getEpsilon(); - - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - computediff( "x", n, x.data(), x_ref.data(), incx, thresh, true ); -} - -TYPED_TEST(scalv_EIC, zero_alpha_x_inf) -{ - using T = TypeParam; - gtint_t n = 10, incx = 1; - std::vector x(n); - // Initialize x with random numbers. - testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data() ); - x[3] = 1.0/0.0; - std::vector x_ref(x); - T alpha = T{0}; - testinghelpers::ref_scalv('n', n, alpha, x_ref.data(), incx); - - //---------------------------------------------------------- - // Call BLIS function. - //---------------------------------------------------------- - scalv('n', n, alpha, x.data(), incx); - - //---------------------------------------------------------- - // Compute component-wise error. - //---------------------------------------------------------- - // Set the threshold for the errors: - // Check gtestsuite scalv.h or netlib source code for reminder of the - // functionality from which we estimate operation count per element - // of output, and hence the multipler for epsilon. - // No adjustment applied yet for complex data. - double thresh; - if (n == 0) - thresh = 0.0; - else if (alpha == testinghelpers::ZERO() || alpha == testinghelpers::ONE()) - thresh = 0.0; - else - thresh = testinghelpers::getEpsilon(); - - //---------------------------------------------------------- - // Call generic test body using those parameters - //---------------------------------------------------------- - computediff( "x", n, x.data(), x_ref.data(), incx, thresh, true ); -} diff --git a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp index 12187bcd4..c45bb8337 100644 --- a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp @@ -63,7 +63,7 @@ TEST_P( sscalvGeneric, API ) // Check gtestsuite scalv.h or netlib source code for reminder of the // functionality from which we estimate operation count per element // of output, and hence the multipler for epsilon. - double thresh; + float thresh; if (n == 0) thresh = 0.0; else if (alpha == testinghelpers::ZERO() || alpha == testinghelpers::ONE()) @@ -77,19 +77,120 @@ TEST_P( sscalvGeneric, API ) test_scalv( conj_alpha, n, incx, alpha, thresh ); } -// Black box testing for generic and main use of sscal. +// Black box testing for generic use of sscal. INSTANTIATE_TEST_SUITE_P( - Blackbox, + unitPositiveIncrementSmall, sscalvGeneric, ::testing::Combine( - ::testing::Values('n'), // n: use x, not conj(x) (since it is real) - ::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(float(3.0), float(-5.0)) // alpha + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) ), (::scalvGenericPrint()) ); +// Black box testing for generic use of dscal. +INSTANTIATE_TEST_SUITE_P( + unitPositiveIncrementLarge, + sscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementSmall, + sscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(17), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementLarge, + sscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + float( 7.0), + float(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +#ifndef TEST_BLIS_TYPED +// alpha=0 testing only for BLAS and CBLAS as +// BLIS uses setv and won't propagate Inf and NaNs +INSTANTIATE_TEST_SUITE_P( + alphaZero, + sscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + float( 0.0) + ) + ), + (::scalvGenericPrint()) + ); +#endif + #ifdef TEST_BLIS_TYPED // Test when conjugate of x is used as an argument. This option is BLIS-api specific. // Only test very few cases as sanity check since conj(x) = x for real types. @@ -101,28 +202,12 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values('c'), // c: use conjugate ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. ::testing::Values(gtint_t(1)), // stride size for x - ::testing::Values(float(9.0)) // alpha + ::testing::Values(float(-3.0)) // alpha ), (::scalvGenericPrint()) ); #endif -// Test for non-unit increments. -// Only test very few cases as sanity check. -// We can modify the values using implementantion details. -INSTANTIATE_TEST_SUITE_P( - NonUnitPositiveIncrements, - sscalvGeneric, - ::testing::Combine( - ::testing::Values('n'), // n: use x - ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. - ::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x - ::testing::Values(float(2.0)) // alpha - ), - (::scalvGenericPrint()) - ); - - #ifndef TEST_BLIS_TYPED // Test for negative increments. // Only test very few cases as sanity check. diff --git a/gtestsuite/testsuite/level1/scalv/test_scalv.h b/gtestsuite/testsuite/level1/scalv/test_scalv.h index c045f6fcc..e4663da97 100644 --- a/gtestsuite/testsuite/level1/scalv/test_scalv.h +++ b/gtestsuite/testsuite/level1/scalv/test_scalv.h @@ -48,6 +48,8 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub // Initialize vector with random numbers. //---------------------------------------------------------- std::vector x = testinghelpers::get_random_vector( -10, 10, n, incx ); + if (alpha == testinghelpers::ZERO()) + testinghelpers::set_vector( n, incx, x.data(), testinghelpers::aocl_extreme() ); //---------------------------------------------------------- // Call reference implementation to get ref results. @@ -64,7 +66,7 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub //---------------------------------------------------------- // Compute component-wise error. //---------------------------------------------------------- - computediff( "x", n, x.data(), x_ref.data(), incx, thresh ); + computediff( "x", n, x.data(), x_ref.data(), incx, thresh, true ); } /** diff --git a/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp index 59d875bda..8e2545597 100644 --- a/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/zdscalv_generic.cpp @@ -82,59 +82,187 @@ TEST_P( zdscalvGeneric, API ) // bli_zdscal not present in BLIS #ifndef TEST_BLIS_TYPED -// Black box testing for zdscal. -// Tests with unit-positive increment. + +// Black box testing for generic use of dscal. INSTANTIATE_TEST_SUITE_P( - unitPositiveIncrement, + unitPositiveIncrementSmall, zdscalvGeneric, ::testing::Combine( // conj(alpha): uses n (no_conjugate) since it is real. - ::testing::Values('n' -#ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. -#endif - ), + ::testing::Values('n'), // m: size of vector. - ::testing::Range(gtint_t(10), gtint_t(101), 10), + ::testing::Range(gtint_t(1), gtint_t(101), 1), // incx: stride of x vector. - ::testing::Values(gtint_t(1)), + ::testing::Values( + gtint_t(1) + ), // alpha: value of scalar. ::testing::Values( - double(-5.1), - double( 0.0), - double( 7.3) + double( 7.0), + double(-3.0) ) ), - (::scalvGenericPrint()) + (::scalvGenericPrint()) ); - -// Tests for non-unit increments. +// Black box testing for generic use of dscal. INSTANTIATE_TEST_SUITE_P( - nonUnitPositiveIncrement, + unitPositiveIncrementLarge, zdscalvGeneric, ::testing::Combine( // conj(alpha): uses n (no_conjugate) since it is real. - ::testing::Values('n' -#ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. -#endif - ), + ::testing::Values('n'), // m: size of vector. - ::testing::Range(gtint_t(10), gtint_t(101), 10), + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + double( 7.0), + double(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementSmall, + zdscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(9), 1), // incx: stride of x vector. ::testing::Values( gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + double( 7.0), + double(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementLarge, + zdscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + double( 7.0), + double(-3.0) + ) + ), + (::scalvGenericPrint()) + ); + +// alpha=0 testing only for BLAS and CBLAS as +// BLIS uses setv and won't propagate Inf and NaNs +INSTANTIATE_TEST_SUITE_P( + alphaZero, + zdscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + double( 0.0) + ) + ), + (::scalvGenericPrint()) + ); + +// Test for negative increments. +// Only test very few cases as sanity check. +// We can modify the values using implementantion details. +INSTANTIATE_TEST_SUITE_P( + NegativeIncrements, + zdscalvGeneric, + ::testing::Combine( + ::testing::Values('n'), // n: use x, c: use conj(x) + ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. + ::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x + ::testing::Values(3) // alpha + ), + (::scalvGenericPrint()) + ); + +#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC) +INSTANTIATE_TEST_SUITE_P( + AOCLDynamic, + zdscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values( + gtint_t( 10000), // nt_ideal = 1 + gtint_t( 20000), // nt_ideal = 4 + gtint_t( 486919), // nt_ideal = 8 + gtint_t( 1000000), // nt_ideal = 8 + gtint_t( 2500000), // nt_ideal = 12 + gtint_t( 5000000), // nt_ideal = 32 + gtint_t( 7000000) // nt_ideal = max_available + ), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), gtint_t(3) ), // alpha: value of scalar. ::testing::Values( - double(-5.1), - double( 0.0), - double( 7.3) + double( 7.0) ) ), - (::scalvGenericPrint()) + (::scalvGenericPrint()) ); +INSTANTIATE_TEST_SUITE_P( + AOCLDynamicAlphaZero, + zdscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values( + gtint_t( 486919), // nt_ideal = 8 + gtint_t( 7000000) // nt_ideal = max_available + ), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(3) + ), + // alpha: value of scalar. + ::testing::Values( + double( 0.0) + ) + ), + (::scalvGenericPrint()) + ); +#endif + #endif // not TEST_BLIS_TYPED diff --git a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp index bf7182d83..20635564b 100644 --- a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp +++ b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp @@ -78,26 +78,22 @@ TEST_P( zscalvGeneric, API ) test_scalv( conj_alpha, n, incx, alpha, thresh ); } -// Black box testing for zscal. -// Tests with unit-positive increment. +// Black box testing for generic use of dscal. INSTANTIATE_TEST_SUITE_P( - unitPositiveIncrement, + unitPositiveIncrementSmall, zscalvGeneric, ::testing::Combine( // conj(alpha): uses n (no_conjugate) since it is real. - ::testing::Values('n' -#ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. -#endif - ), + ::testing::Values('n'), // m: size of vector. - ::testing::Range(gtint_t(10), gtint_t(101), 10), + ::testing::Range(gtint_t(1), gtint_t(101), 1), // incx: stride of x vector. - ::testing::Values(gtint_t(1)), + ::testing::Values( + gtint_t(1) + ), // alpha: value of scalar. ::testing::Values( dcomplex{-5.1, -7.3}, - dcomplex{ 0.0, 0.0}, dcomplex{ 1.0, 1.0}, dcomplex{ 7.3, 5.1} ) @@ -105,32 +101,131 @@ INSTANTIATE_TEST_SUITE_P( (::scalvGenericPrint()) ); - -// Test for non-unit increments. +// Black box testing for generic use of dscal. INSTANTIATE_TEST_SUITE_P( - nonUnitPositiveIncrement, + unitPositiveIncrementLarge, zscalvGeneric, ::testing::Combine( // conj(alpha): uses n (no_conjugate) since it is real. - ::testing::Values('n' -#ifdef TEST_BLIS_TYPED - , 'c' // this option is BLIS-api specific. -#endif - ), + ::testing::Values('n'), // m: size of vector. - ::testing::Range(gtint_t(10), gtint_t(101), 10), + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1) + ), + // alpha: value of scalar. + ::testing::Values( + dcomplex{-5.1, -7.3}, + dcomplex{ 1.0, 1.0}, + dcomplex{ 7.3, 5.1} + ) + ), + (::scalvGenericPrint()) + ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementSmall, + zscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(9), 1), // incx: stride of x vector. ::testing::Values( gtint_t(2), - gtint_t(3) + gtint_t(41) ), // alpha: value of scalar. ::testing::Values( dcomplex{-5.1, -7.3}, - dcomplex{ 0.0, 0.0}, dcomplex{ 1.0, 1.0}, dcomplex{ 7.3, 5.1} ) ), (::scalvGenericPrint()) ); + +INSTANTIATE_TEST_SUITE_P( + nonUnitPositiveIncrementLarge, + zscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)), + // incx: stride of x vector. + ::testing::Values( + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + dcomplex{-5.1, -7.3}, + dcomplex{ 1.0, 1.0}, + dcomplex{ 7.3, 5.1} + ) + ), + (::scalvGenericPrint()) + ); + +#ifndef TEST_BLIS_TYPED +// alpha=0 testing only for BLAS and CBLAS as +// BLIS uses setv and won't propagate Inf and NaNs +INSTANTIATE_TEST_SUITE_P( + alphaZero, + zscalvGeneric, + ::testing::Combine( + // conj(alpha): uses n (no_conjugate) since it is real. + ::testing::Values('n'), + // m: size of vector. + ::testing::Range(gtint_t(1), gtint_t(101), 1), + // incx: stride of x vector. + ::testing::Values( + gtint_t(1), + gtint_t(2), + gtint_t(41) + ), + // alpha: value of scalar. + ::testing::Values( + dcomplex{ 0.0, 0.0} + ) + ), + (::scalvGenericPrint()) + ); +#endif + +#ifdef TEST_BLIS_TYPED +// Test when conjugate of x is used as an argument. This option is BLIS-api specific. +// Only test very few cases as sanity check since conj(x) = x for real types. +// We can modify the values using implementantion details. +INSTANTIATE_TEST_SUITE_P( + conjalpha, + zscalvGeneric, + ::testing::Combine( + ::testing::Values('c'), // c: use conjugate + ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10. + ::testing::Values(gtint_t(1)), // stride size for x + ::testing::Values(dcomplex{ 7.3, 5.1}) // alpha + ), + (::scalvGenericPrint()) + ); +#endif + +#ifndef TEST_BLIS_TYPED +// Test for negative increments. +// Only test very few cases as sanity check. +// We can modify the values using implementantion details. +INSTANTIATE_TEST_SUITE_P( + NegativeIncrements, + zscalvGeneric, + ::testing::Combine( + ::testing::Values('n'), // n: use x, c: use conj(x) + ::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10. + ::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x + ::testing::Values(dcomplex{ 7.3, 5.1}) // alpha + ), + (::scalvGenericPrint()) + ); +#endif diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c index fa337c247..34d6b161c 100644 --- a/kernels/zen/1/bli_scalv_zen_int.c +++ b/kernels/zen/1/bli_scalv_zen_int.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2017 - 2024, Advanced Micro Devices, Inc. All rights reserved. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without @@ -80,9 +80,11 @@ void bli_sscalv_zen_int if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return; // If alpha is zero, use setv (in case y contains NaN or Inf). - if ( PASTEMAC(s,eq0)( *alpha ) ) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 ) { float* zero = bli_s0; + if (cntx == NULL) cntx = bli_gks_query_cntx(); ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f @@ -96,10 +98,12 @@ void bli_sscalv_zen_int return; } + dim_t n0 = bli_abs(n); + // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. - n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); - n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); + n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll ); + n_left = ( n0 ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code @@ -107,7 +111,7 @@ void bli_sscalv_zen_int if ( incx != 1 ) { n_viter = 0; - n_left = n; + n_left = n0; } // Initialize local pointers. @@ -178,10 +182,11 @@ void bli_dscalv_zen_int // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return; - // If alpha is zero, use setv (in case y contains NaN or Inf). - if ( PASTEMAC(d,eq0)( *alpha ) ) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 ) { double* zero = bli_d0; + if (cntx == NULL) cntx = bli_gks_query_cntx(); dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f @@ -195,10 +200,12 @@ void bli_dscalv_zen_int return; } + dim_t n0 = bli_abs(n); + // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. - n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); - n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); + n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll ); + n_left = ( n0 ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code @@ -206,7 +213,7 @@ void bli_dscalv_zen_int if ( incx != 1 ) { n_viter = 0; - n_left = n; + n_left = n0; } // Initialize local pointers. diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index 463ab9ae0..ab5e46af0 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -60,8 +60,8 @@ void bli_sscalv_zen_int10 // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return; - // If alpha is zero, use setv. - if ( PASTEMAC(s,eq0)( *alpha ) ) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 ) { float* zero = bli_s0; if ( cntx == NULL ) cntx = bli_gks_query_cntx(); @@ -78,6 +78,8 @@ void bli_sscalv_zen_int10 return; } + dim_t n0 = bli_abs(n); + // Initialize local pointers. x0 = x; @@ -88,11 +90,11 @@ void bli_sscalv_zen_int10 dim_t option; // Unroll and the loop used is picked based on the input size. - if( n < 300) + if( n0 < 300) { option = 2; } - else if( n < 500) + else if( n0 < 500) { option = 1; } @@ -105,7 +107,7 @@ void bli_sscalv_zen_int10 { case 0: - for ( ; (i + 127) < n; i += 128 ) + for ( ; (i + 127) < n0; i += 128 ) { //Load the input values xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); @@ -175,7 +177,7 @@ void bli_sscalv_zen_int10 case 1 : - for ( ; (i + 95) < n; i += 96 ) + for ( ; (i + 95) < n0; i += 96 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); @@ -227,7 +229,7 @@ void bli_sscalv_zen_int10 case 2: - for ( ; (i + 47) < n; i += 48 ) + for ( ; (i + 47) < n0; i += 48 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); @@ -256,7 +258,7 @@ void bli_sscalv_zen_int10 x0 += 6*n_elem_per_reg; } - for ( ; (i + 23) < n; i += 24 ) + for ( ; (i + 23) < n0; i += 24 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); @@ -273,7 +275,7 @@ void bli_sscalv_zen_int10 x0 += 3*n_elem_per_reg; } - for ( ; (i + 7) < n; i += 8 ) + for ( ; (i + 7) < n0; i += 8 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); @@ -284,7 +286,7 @@ void bli_sscalv_zen_int10 x0 += 1*n_elem_per_reg; } - for ( ; (i + 0) < n; i += 1 ) + for ( ; (i + 0) < n0; i += 1 ) { *x0 *= *alpha; @@ -296,7 +298,7 @@ void bli_sscalv_zen_int10 { const float alphac = *alpha; - for ( ; i < n; ++i ) + for ( ; i < n0; ++i ) { *x0 *= alphac; @@ -329,8 +331,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return; - // If alpha is zero, use setv. - if ( PASTEMAC(d,eq0)( *alpha ) ) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 ) { double* zero = bli_d0; if ( cntx == NULL ) cntx = bli_gks_query_cntx(); @@ -348,6 +350,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 return; } + dim_t n0 = bli_abs(n); + // Initialize local pointers. x0 = x; @@ -358,11 +362,11 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 dim_t option; // Unroll and the loop used is picked based on the input size. - if(n < 200) + if(n0 < 200) { option = 2; } - else if(n < 500) + else if(n0 < 500) { option = 1; } @@ -375,7 +379,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 { case 0: - for (; (i + 63) < n; i += 64 ) + for (; (i + 63) < n0; i += 64 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); @@ -440,7 +444,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 x0 += 16*n_elem_per_reg; } - for (; (i + 47) < n; i += 48 ) + for (; (i + 47) < n0; i += 48 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); @@ -492,7 +496,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 case 1: - for (; (i + 31) < n; i += 32 ) + for (; (i + 31) < n0; i += 32 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); @@ -529,7 +533,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 case 2: - for ( ; (i + 11) < n; i += 12 ) + for ( ; (i + 11) < n0; i += 12 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); @@ -546,7 +550,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 x0 += 3*n_elem_per_reg; } - for ( ; (i + 3) < n; i += 4 ) + for ( ; (i + 3) < n0; i += 4 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); @@ -557,7 +561,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 x0 += 1*n_elem_per_reg; } - for ( ; (i + 0) < n; i += 1 ) + for ( ; (i + 0) < n0; i += 1 ) { *x0 *= *alpha; @@ -569,7 +573,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10 { const double alphac = *alpha; - for ( ; i < n; ++i ) + for ( ; i < n0; ++i ) { *x0 *= alphac; @@ -587,6 +591,30 @@ void bli_zdscalv_zen_int10 cntx_t* restrict cntx ) { + // If the vector dimension is zero, or if alpha is unit, return early. + if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha )) return; + + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 ) + { + // Expert interface of setv is invoked when alpha is zero + dcomplex *zero = bli_z0; + + /* When alpha is zero all the element in x are set to zero */ + PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n, + zero, + x, incx, + cntx, + NULL); + + return; + } + + dim_t n0 = bli_abs(n); + dim_t i = 0; const dim_t n_elem_per_reg = 4; // number of elements per register @@ -607,7 +635,7 @@ void bli_zdscalv_zen_int10 alphav = _mm256_broadcast_sd( &alphac ); - for ( ; ( i + 29 ) < n; i += 30 ) + for ( ; ( i + 29 ) < n0; i += 30 ) { xv[0] = _mm256_loadu_pd( x0 ); xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); @@ -660,7 +688,7 @@ void bli_zdscalv_zen_int10 x0 += 15 * n_elem_per_reg; } - for ( ; ( i + 23 ) < n; i += 24 ) + for ( ; ( i + 23 ) < n0; i += 24 ) { xv[0] = _mm256_loadu_pd( x0 ); xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); @@ -704,7 +732,7 @@ void bli_zdscalv_zen_int10 x0 += 12 * n_elem_per_reg; } - for ( ; ( i + 15 ) < n; i += 16 ) + for ( ; ( i + 15 ) < n0; i += 16 ) { xv[0] = _mm256_loadu_pd( x0 ); xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); @@ -736,7 +764,7 @@ void bli_zdscalv_zen_int10 x0 += 8 * n_elem_per_reg; } - for ( ; ( i + 7 ) < n; i += 8 ) + for ( ; ( i + 7 ) < n0; i += 8 ) { xv[0] = _mm256_loadu_pd( x0 ); xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); @@ -756,7 +784,7 @@ void bli_zdscalv_zen_int10 x0 += 4 * n_elem_per_reg; } - for ( ; ( i + 3 ) < n; i += 4 ) + for ( ; ( i + 3 ) < n0; i += 4 ) { xv[0] = _mm256_loadu_pd( x0 ); xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg ); @@ -770,7 +798,7 @@ void bli_zdscalv_zen_int10 x0 += 2 * n_elem_per_reg; } - for ( ; ( i + 1 ) < n; i += 2 ) + for ( ; ( i + 1 ) < n0; i += 2 ) { xv[0] = _mm256_loadu_pd( x0 ); @@ -795,7 +823,7 @@ void bli_zdscalv_zen_int10 alpha_reg = _mm_set1_pd((*alpha).real); - for (; i < n; ++i) + for (; i < n0; ++i) { x_vec = _mm_loadu_pd(x0); @@ -816,24 +844,14 @@ void bli_cscalv_zen_int cntx_t* restrict cntx ) { - /* - Undefined behaviour - ------------------- + // If the vector dimension is zero, or if alpha is unit, return early. + if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return; - 1. This layer is not BLAS complaint and the kernel results in - undefined behaviour when n <= 0 and incx <= 1. The expectation - is that the application/higher-layer invoking this layer should - the arg checks. - */ - // if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha)) - // return; - - // To Do: This call to SETV needs to be removed for BLAS compliance - // Currently removing this is resulting in ZHERK failures - if (PASTEMAC(c, eq0)(*alpha)) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 ) { // Expert interface of setv is invoked when alpha is zero - scomplex *zero = PASTEMAC(c, 0); + scomplex *zero = bli_c0; /* When alpha is zero all the element in x are set to zero */ PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF) @@ -848,6 +866,8 @@ void bli_cscalv_zen_int return; } + dim_t n0 = bli_abs(n); + dim_t i = 0; scomplex alpha_conj; float *x0 = (float *)x; @@ -897,7 +917,7 @@ void bli_cscalv_zen_int and then store */ - for (; (i + 15) < n; i += 16) + for (; (i + 15) < n0; i += 16) { x_vec_ymm[0] = _mm256_loadu_ps(x0); x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg); @@ -927,7 +947,7 @@ void bli_cscalv_zen_int x0 += 4 * n_elem_per_reg; } - for (; (i + 7) < n; i += 8) + for (; (i + 7) < n0; i += 8) { x_vec_ymm[0] = _mm256_loadu_ps(x0); x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg); @@ -947,7 +967,7 @@ void bli_cscalv_zen_int x0 += 2 * n_elem_per_reg; } - for (; (i + 3) < n; i += 4) + for (; (i + 3) < n0; i += 4) { x_vec_ymm[0] = _mm256_loadu_ps(x0); @@ -969,7 +989,7 @@ void bli_cscalv_zen_int _mm256_zeroupper(); } - for (; i < n; i++) + for (; i < n0; i++) { float x_real, x_imag; x_real = real * (*x0) - imag * (*(x0 + 1)); @@ -991,24 +1011,14 @@ void bli_zscalv_zen_int cntx_t* restrict cntx ) { - /* - Undefined behaviour - ------------------- + // If the vector dimension is zero, or if alpha is unit, return early. + if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return; - 1. This layer is not BLAS complaint and the kernel results in - undefined behaviour when n <= 0 and incx <= 1. The expectation - is that the application/higher-layer invoking this layer should - the arg checks. - */ - // if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha)) - // return; - - // To Do: This call to SETV needs to be removed for BLAS compliance - // Currently removing this is resulting in ZHERK failures - if (PASTEMAC(z, eq0)(*alpha)) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 ) { // Expert interface of setv is invoked when alpha is zero - dcomplex *zero = PASTEMAC(z, 0); + dcomplex *zero = bli_z0; /* When alpha is zero all the element in x are set to zero */ PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF) @@ -1023,6 +1033,8 @@ void bli_zscalv_zen_int return; } + dim_t n0 = bli_abs(n); + dim_t i = 0; dcomplex alpha_conj; double *x0 = (double *)x; @@ -1033,8 +1045,8 @@ void bli_zscalv_zen_int double real = alpha_conj.real; double imag = alpha_conj.imag; - /*When incx is 1 and n >= 2 it is possible to use AVX2 instructions*/ - if (incx == 1 && n >= 2) + /*When incx is 1 and n0 >= 2 it is possible to use AVX2 instructions*/ + if (incx == 1 && n0 >= 2) { dim_t const n_elem_per_reg = 4; @@ -1072,7 +1084,7 @@ void bli_zscalv_zen_int and then store */ - for (; (i + 7) < n; i += 8) + for (; (i + 7) < n0; i += 8) { x_vec_ymm[0] = _mm256_loadu_pd(x0); x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg); @@ -1106,7 +1118,7 @@ void bli_zscalv_zen_int x0 += 4 * n_elem_per_reg; } - for (; (i + 3) < n; i += 4) + for (; (i + 3) < n0; i += 4) { x_vec_ymm[0] = _mm256_loadu_pd(x0); x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg); @@ -1126,7 +1138,7 @@ void bli_zscalv_zen_int x0 += 2 * n_elem_per_reg; } - for (; (i + 1) < n; i += 2) + for (; (i + 1) < n0; i += 2) { x_vec_ymm[0] = _mm256_loadu_pd(x0); @@ -1155,7 +1167,7 @@ void bli_zscalv_zen_int alpha_real_xmm = _mm_set1_pd(real); alpha_imag_xmm = _mm_set1_pd(imag); - for (; i < n; i++) + for (; i < n0; i++) { x_vec_xmm = _mm_loadu_pd(x0); diff --git a/kernels/zen4/1/bli_scalv_zen_int_avx512.c b/kernels/zen4/1/bli_scalv_zen_int_avx512.c index 4d9a05794..a2143a524 100644 --- a/kernels/zen4/1/bli_scalv_zen_int_avx512.c +++ b/kernels/zen4/1/bli_scalv_zen_int_avx512.c @@ -61,13 +61,14 @@ Deviation from BLAS -------------------- - None + Setv is used when alpha=0 unless a negative value of n is supplied. + This only occurs in calls from BLAS and CBLAS scal APIs. Undefined behaviour ------------------- - 1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation - is that these are standard BLAS exceptions and should be handled in a higher layer. + None + */ void bli_sscalv_zen_int_avx512 ( @@ -78,6 +79,30 @@ void bli_sscalv_zen_int_avx512 cntx_t *restrict cntx ) { + // If the vector dimension is zero, or if alpha is unit, return early. + if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return; + + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 ) + { + float *zero = bli_s0; + if (cntx == NULL) cntx = bli_gks_query_cntx(); + ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SETV_KER, cntx); + + f + ( + BLIS_NO_CONJUGATE, + n, + zero, + x, incx, + cntx + ); + + return; + } + + dim_t n0 = bli_abs(n); + dim_t i = 0; float *restrict x0 = x; @@ -89,7 +114,7 @@ void bli_sscalv_zen_int_avx512 __m512 xv[8], alphav; alphav = _mm512_set1_ps(*alpha); - for (i = 0; (i + 127) < n; i += 128) + for (i = 0; (i + 127) < n0; i += 128) { // Loading the input values xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg); @@ -125,7 +150,7 @@ void bli_sscalv_zen_int_avx512 x0 += 8 * n_elem_per_reg; } - for (; (i + 63) < n; i += 64) + for (; (i + 63) < n0; i += 64) { // Loading the input values xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg); @@ -147,7 +172,7 @@ void bli_sscalv_zen_int_avx512 x0 += 4 * n_elem_per_reg; } - for (; (i + 31) < n; i += 32) + for (; (i + 31) < n0; i += 32) { // Loading the input values xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg); @@ -163,7 +188,7 @@ void bli_sscalv_zen_int_avx512 x0 += 2 * n_elem_per_reg; } - for (; (i + 15) < n; i += 16) + for (; (i + 15) < n0; i += 16) { // Loading the input values xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg); @@ -176,7 +201,7 @@ void bli_sscalv_zen_int_avx512 x0 += n_elem_per_reg; } - for (; (i + 7) < n; i += 8) + for (; (i + 7) < n0; i += 8) { // Loading the input values __m256 x_vec = _mm256_loadu_ps(x0); @@ -198,7 +223,7 @@ void bli_sscalv_zen_int_avx512 */ _mm256_zeroupper(); - for (; (i + 3) < n; i += 4) + for (; (i + 3) < n0; i += 4) { // Loading the input values __m128 x_vec = _mm_loadu_ps(x0); @@ -215,7 +240,7 @@ void bli_sscalv_zen_int_avx512 const float alphac = *alpha; - for (; i < n; ++i) + for (; i < n0; ++i) { *x0 *= alphac; @@ -252,13 +277,14 @@ void bli_sscalv_zen_int_avx512 Deviation from BLAS -------------------- - None + Setv is used when alpha=0 unless a negative value of n is supplied. + This only occurs in calls from BLAS and CBLAS scal APIs. Undefined behaviour ------------------- - 1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation - is that these are standard BLAS exceptions and should be handled in a higher layer. + None + */ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 ( @@ -270,11 +296,10 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 ) { // If the vector dimension is zero, or if alpha is unit, return early. - if (bli_zero_dim1(n) || PASTEMAC(d, eq1)(*alpha)) - return; + if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return; - // If alpha is zero, use setv. - if (PASTEMAC(d, eq0)(*alpha)) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 ) { double *zero = bli_d0; if (cntx == NULL) cntx = bli_gks_query_cntx(); @@ -292,6 +317,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 return; } + dim_t n0 = bli_abs(n); + dim_t i = 0; double *restrict x0; @@ -307,7 +334,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 alphav = _mm512_set1_pd(*alpha); __m512d xv[8]; - for (i = 0; (i + 63) < n; i += 64) + for (i = 0; (i + 63) < n0; i += 64) { // Loading the input values xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg); @@ -343,7 +370,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 x0 += 8 * n_elem_per_reg; } - for (; (i + 31) < n; i += 32) + for (; (i + 31) < n0; i += 32) { // Loading the input values xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg); @@ -365,7 +392,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 x0 += 4 * n_elem_per_reg; } - for (; (i + 15) < n; i += 16) + for (; (i + 15) < n0; i += 16) { // Loading the input values xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg); @@ -381,7 +408,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 x0 += 2 * n_elem_per_reg; } - for (; (i + 7) < n; i += 8) + for (; (i + 7) < n0; i += 8) { // Loading the input values xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg); @@ -394,7 +421,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 x0 += n_elem_per_reg; } - for (; (i + 3) < n; i += 4) + for (; (i + 3) < n0; i += 4) { // Loading the input values __m256d x_vec = _mm256_loadu_pd(x0); @@ -416,7 +443,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 */ _mm256_zeroupper(); - for (; (i + 1) < n; i += 2) + for (; (i + 1) < n0; i += 2) { // Loading the input values __m128d x_vec = _mm_loadu_pd(x0); @@ -433,7 +460,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 const double alphac = *alpha; - for (; i < n; ++i) + for (; i < n0; ++i) { *x0 *= alphac; @@ -468,13 +495,14 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512 Deviation from BLAS -------------------- - None + Setv is used when alpha=0 unless a negative value of n is supplied. + This only occurs in calls from BLAS and CBLAS scal APIs. Undefined behaviour ------------------- - 1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation - is that these are standard BLAS exceptions and should be handled in a higher layer. + None + */ void bli_zdscalv_zen_int_avx512 ( @@ -491,6 +519,31 @@ void bli_zdscalv_zen_int_avx512 alpha is passed as double complex to adhere to function pointer definition in BLIS */ + + // If the vector dimension is zero, or if alpha is unit, return early. + if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return; + + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 ) + { + // Expert interface of setv is invoked when alpha is zero + dcomplex *zero = bli_z0; + + /* When alpha is zero all the element in x are set to zero */ + PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF) + ( + BLIS_NO_CONJUGATE, + n, + zero, + x, incx, + cntx, + NULL); + + return; + } + + dim_t n0 = bli_abs(n); + const double alphac = (*alpha).real; dim_t i = 0; @@ -504,7 +557,7 @@ void bli_zdscalv_zen_int_avx512 alphav = _mm512_set1_pd(alphac); - for (; (i + 15) < n; i += 16) + for (; (i + 15) < n0; i += 16) { xv[0] = _mm512_loadu_pd(x0); xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg); @@ -524,7 +577,7 @@ void bli_zdscalv_zen_int_avx512 x0 += 4 * n_elem_per_reg; } - for (; (i + 7) < n; i += 8) + for (; (i + 7) < n0; i += 8) { xv[0] = _mm512_loadu_pd(x0); xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg); @@ -538,7 +591,7 @@ void bli_zdscalv_zen_int_avx512 x0 += 2 * n_elem_per_reg; } - for (; (i + 3) < n; i += 4) + for (; (i + 3) < n0; i += 4) { xv[0] = _mm512_loadu_pd(x0); @@ -549,7 +602,7 @@ void bli_zdscalv_zen_int_avx512 x0 += n_elem_per_reg; } - for (; (i + 1) < n; i += 2) + for (; (i + 1) < n0; i += 2) { __m256d xv = _mm256_loadu_pd(x0); @@ -576,7 +629,7 @@ void bli_zdscalv_zen_int_avx512 alpha_reg = _mm_set1_pd((*alpha).real); - for (; i < n; ++i) + for (; i < n0; ++i) { x_vec = _mm_loadu_pd(x0); @@ -674,8 +727,8 @@ void bli_zdscalv_zen_int_avx512 Undefined behaviour ------------------- - 1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation - is that these are standard BLAS exceptions and should be handled in a higher layer. + None + */ void bli_cscalv_zen_int_avx512 ( @@ -689,14 +742,11 @@ void bli_cscalv_zen_int_avx512 // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return; - /** - * @note Currently this kernel is not BLAS compliant. For BLAS compliance, - * the below call to SETV needs to be removed. - */ - if ( PASTEMAC(c,eq0)(*alpha) ) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 ) { // Expert interface of setv is invoked when alpha is zero - scomplex *zero = PASTEMAC(c,0); + scomplex *zero = bli_c0; /* When alpha is zero all the element in x are set to zero */ PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF) @@ -712,6 +762,8 @@ void bli_cscalv_zen_int_avx512 return; } + dim_t n0 = bli_abs(n); + dim_t i = 0; scomplex alpha_conj; float* restrict x0 = (float*) x; @@ -760,7 +812,7 @@ void bli_cscalv_zen_int_avx512 */ // Processing 96 scomplex elements (192 floats) per iteration - for ( ; (i + 95) < n; i += 96 ) + for ( ; (i + 95) < n0; i += 96 ) { __m512 xv[12], inter[12]; @@ -776,7 +828,7 @@ void bli_cscalv_zen_int_avx512 } // Processing 64 scomplex elements (128 floats) per iteration - for ( ; (i + 63) < n; i += 64 ) + for ( ; (i + 63) < n0; i += 64 ) { __m512 xv[8], inter[8]; @@ -790,7 +842,7 @@ void bli_cscalv_zen_int_avx512 } // Processing 32 scomplex elements (64 floats) per iteration - for ( ; (i + 31) < n; i += 32 ) + for ( ; (i + 31) < n0; i += 32 ) { __m512 xv[4], inter[4]; @@ -802,7 +854,7 @@ void bli_cscalv_zen_int_avx512 } // Processing 16 scomplex elements (32 floats) per iteration - for ( ; (i + 15) < n; i += 16 ) + for ( ; (i + 15) < n0; i += 16 ) { __m512 xv[2], inter[2]; @@ -842,7 +894,7 @@ void bli_cscalv_zen_int_avx512 } // Processing 8 scomplex elements (16 floats) per iteration - for ( ; (i + 7) < n; i += 8 ) + for ( ; (i + 7) < n0; i += 8 ) { __m512 xv[1], inter[1]; @@ -877,21 +929,23 @@ void bli_cscalv_zen_int_avx512 } // Processing remaining elements, if any. - if ( i < n ) { + if ( i < n0 ) + { // Setting the mask bit based on remaining elements. // Since each scomplex element corresponds to 2 floats, - // we need to load and store 2*(n-i) elements. + // we need to load and store 2*(n0-i) elements. - __mmask16 mask = ( 1 << ( 2 * ( n - i ) ) ) - 1; + __mmask16 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1; + + __m512 xv, temp; - __m512 xv, inter; xv = _mm512_maskz_loadu_ps( mask, x0 ); - inter = _mm512_permute_ps( xv, 0xB1 ); + temp = _mm512_permute_ps( xv, 0xB1 ); - inter = _mm512_mul_ps( alphaIv, inter ); + temp = _mm512_mul_ps( alphaIv, temp ); - xv = _mm512_fmaddsub_ps( alphaRv, xv, inter ); + xv = _mm512_fmaddsub_ps( alphaRv, xv, temp ); _mm512_mask_storeu_ps( x0, mask, xv ); } @@ -902,7 +956,7 @@ void bli_cscalv_zen_int_avx512 const float alphaI = alpha_conj.imag; float x0R, x0I; - for (; i < n; ++i) + for (; i < n0; ++i) { x0R = *(x0); x0I = *(x0 + 1); @@ -942,13 +996,14 @@ void bli_cscalv_zen_int_avx512 Deviation from BLAS -------------------- - None + Setv is used when alpha=0 unless a negative value of n is supplied. + This only occurs in calls from BLAS and CBLAS scal APIs. Undefined behaviour ------------------- - 1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation - is that these are standard BLAS exceptions and should be handled in a higher layer. + None + */ void bli_zscalv_zen_int_avx512 ( @@ -960,17 +1015,13 @@ void bli_zscalv_zen_int_avx512 ) { // If the vector dimension is zero, or if alpha is unit, return early. - if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha)) - return; + if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return; - /** - * @note Currently this kernel is not BLAS compliant. For BLAS compliance, - * the below call to SETV needs to be removed. - */ - if (PASTEMAC(z, eq0)(*alpha)) + // If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative). + if (PASTEMAC(z,eq0)( *alpha ) && n > 0 ) { // Expert interface of setv is invoked when alpha is zero - dcomplex *zero = PASTEMAC(z, 0); + dcomplex *zero = bli_z0; /* When alpha is zero all the element in x are set to zero */ PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF) @@ -985,6 +1036,8 @@ void bli_zscalv_zen_int_avx512 return; } + dim_t n0 = bli_abs(n); + dim_t i = 0; dcomplex alpha_conj; double *restrict x0 = (double *)x; @@ -1022,7 +1075,7 @@ void bli_zscalv_zen_int_avx512 */ // Processing 48 dcomplex elements per iteration. - for (; (i + 47) < n; i += 48) + for (; (i + 47) < n0; i += 48) { __m512d xv[12], temp[12]; @@ -1116,7 +1169,7 @@ void bli_zscalv_zen_int_avx512 } // Processing 32 dcomplex elements per iteration. - for (; (i + 31) < n; i += 32) + for (; (i + 31) < n0; i += 32) { __m512d xv[8], temp[8]; xv[0] = _mm512_loadu_pd(x0); @@ -1173,7 +1226,7 @@ void bli_zscalv_zen_int_avx512 } // Processing 16 dcomplex elements per iteration. - for (; (i + 15) < n; i += 16) + for (; (i + 15) < n0; i += 16) { __m512d xv[4], temp[4]; xv[0] = _mm512_loadu_pd(x0); @@ -1205,7 +1258,7 @@ void bli_zscalv_zen_int_avx512 } // Processing 8 dcomplex elements per iteration. - for (; (i + 7) < n; i += 8) + for (; (i + 7) < n0; i += 8) { __m512d xv[2], temp[2]; xv[0] = _mm512_loadu_pd(x0); @@ -1227,7 +1280,7 @@ void bli_zscalv_zen_int_avx512 } // Processing 4 dcomplex elements per iteration. - for (; (i + 3) < n; i += 4) + for (; (i + 3) < n0; i += 4) { __m512d xv, temp; xv = _mm512_loadu_pd(x0); @@ -1244,23 +1297,24 @@ void bli_zscalv_zen_int_avx512 } // Processing the remainder elements. - if( i < n ) + if( i < n0 ) { // Setting the mask bit based on remaining elements // Since each dcomplex elements corresponds to 2 doubles - // we need to load and store 2*(m-i) elements. - __mmask8 mask = (1 << (2 * (n-i)) ) - 1; + // we need to load and store 2*(n0-i) elements. + + __mmask8 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1; __m512d xv, temp, zero; zero = _mm512_setzero_pd(); xv = _mm512_mask_loadu_pd( zero, mask, x0 ); - temp = _mm512_permute_pd(xv, 0x55); + temp = _mm512_permute_pd( xv, 0x55 ); - temp = _mm512_mul_pd(alphaIv, temp); + temp = _mm512_mul_pd( alphaIv, temp ); - xv = _mm512_fmaddsub_pd(alphaRv, xv, temp); + xv = _mm512_fmaddsub_pd( alphaRv, xv, temp ); _mm512_mask_storeu_pd( x0, mask, xv ); } @@ -1272,7 +1326,7 @@ void bli_zscalv_zen_int_avx512 alphaRv = _mm_loaddup_pd(&alphaR); alphaIv = _mm_loaddup_pd(&alphaI); - for (; i < n; ++i) + for (; i < n0; ++i) { x_vec = _mm_loadu_pd(x0); diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c index 4945b637b..29d55e626 100644 --- a/ref_kernels/1/bli_scalv_ref.c +++ b/ref_kernels/1/bli_scalv_ref.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \ \ /* If alpha is zero, use setv. */ \ - if ( PASTEMAC(ch,eq0)( *alpha ) ) \ + if ( PASTEMAC(ch,eq0)( *alpha ) && n > 0) \ { \ ctype* zero = PASTEMAC(ch,0); \ \ @@ -70,6 +71,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ); \ return; \ } \ +\ + dim_t n0 = bli_abs(n); \ \ ctype alpha_conj; \ \ @@ -78,14 +81,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ if ( incx == 1 ) \ { \ PRAGMA_SIMD \ - for ( dim_t i = 0; i < n; ++i ) \ + for ( dim_t i = 0; i < n0; ++i ) \ { \ PASTEMAC(ch,scals)( alpha_conj, x[i] ); \ } \ } \ else \ { \ - for ( dim_t i = 0; i < n; ++i ) \ + for ( dim_t i = 0; i < n0; ++i ) \ { \ PASTEMAC(ch,scals)( alpha_conj, *x ); \ \