SCALV alpha=zero BLAS compliance

SCALV is used directly by BLAS, CBLAS and BLIS scal{v} APIs but
also within many other APIs to handle special cases. In general
it is preferred to use SETV when alpha=0, but BLAS and CBLAS
continue to multiple all vector element by alpha. This has
different behaviour for propagating NaNs or Infs.

Changes in this commit:
- Standardize early returns from SCALV reference and optimized
  kernels.
- User supplied N<0 is handled at the top level API layer. Use
  negative values of N in kernel calls to signify that SETV
  should _not_ be used when alpha=0. This should only be
  required in SCALV.
- Include serial threshold in zdscal (as in dscal) to reduce
  overhead for small problem sizes.
- Code tidying to make different variants more consistent.
- More standardization of tests in SCALV gtestsuite programs.
- Remove scalv_extreme_cases.cpp as it is now redundant.

AMD-Internal: [CPUPL-4415]
Change-Id: I42e98875ceaea224cc98d0cdfe0133c9abc3edae
(cherry picked from commit a07e041b1f)
This commit is contained in:
Edward Smyth
2024-09-05 09:31:34 -04:00
parent 601e4fbb87
commit 4eabb5cd2e
15 changed files with 1297 additions and 595 deletions

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
// Define BLAS-to-BLIS interfaces.
//
#undef GENTFUNCSCAL
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
\
void PASTEF772S(chx,cha,blasname) \
( \
@@ -50,44 +50,42 @@ void PASTEF772S(chx,cha,blasname) \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
dim_t n0; \
ftype_x* x0; \
inc_t incx0; \
ftype_x alpha_cast; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
if (*n == 0 || alpha == NULL) { \
dim_t n0 = (dim_t)(*n); \
ftype_x *x0 = x; \
inc_t incx0 = (inc_t)(*incx); \
\
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
{ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
return ; \
} \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
that is, we just always sub-optimally implement those cases
by casting alpha to ctype_x (potentially the complex domain) and
using the homogeneous datatype instance according to that type. */ \
ftype_x alpha_cast; \
PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
\
/* Call BLIS interface. */ \
/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n0, \
-n0, \
&alpha_cast, \
x0, incx0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}\

View File

@@ -50,13 +50,16 @@
1. When alpha == NaN - Propogate the NaN to the vector
2. When alpha == 0 - Perform the SCALV operation completely and don't use setv.
As SCALV kernels are used in many other BLAS APIs where we want setv to be
used in this scenario, here we call the kernels with n=-n to signify that
setv should not be used.
*/
//
// Define BLAS-to-BLIS interfaces.
//
#undef GENTFUNCSCAL
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
\
void PASTEF772S(chx,cha,blasname) \
( \
@@ -66,55 +69,42 @@ void PASTEF772S(chx,cha,blasname) \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
dim_t n0; \
ftype_x* x0; \
inc_t incx0; \
ftype_x alpha_cast; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
if (*n == 0 || alpha == NULL) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
return ; \
} \
dim_t n0 = (dim_t)(*n); \
ftype_x *x0 = x; \
inc_t incx0 = (inc_t)(*incx); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are less than or equal to zero, return. */ \
if ( (*incx) <= 0 ) { \
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
{ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
return ; \
} else { \
incx0 = ( inc_t )(*incx); \
x0 = (x); \
} \
\
/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
that is, we just always sub-optimally implement those cases
by casting alpha to ctype_x (potentially the complex domain) and
using the homogeneous datatype instance according to that type. */ \
ftype_x alpha_cast; \
PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
\
/* If alpha is a unit scalar, return early. */ \
if ( PASTEMAC(c, eq1)(alpha_cast) ) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
return ; \
} \
\
/* Call BLIS interface. */ \
/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
n0, \
-n0, \
&alpha_cast, \
x0, incx0, \
NULL, \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}\
@@ -139,82 +129,72 @@ void sscal_blis_impl
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
dim_t n0;
float* x0;
inc_t incx0;
/* Initialize BLIS. */
//bli_init_auto();
if ((*n) <= 0 || alpha == NULL || bli_seq1(*alpha))
dim_t n0 = (dim_t)(*n);
float *x0 = x;
inc_t incx0 = (inc_t)(*incx);
/*
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
Return early when alpha pointer is NULL - BLIS exception
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
//bli_finalize_auto();
return;
}
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are less than or equal to zero, return. */
if ( (*incx) <= 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return ;
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
// Definition of function pointer
sscalv_ker_ft scalv_ker_ptr;
cntx_t *cntx = NULL;
// Query the architecture ID
arch_t id = bli_arch_query_id();
/*
Function pointer declaration for the function
that will be used by this API
*/
sscalv_ker_ft scalv_ker_ptr; // DSCALV
// Pick the kernel based on the architecture ID
switch (id)
{
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
scalv_ker_ptr = bli_sscalv_zen_int_avx512;
break;
scalv_ker_ptr = bli_sscalv_zen_int_avx512;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
scalv_ker_ptr = bli_sscalv_zen_int10;
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
scalv_ker_ptr = bli_sscalv_zen_int10;
break;
break;
default:
default:
// For non-Zen architectures, query the context
cntx = bli_gks_query_cntx();
// For non-Zen architectures, query the context
cntx = bli_gks_query_cntx();
// Query the context for the kernel function pointers for sscalv
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
// Query the context for the kernel function pointers for sscalv
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
}
// Invoke the function based on the kernel function pointer
// Pass size as negative to stipulate don't use SETV when alpha=0
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
n0,
-n0,
(float *)alpha,
x0, incx0,
cntx
);
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
//bli_finalize_auto();
}
#ifdef BLIS_ENABLE_BLAS
void sscal_
@@ -224,7 +204,7 @@ void sscal_
float* x, const f77_int* incx
)
{
sscal_blis_impl( n, alpha, x, incx );
sscal_blis_impl( n, alpha, x, incx );
}
#endif
void dscal_blis_impl
@@ -236,65 +216,54 @@ void dscal_blis_impl
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
dim_t n_elem;
#ifdef BLIS_ENABLE_OPENMP
dim_t ST_THRESH = 30000;
#endif
double* x0;
inc_t incx0;
/* Initialize BLIS */
/* Initialize BLIS. */
//bli_init_auto();
/* Convert typecast negative values of n to zero. */
if ( *n < 0 ) n_elem = ( dim_t )0;
else n_elem = ( dim_t )(*n);
dim_t n0 = (dim_t)(*n);
double *x0 = x;
inc_t incx0 = (inc_t)(*incx);
/*
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
Return early when alpha pointer is NULL - BLIS exception
*/
if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha))
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
//bli_finalize_auto();
return;
}
/* If the input increments are less than or equal to zero, return. */
if ( (*incx) <= 0 )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return ;
}
else
{
x0 = (x);
incx0 = ( inc_t )(*incx);
}
// Definition of function pointer
// Definition of function pointer
dscalv_ker_ft scalv_ker_ptr;
cntx_t *cntx = NULL;
#ifdef BLIS_ENABLE_OPENMP
dim_t ST_THRESH = 30000;
#endif
// Query the architecture ID
arch_t arch_id_local = bli_arch_query_id();
arch_t id = bli_arch_query_id();
// Pick the kernel based on the architecture ID
switch (arch_id_local)
switch (id)
{
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
// AVX512 Kernel
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
#ifdef BLIS_ENABLE_OPENMP
ST_THRESH = 30000;
ST_THRESH = 30000;
#endif
break;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
scalv_ker_ptr = bli_dscalv_zen_int10;
@@ -303,9 +272,9 @@ void dscal_blis_impl
#endif
break;
default:
default:
// Query the context
// For non-Zen architectures, query the context
cntx = bli_gks_query_cntx();
// Query the function pointer using the context
@@ -315,25 +284,28 @@ void dscal_blis_impl
#ifdef BLIS_ENABLE_OPENMP
/*
If the optimial number of threads is 1, the OpenMP and
'bli_nthreads_l1'overheads are avoided by calling the
If the optimal number of threads is 1, the OpenMP and
'bli_nthreads_l1' overheads are avoided by calling the
function directly. This ensures that performance of dscalv
does not drop for single thread when OpenMP is enabled.
*/
if (n_elem <= ST_THRESH)
if (n0 <= ST_THRESH)
{
#endif
// Invoke the function based on the kernel function pointer
// Pass size as negative to stipulate don't use SETV when alpha=0
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
n_elem,
-n0,
(double *)alpha,
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
//bli_finalize_auto();
return;
#ifdef BLIS_ENABLE_OPENMP
}
@@ -354,14 +326,14 @@ void dscal_blis_impl
BLIS_SCALV_KER,
BLIS_DOUBLE,
BLIS_DOUBLE,
arch_id_local,
n_elem,
id,
n0,
&nt
);
_Pragma("omp parallel num_threads(nt)")
{
dim_t start, end, length;
dim_t start, end, length;
thrinfo_t thrinfo_vec;
// The block size is the minimum factor, whose multiple will ensure that only
@@ -383,7 +355,7 @@ void dscal_blis_impl
bli_thread_range_sub
(
&thrinfo_vec,
n_elem,
n0,
block_size,
FALSE,
&start,
@@ -396,22 +368,21 @@ void dscal_blis_impl
double *x_thread_local = x0 + (start * incx0);
// Invoke the function based on the kernel function pointer
// Pass size as negative to stipulate don't use SETV when alpha=0
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
length,
-length,
(double *)alpha,
x_thread_local, incx0,
cntx
);
}
/* Finalize BLIS. */
// bli_finalize_auto();
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
//bli_finalize_auto();
#endif
}
#ifdef BLIS_ENABLE_BLAS
void dscal_
@@ -421,7 +392,7 @@ void dscal_
double* x, const f77_int* incx
)
{
dscal_blis_impl( n, alpha, x, incx );
dscal_blis_impl( n, alpha, x, incx );
}
#endif
void zdscal_blis_impl
@@ -433,19 +404,23 @@ void zdscal_blis_impl
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *) alpha, *n, *incx );
dim_t n_elem = (dim_t)(*n);
dcomplex* x0 = x;
inc_t incx0 = (inc_t)(*incx);
/* Initialize BLIS. */
//bli_init_auto();
dim_t n0 = (dim_t)(*n);
dcomplex* x0 = x;
inc_t incx0 = (inc_t)(*incx);
/*
When n is zero or the alpha pointer passed is null
or the incx is zero or alpha is 1, return early.
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
Return early when alpha pointer is NULL - BLIS exception
*/
if ((n_elem <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
//bli_finalize_auto();
return;
}
@@ -458,30 +433,34 @@ void zdscal_blis_impl
cntx_t *cntx = NULL;
#ifdef BLIS_ENABLE_OPENMP
dim_t ST_THRESH = 10000;
#endif
// Query the architecture ID
arch_t arch_id_local = bli_arch_query_id();
arch_t id = bli_arch_query_id();
// Pick the kernel based on the architecture ID
switch (arch_id_local)
switch (id)
{
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
scalv_ker_ptr = bli_zdscalv_zen_int_avx512;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
scalv_ker_ptr = bli_zdscalv_zen_int10;
break;
default:
default:
// Query the context
// For non-Zen architectures, query the context
cntx = bli_gks_query_cntx();
// Query the function pointer using the context
@@ -489,6 +468,32 @@ void zdscal_blis_impl
}
#ifdef BLIS_ENABLE_OPENMP
/*
If the optimal number of threads is 1, the OpenMP and
'bli_nthreads_l1' overheads are avoided by calling the
function directly. This ensures that performance of dscalv
does not drop for single thread when OpenMP is enabled.
*/
if (n0 <= ST_THRESH)
{
#endif
// Invoke the function based on the kernel function pointer
// Pass size as negative to stipulate don't use SETV when alpha=0
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
-n0,
(dcomplex *)&alpha_cast,
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
//bli_finalize_auto();
return;
#ifdef BLIS_ENABLE_OPENMP
}
/*
Initializing the number of thread to one
@@ -506,33 +511,11 @@ void zdscal_blis_impl
BLIS_SCALV_KER,
BLIS_DCOMPLEX,
BLIS_DOUBLE,
arch_id_local,
n_elem,
id,
n0,
&nt
);
/*
If the number of optimum threads is 1, the OpenMP overhead
is avoided by calling the function directly
*/
if (nt == 1)
{
#endif
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
n_elem,
(dcomplex *)&alpha_cast,
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
#ifdef BLIS_ENABLE_OPENMP
}
_Pragma("omp parallel num_threads(nt)")
{
dim_t start, length;
@@ -549,7 +532,7 @@ void zdscal_blis_impl
*/
bli_thread_vector_partition
(
n_elem,
n0,
nt_use,
&start, &length,
thread_id
@@ -559,18 +542,21 @@ void zdscal_blis_impl
dcomplex *x_thread_local = x0 + (start * incx0);
// Invoke the function based on the kernel function pointer
// Pass size as negative to stipulate don't use SETV when alpha=0
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
length,
-length,
(dcomplex *)&alpha_cast,
x_thread_local, incx0,
cntx
);
}
#endif
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
//bli_finalize_auto();
#endif
}
#ifdef BLIS_ENABLE_BLAS
void zdscal_
@@ -594,22 +580,27 @@ void cscal_blis_impl
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', (void *)alpha, *n, *incx);
/* Initialize BLIS. */
//bli_init_auto();
dim_t n0 = (dim_t)(*n);
scomplex *x0 = x;
inc_t incx0 = (inc_t)(*incx);
/*
When n is zero or the alpha pointer passed is null
or the incx is zero or alpha is 1, return early.
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
Return early when alpha pointer is NULL - BLIS exception
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
//bli_finalize_auto();
return;
}
// Definition of function pointer
cscalv_ker_ft scalv_fun_ptr;
cscalv_ker_ft scalv_ker_ptr;
cntx_t* cntx = NULL;
@@ -622,40 +613,42 @@ void cscal_blis_impl
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
scalv_fun_ptr = bli_cscalv_zen_int_avx512;
break;
// AVX512 Kernel
scalv_ker_ptr = bli_cscalv_zen_int_avx512;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
scalv_fun_ptr = bli_cscalv_zen_int;
break;
// AVX2 Kernel
scalv_ker_ptr = bli_cscalv_zen_int;
break;
default:
// Query the context
// For non-Zen architectures, query the context
cntx = bli_gks_query_cntx();
// Query the function pointer using the context
scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
}
// Call the function based on the function pointer assigned above
scalv_fun_ptr
// Invoke the function based on the kernel function pointer
// Pass size as negative to stipulate don't use SETV when alpha=0
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
n0,
-n0,
(scomplex*) alpha,
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
//bli_finalize_auto();
}
#ifdef BLIS_ENABLE_BLAS
void cscal_
(
@@ -678,22 +671,27 @@ void zscal_blis_impl
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx);
/* Initialize BLIS. */
//bli_init_auto();
dim_t n0 = (dim_t)(*n);
dcomplex *x0 = x;
inc_t incx0 = (inc_t)(*incx);
/*
When n is zero or the alpha pointer passed is null
or the incx is zero or alpha is 1, return early.
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
Return early when alpha pointer is NULL - BLIS exception
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
//bli_finalize_auto();
return;
}
// Definition of function pointer
zscalv_ker_ft scalv_fun_ptr;
zscalv_ker_ft scalv_ker_ptr;
cntx_t* cntx = NULL;
@@ -707,7 +705,7 @@ void zscal_blis_impl
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
// AVX512 Kernel
scalv_fun_ptr = bli_zscalv_zen_int_avx512;
scalv_ker_ptr = bli_zscalv_zen_int_avx512;
break;
#endif
case BLIS_ARCH_ZEN:
@@ -715,29 +713,32 @@ void zscal_blis_impl
case BLIS_ARCH_ZEN3:
// AVX2 Kernel
scalv_fun_ptr = bli_zscalv_zen_int;
scalv_ker_ptr = bli_zscalv_zen_int;
break;
default:
// Query the context
// For non-Zen architectures, query the context
cntx = bli_gks_query_cntx();
// Query the function pointer using the context
scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
}
// Call the function based on the function pointer assigned above
scalv_fun_ptr
// Invoke the function based on the kernel function pointer
// Pass size as negative to stipulate don't use SETV when alpha=0
scalv_ker_ptr
(
BLIS_NO_CONJUGATE,
n0,
-n0,
(dcomplex*) alpha,
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
//bli_finalize_auto();
}
#ifdef BLIS_ENABLE_BLAS
void zscal_
@@ -751,4 +752,4 @@ void zscal_
}
#endif
GENTFUNCSCAL( scomplex, float, c, s, scal, scalv )
GENTFUNCSCAL( scomplex, float, c, s, s, scal, scalv )

View File

@@ -174,12 +174,12 @@ GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname )
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )
GENTFUNCSCAL( float, float, s, , s, blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , d, blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , c, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , z, blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, d, blasname, blisname )
// --GEMMT specific kernels ----------------------------------------------------

View File

@@ -36,10 +36,10 @@
#include "test_scalv.h"
class cscalvGeneric :
public ::testing::TestWithParam<std::tuple<char,
gtint_t,
gtint_t,
scomplex>> {};
public ::testing::TestWithParam<std::tuple<char, // conj_alpha
gtint_t, // n
gtint_t, // incx
scomplex>> {}; // alpha
// Tests using random integers as vector elements.
@@ -78,42 +78,140 @@ TEST_P( cscalvGeneric, API )
test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
}
// Black box testing for generic and main use of cscal.
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
Blackbox,
unitPositiveIncrementSmall,
cscalvGeneric,
::testing::Combine(
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.
#endif
), // n: use x, c: use conj(x)
::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}) // alpha
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
scomplex{-5.1, -7.3},
scomplex{ 1.0, 1.0},
scomplex{ 7.3, 5.1}
)
),
(::scalvGenericPrint<scomplex, scomplex>())
);
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrementLarge,
cscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
scomplex{-5.1, -7.3},
scomplex{ 1.0, 1.0},
scomplex{ 7.3, 5.1}
)
),
(::scalvGenericPrint<scomplex, scomplex>())
);
// Test for non-unit increments.
// Only test very few cases as sanity check.
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementSmall,
cscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(9), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
scomplex{-5.1, -7.3},
scomplex{ 1.0, 1.0},
scomplex{ 7.3, 5.1}
)
),
(::scalvGenericPrint<scomplex, scomplex>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementLarge,
cscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
scomplex{-5.1, -7.3},
scomplex{ 1.0, 1.0},
scomplex{ 7.3, 5.1}
)
),
(::scalvGenericPrint<scomplex, scomplex>())
);
#ifndef TEST_BLIS_TYPED
// alpha=0 testing only for BLAS and CBLAS as
// BLIS uses setv and won't propagate Inf and NaNs
INSTANTIATE_TEST_SUITE_P(
alphaZero,
cscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
scomplex{ 0.0, 0.0}
)
),
(::scalvGenericPrint<scomplex, scomplex>())
);
#endif
#ifdef TEST_BLIS_TYPED
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
// Only test very few cases as sanity check since conj(x) = x for real types.
// We can modify the values using implementantion details.
INSTANTIATE_TEST_SUITE_P(
NonUnitPositiveIncrements,
conjalpha,
cscalvGeneric,
::testing::Combine(
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.
#endif
), // n: use x, c: use conj(x)
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x
::testing::Values(scomplex{4.0, 3.1}) // alpha
::testing::Values('c'), // c: use conjugate
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(scomplex{ 7.3, 5.1}) // alpha
),
(::scalvGenericPrint<scomplex, scomplex>())
);
#endif
#ifndef TEST_BLIS_TYPED
// Test for negative increments.
@@ -126,7 +224,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('n'), // n: use x, c: use conj(x)
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
::testing::Values(scomplex{4.0, 3.1}) // alpha
::testing::Values(scomplex{ 7.3, 5.1}) // alpha
),
(::scalvGenericPrint<scomplex, scomplex>())
);

View File

@@ -0,0 +1,219 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <gtest/gtest.h>
#include "test_scalv.h"
class csscalvGeneric :
public ::testing::TestWithParam<std::tuple<char, // conj_alpha
gtint_t, // n
gtint_t, // incx
float>> {}; // alpha
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csscalvGeneric);
// Tests using random integers as vector elements.
TEST_P( csscalvGeneric, API )
{
using T = scomplex;
using U = float;
//----------------------------------------------------------
// Initialize values from the parameters passed through
// test suite instantiation (INSTANTIATE_TEST_SUITE_P).
//----------------------------------------------------------
// denotes whether alpha or conj(alpha) will be used:
char conj_alpha = std::get<0>(GetParam());
// vector length:
gtint_t n = std::get<1>(GetParam());
// stride size for x:
gtint_t incx = std::get<2>(GetParam());
// alpha
U alpha = std::get<3>(GetParam());
// Set the threshold for the errors:
// Check gtestsuite scalv.h or netlib source code for reminder of the
// functionality from which we estimate operation count per element
// of output, and hence the multipler for epsilon.
// No adjustment applied yet for complex data.
double thresh;
if (n == 0)
thresh = 0.0;
else if (alpha == testinghelpers::ZERO<U>() || alpha == testinghelpers::ONE<U>())
thresh = 0.0;
else
thresh = testinghelpers::getEpsilon<T>();
//----------------------------------------------------------
// Call generic test body using those parameters
//----------------------------------------------------------
test_scalv<T, U>( conj_alpha, n, incx, alpha, thresh );
}
// bli_csscal not present in BLIS
#ifndef TEST_BLIS_TYPED
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrementSmall,
csscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrementLarge,
csscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementSmall,
csscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(9), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementLarge,
csscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
// alpha=0 testing only for BLAS and CBLAS as
// BLIS uses setv and won't propagate Inf and NaNs
INSTANTIATE_TEST_SUITE_P(
alphaZero,
csscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
double( 0.0)
)
),
(::scalvGenericPrint<float, float>())
);
// Test for negative increments.
// Only test very few cases as sanity check.
// We can modify the values using implementantion details.
INSTANTIATE_TEST_SUITE_P(
NegativeIncrements,
csscalvGeneric,
::testing::Combine(
::testing::Values('n'), // n: use x, c: use conj(x)
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
::testing::Values(3) // alpha
),
(::scalvGenericPrint<float, float>())
);
#endif // not TEST_BLIS_TYPED

View File

@@ -79,20 +79,41 @@ TEST_P( dscalvGeneric, API )
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrement,
unitPositiveIncrementSmall,
dscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(10), gtint_t(101), 10),
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
double( 7.0),
double(-3.0)
)
),
(::scalvGenericPrint<double, double>())
);
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrementLarge,
dscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
double( 0.0),
double( 7.0),
double(-3.0)
)
@@ -101,21 +122,20 @@ INSTANTIATE_TEST_SUITE_P(
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrement,
nonUnitPositiveIncrementSmall,
dscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(10), gtint_t(101), 10),
::testing::Range(gtint_t(1), gtint_t(9), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(3)
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
double( 0.0),
double( 7.0),
double(-3.0)
)
@@ -123,6 +143,54 @@ INSTANTIATE_TEST_SUITE_P(
(::scalvGenericPrint<double, double>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementLarge,
dscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
double( 7.0),
double(-3.0)
)
),
(::scalvGenericPrint<double, double>())
);
#ifndef TEST_BLIS_TYPED
// alpha=0 testing only for BLAS and CBLAS as
// BLIS uses setv and won't propagate Inf and NaNs
INSTANTIATE_TEST_SUITE_P(
alphaZero,
dscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
double( 0.0)
)
),
(::scalvGenericPrint<double, double>())
);
#endif
#ifdef TEST_BLIS_TYPED
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
// Only test very few cases as sanity check since conj(x) = x for real types.
@@ -140,6 +208,23 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifndef TEST_BLIS_TYPED
// Test for negative increments.
// Only test very few cases as sanity check.
// We can modify the values using implementantion details.
INSTANTIATE_TEST_SUITE_P(
NegativeIncrements,
dscalvGeneric,
::testing::Combine(
::testing::Values('n'), // n: use x, c: use conj(x)
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
::testing::Values(3) // alpha
),
(::scalvGenericPrint<double, double>())
);
#endif
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
INSTANTIATE_TEST_SUITE_P(
AOCLDynamic,
@@ -151,6 +236,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
gtint_t( 30000), // nt_ideal = 1
gtint_t( 100000), // nt_ideal = 2
gtint_t( 486919), // nt_ideal = 8
gtint_t( 500000), // nt_ideal = 8
gtint_t( 2500000), // nt_ideal = 12
gtint_t( 4000000), // nt_ideal = 16
@@ -160,7 +246,8 @@ INSTANTIATE_TEST_SUITE_P(
),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
gtint_t(1),
gtint_t(3)
),
// alpha: value of scalar.
::testing::Values(
@@ -169,4 +256,34 @@ INSTANTIATE_TEST_SUITE_P(
),
(::scalvGenericPrint<double, double>())
);
#ifndef TEST_BLIS_TYPED
// alpha=0 testing only for BLAS and CBLAS as
// BLIS uses setv and won't propagate Inf and NaNs
INSTANTIATE_TEST_SUITE_P(
AOCLDynamicAlphaZero,
dscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(
gtint_t( 89), // nt_ideal = 8
gtint_t( 486919), // nt_ideal = 8
gtint_t(25000000) // nt_ideal = max_available
),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(3)
),
// alpha: value of scalar.
::testing::Values(
double( 0.0)
)
),
(::scalvGenericPrint<double, double>())
);
#endif
#endif // BLIS_ENABLE_OPENMP && AOCL_DYNAMIC

View File

@@ -1,117 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <gtest/gtest.h>
#include "test_scalv.h"
template <typename T>
class scalv_EIC : public ::testing::Test {};
typedef ::testing::Types<float, double> TypeParam;
TYPED_TEST_SUITE(scalv_EIC, TypeParam);
TYPED_TEST(scalv_EIC, zero_alpha_x_fp)
{
using T = TypeParam;
gtint_t n = 10, incx = 1;
std::vector<T> x(n);
// Initialize x with random numbers.
testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
std::vector<T> x_ref(x);
T alpha = T{0};
testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
//----------------------------------------------------------
// Call BLIS function.
//----------------------------------------------------------
scalv<T>('n', n, alpha, x.data(), incx);
//----------------------------------------------------------
// Compute component-wise error.
//----------------------------------------------------------
// Set the threshold for the errors:
// Check gtestsuite scalv.h or netlib source code for reminder of the
// functionality from which we estimate operation count per element
// of output, and hence the multipler for epsilon.
double thresh;
if (n == 0)
thresh = 0.0;
else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
thresh = 0.0;
else
thresh = testinghelpers::getEpsilon<T>();
//----------------------------------------------------------
// Call generic test body using those parameters
//----------------------------------------------------------
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
}
TYPED_TEST(scalv_EIC, zero_alpha_x_inf)
{
using T = TypeParam;
gtint_t n = 10, incx = 1;
std::vector<T> x(n);
// Initialize x with random numbers.
testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
x[3] = 1.0/0.0;
std::vector<T> x_ref(x);
T alpha = T{0};
testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
//----------------------------------------------------------
// Call BLIS function.
//----------------------------------------------------------
scalv<T>('n', n, alpha, x.data(), incx);
//----------------------------------------------------------
// Compute component-wise error.
//----------------------------------------------------------
// Set the threshold for the errors:
// Check gtestsuite scalv.h or netlib source code for reminder of the
// functionality from which we estimate operation count per element
// of output, and hence the multipler for epsilon.
// No adjustment applied yet for complex data.
double thresh;
if (n == 0)
thresh = 0.0;
else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
thresh = 0.0;
else
thresh = testinghelpers::getEpsilon<T>();
//----------------------------------------------------------
// Call generic test body using those parameters
//----------------------------------------------------------
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
}

View File

@@ -63,7 +63,7 @@ TEST_P( sscalvGeneric, API )
// Check gtestsuite scalv.h or netlib source code for reminder of the
// functionality from which we estimate operation count per element
// of output, and hence the multipler for epsilon.
double thresh;
float thresh;
if (n == 0)
thresh = 0.0;
else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
@@ -77,19 +77,120 @@ TEST_P( sscalvGeneric, API )
test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
}
// Black box testing for generic and main use of sscal.
// Black box testing for generic use of sscal.
INSTANTIATE_TEST_SUITE_P(
Blackbox,
unitPositiveIncrementSmall,
sscalvGeneric,
::testing::Combine(
::testing::Values('n'), // n: use x, not conj(x) (since it is real)
::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(float(3.0), float(-5.0)) // alpha
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrementLarge,
sscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementSmall,
sscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(17), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementLarge,
sscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
float( 7.0),
float(-3.0)
)
),
(::scalvGenericPrint<float, float>())
);
#ifndef TEST_BLIS_TYPED
// alpha=0 testing only for BLAS and CBLAS as
// BLIS uses setv and won't propagate Inf and NaNs
INSTANTIATE_TEST_SUITE_P(
alphaZero,
sscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
float( 0.0)
)
),
(::scalvGenericPrint<float, float>())
);
#endif
#ifdef TEST_BLIS_TYPED
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
// Only test very few cases as sanity check since conj(x) = x for real types.
@@ -101,28 +202,12 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values('c'), // c: use conjugate
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(float(9.0)) // alpha
::testing::Values(float(-3.0)) // alpha
),
(::scalvGenericPrint<float, float>())
);
#endif
// Test for non-unit increments.
// Only test very few cases as sanity check.
// We can modify the values using implementantion details.
INSTANTIATE_TEST_SUITE_P(
NonUnitPositiveIncrements,
sscalvGeneric,
::testing::Combine(
::testing::Values('n'), // n: use x
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x
::testing::Values(float(2.0)) // alpha
),
(::scalvGenericPrint<float, float>())
);
#ifndef TEST_BLIS_TYPED
// Test for negative increments.
// Only test very few cases as sanity check.

View File

@@ -48,6 +48,8 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
// Initialize vector with random numbers.
//----------------------------------------------------------
std::vector<T> x = testinghelpers::get_random_vector<T>( -10, 10, n, incx );
if (alpha == testinghelpers::ZERO<U>())
testinghelpers::set_vector( n, incx, x.data(), testinghelpers::aocl_extreme<T>() );
//----------------------------------------------------------
// Call reference implementation to get ref results.
@@ -64,7 +66,7 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
//----------------------------------------------------------
// Compute component-wise error.
//----------------------------------------------------------
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh );
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
}
/**

View File

@@ -82,59 +82,187 @@ TEST_P( zdscalvGeneric, API )
// bli_zdscal not present in BLIS
#ifndef TEST_BLIS_TYPED
// Black box testing for zdscal.
// Tests with unit-positive increment.
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrement,
unitPositiveIncrementSmall,
zdscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.
#endif
),
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(10), gtint_t(101), 10),
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(gtint_t(1)),
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
double(-5.1),
double( 0.0),
double( 7.3)
double( 7.0),
double(-3.0)
)
),
(::scalvGenericPrint<dcomplex, double>())
(::scalvGenericPrint<double, double>())
);
// Tests for non-unit increments.
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrement,
unitPositiveIncrementLarge,
zdscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.
#endif
),
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(10), gtint_t(101), 10),
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
double( 7.0),
double(-3.0)
)
),
(::scalvGenericPrint<double, double>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementSmall,
zdscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(9), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
double( 7.0),
double(-3.0)
)
),
(::scalvGenericPrint<double, double>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementLarge,
zdscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
double( 7.0),
double(-3.0)
)
),
(::scalvGenericPrint<double, double>())
);
// alpha=0 testing only for BLAS and CBLAS as
// BLIS uses setv and won't propagate Inf and NaNs
INSTANTIATE_TEST_SUITE_P(
alphaZero,
zdscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
double( 0.0)
)
),
(::scalvGenericPrint<double, double>())
);
// Test for negative increments.
// Only test very few cases as sanity check.
// We can modify the values using implementantion details.
INSTANTIATE_TEST_SUITE_P(
NegativeIncrements,
zdscalvGeneric,
::testing::Combine(
::testing::Values('n'), // n: use x, c: use conj(x)
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
::testing::Values(3) // alpha
),
(::scalvGenericPrint<double, double>())
);
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
INSTANTIATE_TEST_SUITE_P(
AOCLDynamic,
zdscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(
gtint_t( 10000), // nt_ideal = 1
gtint_t( 20000), // nt_ideal = 4
gtint_t( 486919), // nt_ideal = 8
gtint_t( 1000000), // nt_ideal = 8
gtint_t( 2500000), // nt_ideal = 12
gtint_t( 5000000), // nt_ideal = 32
gtint_t( 7000000) // nt_ideal = max_available
),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(3)
),
// alpha: value of scalar.
::testing::Values(
double(-5.1),
double( 0.0),
double( 7.3)
double( 7.0)
)
),
(::scalvGenericPrint<dcomplex, double>())
(::scalvGenericPrint<double, double>())
);
INSTANTIATE_TEST_SUITE_P(
AOCLDynamicAlphaZero,
zdscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(
gtint_t( 486919), // nt_ideal = 8
gtint_t( 7000000) // nt_ideal = max_available
),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(3)
),
// alpha: value of scalar.
::testing::Values(
double( 0.0)
)
),
(::scalvGenericPrint<double, double>())
);
#endif
#endif // not TEST_BLIS_TYPED

View File

@@ -78,26 +78,22 @@ TEST_P( zscalvGeneric, API )
test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
}
// Black box testing for zscal.
// Tests with unit-positive increment.
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
unitPositiveIncrement,
unitPositiveIncrementSmall,
zscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.
#endif
),
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(10), gtint_t(101), 10),
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(gtint_t(1)),
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
dcomplex{-5.1, -7.3},
dcomplex{ 0.0, 0.0},
dcomplex{ 1.0, 1.0},
dcomplex{ 7.3, 5.1}
)
@@ -105,32 +101,131 @@ INSTANTIATE_TEST_SUITE_P(
(::scalvGenericPrint<dcomplex, dcomplex>())
);
// Test for non-unit increments.
// Black box testing for generic use of dscal.
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrement,
unitPositiveIncrementLarge,
zscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'
#ifdef TEST_BLIS_TYPED
, 'c' // this option is BLIS-api specific.
#endif
),
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(10), gtint_t(101), 10),
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(1)
),
// alpha: value of scalar.
::testing::Values(
dcomplex{-5.1, -7.3},
dcomplex{ 1.0, 1.0},
dcomplex{ 7.3, 5.1}
)
),
(::scalvGenericPrint<dcomplex, dcomplex>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementSmall,
zscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(9), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(3)
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
dcomplex{-5.1, -7.3},
dcomplex{ 0.0, 0.0},
dcomplex{ 1.0, 1.0},
dcomplex{ 7.3, 5.1}
)
),
(::scalvGenericPrint<dcomplex, dcomplex>())
);
INSTANTIATE_TEST_SUITE_P(
nonUnitPositiveIncrementLarge,
zscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
// incx: stride of x vector.
::testing::Values(
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
dcomplex{-5.1, -7.3},
dcomplex{ 1.0, 1.0},
dcomplex{ 7.3, 5.1}
)
),
(::scalvGenericPrint<dcomplex, dcomplex>())
);
#ifndef TEST_BLIS_TYPED
// alpha=0 testing only for BLAS and CBLAS as
// BLIS uses setv and won't propagate Inf and NaNs
INSTANTIATE_TEST_SUITE_P(
alphaZero,
zscalvGeneric,
::testing::Combine(
// conj(alpha): uses n (no_conjugate) since it is real.
::testing::Values('n'),
// m: size of vector.
::testing::Range(gtint_t(1), gtint_t(101), 1),
// incx: stride of x vector.
::testing::Values(
gtint_t(1),
gtint_t(2),
gtint_t(41)
),
// alpha: value of scalar.
::testing::Values(
dcomplex{ 0.0, 0.0}
)
),
(::scalvGenericPrint<dcomplex, dcomplex>())
);
#endif
#ifdef TEST_BLIS_TYPED
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
// Only test very few cases as sanity check since conj(x) = x for real types.
// We can modify the values using implementantion details.
INSTANTIATE_TEST_SUITE_P(
conjalpha,
zscalvGeneric,
::testing::Combine(
::testing::Values('c'), // c: use conjugate
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(dcomplex{ 7.3, 5.1}) // alpha
),
(::scalvGenericPrint<dcomplex, dcomplex>())
);
#endif
#ifndef TEST_BLIS_TYPED
// Test for negative increments.
// Only test very few cases as sanity check.
// We can modify the values using implementantion details.
INSTANTIATE_TEST_SUITE_P(
NegativeIncrements,
zscalvGeneric,
::testing::Combine(
::testing::Values('n'), // n: use x, c: use conj(x)
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
::testing::Values(dcomplex{ 7.3, 5.1}) // alpha
),
(::scalvGenericPrint<dcomplex, dcomplex>())
);
#endif

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2017 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -80,9 +80,11 @@ void bli_sscalv_zen_int
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
// If alpha is zero, use setv (in case y contains NaN or Inf).
if ( PASTEMAC(s,eq0)( *alpha ) )
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
{
float* zero = bli_s0;
if (cntx == NULL) cntx = bli_gks_query_cntx();
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
f
@@ -96,10 +98,12 @@ void bli_sscalv_zen_int
return;
}
dim_t n0 = bli_abs(n);
// Use the unrolling factor and the number of elements per register
// to compute the number of vectorized and leftover iterations.
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );
// If there is anything that would interfere with our use of contiguous
// vector loads/stores, override n_viter and n_left to use scalar code
@@ -107,7 +111,7 @@ void bli_sscalv_zen_int
if ( incx != 1 )
{
n_viter = 0;
n_left = n;
n_left = n0;
}
// Initialize local pointers.
@@ -178,10 +182,11 @@ void bli_dscalv_zen_int
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
// If alpha is zero, use setv (in case y contains NaN or Inf).
if ( PASTEMAC(d,eq0)( *alpha ) )
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
{
double* zero = bli_d0;
if (cntx == NULL) cntx = bli_gks_query_cntx();
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
f
@@ -195,10 +200,12 @@ void bli_dscalv_zen_int
return;
}
dim_t n0 = bli_abs(n);
// Use the unrolling factor and the number of elements per register
// to compute the number of vectorized and leftover iterations.
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );
// If there is anything that would interfere with our use of contiguous
// vector loads/stores, override n_viter and n_left to use scalar code
@@ -206,7 +213,7 @@ void bli_dscalv_zen_int
if ( incx != 1 )
{
n_viter = 0;
n_left = n;
n_left = n0;
}
// Initialize local pointers.

View File

@@ -60,8 +60,8 @@ void bli_sscalv_zen_int10
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
// If alpha is zero, use setv.
if ( PASTEMAC(s,eq0)( *alpha ) )
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
{
float* zero = bli_s0;
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
@@ -78,6 +78,8 @@ void bli_sscalv_zen_int10
return;
}
dim_t n0 = bli_abs(n);
// Initialize local pointers.
x0 = x;
@@ -88,11 +90,11 @@ void bli_sscalv_zen_int10
dim_t option;
// Unroll and the loop used is picked based on the input size.
if( n < 300)
if( n0 < 300)
{
option = 2;
}
else if( n < 500)
else if( n0 < 500)
{
option = 1;
}
@@ -105,7 +107,7 @@ void bli_sscalv_zen_int10
{
case 0:
for ( ; (i + 127) < n; i += 128 )
for ( ; (i + 127) < n0; i += 128 )
{
//Load the input values
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -175,7 +177,7 @@ void bli_sscalv_zen_int10
case 1 :
for ( ; (i + 95) < n; i += 96 )
for ( ; (i + 95) < n0; i += 96 )
{
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -227,7 +229,7 @@ void bli_sscalv_zen_int10
case 2:
for ( ; (i + 47) < n; i += 48 )
for ( ; (i + 47) < n0; i += 48 )
{
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -256,7 +258,7 @@ void bli_sscalv_zen_int10
x0 += 6*n_elem_per_reg;
}
for ( ; (i + 23) < n; i += 24 )
for ( ; (i + 23) < n0; i += 24 )
{
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
@@ -273,7 +275,7 @@ void bli_sscalv_zen_int10
x0 += 3*n_elem_per_reg;
}
for ( ; (i + 7) < n; i += 8 )
for ( ; (i + 7) < n0; i += 8 )
{
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -284,7 +286,7 @@ void bli_sscalv_zen_int10
x0 += 1*n_elem_per_reg;
}
for ( ; (i + 0) < n; i += 1 )
for ( ; (i + 0) < n0; i += 1 )
{
*x0 *= *alpha;
@@ -296,7 +298,7 @@ void bli_sscalv_zen_int10
{
const float alphac = *alpha;
for ( ; i < n; ++i )
for ( ; i < n0; ++i )
{
*x0 *= alphac;
@@ -329,8 +331,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
// If alpha is zero, use setv.
if ( PASTEMAC(d,eq0)( *alpha ) )
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
{
double* zero = bli_d0;
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
@@ -348,6 +350,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
return;
}
dim_t n0 = bli_abs(n);
// Initialize local pointers.
x0 = x;
@@ -358,11 +362,11 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
dim_t option;
// Unroll and the loop used is picked based on the input size.
if(n < 200)
if(n0 < 200)
{
option = 2;
}
else if(n < 500)
else if(n0 < 500)
{
option = 1;
}
@@ -375,7 +379,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
{
case 0:
for (; (i + 63) < n; i += 64 )
for (; (i + 63) < n0; i += 64 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -440,7 +444,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
x0 += 16*n_elem_per_reg;
}
for (; (i + 47) < n; i += 48 )
for (; (i + 47) < n0; i += 48 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -492,7 +496,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
case 1:
for (; (i + 31) < n; i += 32 )
for (; (i + 31) < n0; i += 32 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -529,7 +533,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
case 2:
for ( ; (i + 11) < n; i += 12 )
for ( ; (i + 11) < n0; i += 12 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
@@ -546,7 +550,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
x0 += 3*n_elem_per_reg;
}
for ( ; (i + 3) < n; i += 4 )
for ( ; (i + 3) < n0; i += 4 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -557,7 +561,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
x0 += 1*n_elem_per_reg;
}
for ( ; (i + 0) < n; i += 1 )
for ( ; (i + 0) < n0; i += 1 )
{
*x0 *= *alpha;
@@ -569,7 +573,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
{
const double alphac = *alpha;
for ( ; i < n; ++i )
for ( ; i < n0; ++i )
{
*x0 *= alphac;
@@ -587,6 +591,30 @@ void bli_zdscalv_zen_int10
cntx_t* restrict cntx
)
{
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha )) return;
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
{
// Expert interface of setv is invoked when alpha is zero
dcomplex *zero = bli_z0;
/* When alpha is zero all the element in x are set to zero */
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx,
NULL);
return;
}
dim_t n0 = bli_abs(n);
dim_t i = 0;
const dim_t n_elem_per_reg = 4; // number of elements per register
@@ -607,7 +635,7 @@ void bli_zdscalv_zen_int10
alphav = _mm256_broadcast_sd( &alphac );
for ( ; ( i + 29 ) < n; i += 30 )
for ( ; ( i + 29 ) < n0; i += 30 )
{
xv[0] = _mm256_loadu_pd( x0 );
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -660,7 +688,7 @@ void bli_zdscalv_zen_int10
x0 += 15 * n_elem_per_reg;
}
for ( ; ( i + 23 ) < n; i += 24 )
for ( ; ( i + 23 ) < n0; i += 24 )
{
xv[0] = _mm256_loadu_pd( x0 );
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -704,7 +732,7 @@ void bli_zdscalv_zen_int10
x0 += 12 * n_elem_per_reg;
}
for ( ; ( i + 15 ) < n; i += 16 )
for ( ; ( i + 15 ) < n0; i += 16 )
{
xv[0] = _mm256_loadu_pd( x0 );
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -736,7 +764,7 @@ void bli_zdscalv_zen_int10
x0 += 8 * n_elem_per_reg;
}
for ( ; ( i + 7 ) < n; i += 8 )
for ( ; ( i + 7 ) < n0; i += 8 )
{
xv[0] = _mm256_loadu_pd( x0 );
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -756,7 +784,7 @@ void bli_zdscalv_zen_int10
x0 += 4 * n_elem_per_reg;
}
for ( ; ( i + 3 ) < n; i += 4 )
for ( ; ( i + 3 ) < n0; i += 4 )
{
xv[0] = _mm256_loadu_pd( x0 );
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
@@ -770,7 +798,7 @@ void bli_zdscalv_zen_int10
x0 += 2 * n_elem_per_reg;
}
for ( ; ( i + 1 ) < n; i += 2 )
for ( ; ( i + 1 ) < n0; i += 2 )
{
xv[0] = _mm256_loadu_pd( x0 );
@@ -795,7 +823,7 @@ void bli_zdscalv_zen_int10
alpha_reg = _mm_set1_pd((*alpha).real);
for (; i < n; ++i)
for (; i < n0; ++i)
{
x_vec = _mm_loadu_pd(x0);
@@ -816,24 +844,14 @@ void bli_cscalv_zen_int
cntx_t* restrict cntx
)
{
/*
Undefined behaviour
-------------------
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;
1. This layer is not BLAS complaint and the kernel results in
undefined behaviour when n <= 0 and incx <= 1. The expectation
is that the application/higher-layer invoking this layer should
the arg checks.
*/
// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
// return;
// To Do: This call to SETV needs to be removed for BLAS compliance
// Currently removing this is resulting in ZHERK failures
if (PASTEMAC(c, eq0)(*alpha))
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
{
// Expert interface of setv is invoked when alpha is zero
scomplex *zero = PASTEMAC(c, 0);
scomplex *zero = bli_c0;
/* When alpha is zero all the element in x are set to zero */
PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
@@ -848,6 +866,8 @@ void bli_cscalv_zen_int
return;
}
dim_t n0 = bli_abs(n);
dim_t i = 0;
scomplex alpha_conj;
float *x0 = (float *)x;
@@ -897,7 +917,7 @@ void bli_cscalv_zen_int
and then store
*/
for (; (i + 15) < n; i += 16)
for (; (i + 15) < n0; i += 16)
{
x_vec_ymm[0] = _mm256_loadu_ps(x0);
x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
@@ -927,7 +947,7 @@ void bli_cscalv_zen_int
x0 += 4 * n_elem_per_reg;
}
for (; (i + 7) < n; i += 8)
for (; (i + 7) < n0; i += 8)
{
x_vec_ymm[0] = _mm256_loadu_ps(x0);
x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
@@ -947,7 +967,7 @@ void bli_cscalv_zen_int
x0 += 2 * n_elem_per_reg;
}
for (; (i + 3) < n; i += 4)
for (; (i + 3) < n0; i += 4)
{
x_vec_ymm[0] = _mm256_loadu_ps(x0);
@@ -969,7 +989,7 @@ void bli_cscalv_zen_int
_mm256_zeroupper();
}
for (; i < n; i++)
for (; i < n0; i++)
{
float x_real, x_imag;
x_real = real * (*x0) - imag * (*(x0 + 1));
@@ -991,24 +1011,14 @@ void bli_zscalv_zen_int
cntx_t* restrict cntx
)
{
/*
Undefined behaviour
-------------------
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
1. This layer is not BLAS complaint and the kernel results in
undefined behaviour when n <= 0 and incx <= 1. The expectation
is that the application/higher-layer invoking this layer should
the arg checks.
*/
// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
// return;
// To Do: This call to SETV needs to be removed for BLAS compliance
// Currently removing this is resulting in ZHERK failures
if (PASTEMAC(z, eq0)(*alpha))
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
{
// Expert interface of setv is invoked when alpha is zero
dcomplex *zero = PASTEMAC(z, 0);
dcomplex *zero = bli_z0;
/* When alpha is zero all the element in x are set to zero */
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
@@ -1023,6 +1033,8 @@ void bli_zscalv_zen_int
return;
}
dim_t n0 = bli_abs(n);
dim_t i = 0;
dcomplex alpha_conj;
double *x0 = (double *)x;
@@ -1033,8 +1045,8 @@ void bli_zscalv_zen_int
double real = alpha_conj.real;
double imag = alpha_conj.imag;
/*When incx is 1 and n >= 2 it is possible to use AVX2 instructions*/
if (incx == 1 && n >= 2)
/*When incx is 1 and n0 >= 2 it is possible to use AVX2 instructions*/
if (incx == 1 && n0 >= 2)
{
dim_t const n_elem_per_reg = 4;
@@ -1072,7 +1084,7 @@ void bli_zscalv_zen_int
and then store
*/
for (; (i + 7) < n; i += 8)
for (; (i + 7) < n0; i += 8)
{
x_vec_ymm[0] = _mm256_loadu_pd(x0);
x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
@@ -1106,7 +1118,7 @@ void bli_zscalv_zen_int
x0 += 4 * n_elem_per_reg;
}
for (; (i + 3) < n; i += 4)
for (; (i + 3) < n0; i += 4)
{
x_vec_ymm[0] = _mm256_loadu_pd(x0);
x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
@@ -1126,7 +1138,7 @@ void bli_zscalv_zen_int
x0 += 2 * n_elem_per_reg;
}
for (; (i + 1) < n; i += 2)
for (; (i + 1) < n0; i += 2)
{
x_vec_ymm[0] = _mm256_loadu_pd(x0);
@@ -1155,7 +1167,7 @@ void bli_zscalv_zen_int
alpha_real_xmm = _mm_set1_pd(real);
alpha_imag_xmm = _mm_set1_pd(imag);
for (; i < n; i++)
for (; i < n0; i++)
{
x_vec_xmm = _mm_loadu_pd(x0);

View File

@@ -61,13 +61,14 @@
Deviation from BLAS
--------------------
None
Setv is used when alpha=0 unless a negative value of n is supplied.
This only occurs in calls from BLAS and CBLAS scal APIs.
Undefined behaviour
-------------------
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
is that these are standard BLAS exceptions and should be handled in a higher layer.
None
*/
void bli_sscalv_zen_int_avx512
(
@@ -78,6 +79,30 @@ void bli_sscalv_zen_int_avx512
cntx_t *restrict cntx
)
{
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
{
float *zero = bli_s0;
if (cntx == NULL) cntx = bli_gks_query_cntx();
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SETV_KER, cntx);
f
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx
);
return;
}
dim_t n0 = bli_abs(n);
dim_t i = 0;
float *restrict x0 = x;
@@ -89,7 +114,7 @@ void bli_sscalv_zen_int_avx512
__m512 xv[8], alphav;
alphav = _mm512_set1_ps(*alpha);
for (i = 0; (i + 127) < n; i += 128)
for (i = 0; (i + 127) < n0; i += 128)
{
// Loading the input values
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -125,7 +150,7 @@ void bli_sscalv_zen_int_avx512
x0 += 8 * n_elem_per_reg;
}
for (; (i + 63) < n; i += 64)
for (; (i + 63) < n0; i += 64)
{
// Loading the input values
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -147,7 +172,7 @@ void bli_sscalv_zen_int_avx512
x0 += 4 * n_elem_per_reg;
}
for (; (i + 31) < n; i += 32)
for (; (i + 31) < n0; i += 32)
{
// Loading the input values
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -163,7 +188,7 @@ void bli_sscalv_zen_int_avx512
x0 += 2 * n_elem_per_reg;
}
for (; (i + 15) < n; i += 16)
for (; (i + 15) < n0; i += 16)
{
// Loading the input values
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
@@ -176,7 +201,7 @@ void bli_sscalv_zen_int_avx512
x0 += n_elem_per_reg;
}
for (; (i + 7) < n; i += 8)
for (; (i + 7) < n0; i += 8)
{
// Loading the input values
__m256 x_vec = _mm256_loadu_ps(x0);
@@ -198,7 +223,7 @@ void bli_sscalv_zen_int_avx512
*/
_mm256_zeroupper();
for (; (i + 3) < n; i += 4)
for (; (i + 3) < n0; i += 4)
{
// Loading the input values
__m128 x_vec = _mm_loadu_ps(x0);
@@ -215,7 +240,7 @@ void bli_sscalv_zen_int_avx512
const float alphac = *alpha;
for (; i < n; ++i)
for (; i < n0; ++i)
{
*x0 *= alphac;
@@ -252,13 +277,14 @@ void bli_sscalv_zen_int_avx512
Deviation from BLAS
--------------------
None
Setv is used when alpha=0 unless a negative value of n is supplied.
This only occurs in calls from BLAS and CBLAS scal APIs.
Undefined behaviour
-------------------
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
is that these are standard BLAS exceptions and should be handled in a higher layer.
None
*/
BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
(
@@ -270,11 +296,10 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
)
{
// If the vector dimension is zero, or if alpha is unit, return early.
if (bli_zero_dim1(n) || PASTEMAC(d, eq1)(*alpha))
return;
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
// If alpha is zero, use setv.
if (PASTEMAC(d, eq0)(*alpha))
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
{
double *zero = bli_d0;
if (cntx == NULL) cntx = bli_gks_query_cntx();
@@ -292,6 +317,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
return;
}
dim_t n0 = bli_abs(n);
dim_t i = 0;
double *restrict x0;
@@ -307,7 +334,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
alphav = _mm512_set1_pd(*alpha);
__m512d xv[8];
for (i = 0; (i + 63) < n; i += 64)
for (i = 0; (i + 63) < n0; i += 64)
{
// Loading the input values
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -343,7 +370,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
x0 += 8 * n_elem_per_reg;
}
for (; (i + 31) < n; i += 32)
for (; (i + 31) < n0; i += 32)
{
// Loading the input values
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -365,7 +392,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
x0 += 4 * n_elem_per_reg;
}
for (; (i + 15) < n; i += 16)
for (; (i + 15) < n0; i += 16)
{
// Loading the input values
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -381,7 +408,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
x0 += 2 * n_elem_per_reg;
}
for (; (i + 7) < n; i += 8)
for (; (i + 7) < n0; i += 8)
{
// Loading the input values
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
@@ -394,7 +421,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
x0 += n_elem_per_reg;
}
for (; (i + 3) < n; i += 4)
for (; (i + 3) < n0; i += 4)
{
// Loading the input values
__m256d x_vec = _mm256_loadu_pd(x0);
@@ -416,7 +443,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
*/
_mm256_zeroupper();
for (; (i + 1) < n; i += 2)
for (; (i + 1) < n0; i += 2)
{
// Loading the input values
__m128d x_vec = _mm_loadu_pd(x0);
@@ -433,7 +460,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
const double alphac = *alpha;
for (; i < n; ++i)
for (; i < n0; ++i)
{
*x0 *= alphac;
@@ -468,13 +495,14 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
Deviation from BLAS
--------------------
None
Setv is used when alpha=0 unless a negative value of n is supplied.
This only occurs in calls from BLAS and CBLAS scal APIs.
Undefined behaviour
-------------------
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
is that these are standard BLAS exceptions and should be handled in a higher layer.
None
*/
void bli_zdscalv_zen_int_avx512
(
@@ -491,6 +519,31 @@ void bli_zdscalv_zen_int_avx512
alpha is passed as double complex to adhere
to function pointer definition in BLIS
*/
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
{
// Expert interface of setv is invoked when alpha is zero
dcomplex *zero = bli_z0;
/* When alpha is zero all the element in x are set to zero */
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx,
NULL);
return;
}
dim_t n0 = bli_abs(n);
const double alphac = (*alpha).real;
dim_t i = 0;
@@ -504,7 +557,7 @@ void bli_zdscalv_zen_int_avx512
alphav = _mm512_set1_pd(alphac);
for (; (i + 15) < n; i += 16)
for (; (i + 15) < n0; i += 16)
{
xv[0] = _mm512_loadu_pd(x0);
xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
@@ -524,7 +577,7 @@ void bli_zdscalv_zen_int_avx512
x0 += 4 * n_elem_per_reg;
}
for (; (i + 7) < n; i += 8)
for (; (i + 7) < n0; i += 8)
{
xv[0] = _mm512_loadu_pd(x0);
xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
@@ -538,7 +591,7 @@ void bli_zdscalv_zen_int_avx512
x0 += 2 * n_elem_per_reg;
}
for (; (i + 3) < n; i += 4)
for (; (i + 3) < n0; i += 4)
{
xv[0] = _mm512_loadu_pd(x0);
@@ -549,7 +602,7 @@ void bli_zdscalv_zen_int_avx512
x0 += n_elem_per_reg;
}
for (; (i + 1) < n; i += 2)
for (; (i + 1) < n0; i += 2)
{
__m256d xv = _mm256_loadu_pd(x0);
@@ -576,7 +629,7 @@ void bli_zdscalv_zen_int_avx512
alpha_reg = _mm_set1_pd((*alpha).real);
for (; i < n; ++i)
for (; i < n0; ++i)
{
x_vec = _mm_loadu_pd(x0);
@@ -674,8 +727,8 @@ void bli_zdscalv_zen_int_avx512
Undefined behaviour
-------------------
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
is that these are standard BLAS exceptions and should be handled in a higher layer.
None
*/
void bli_cscalv_zen_int_avx512
(
@@ -689,14 +742,11 @@ void bli_cscalv_zen_int_avx512
// If the vector dimension is zero, or if alpha is unit, return early.
if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;
/**
* @note Currently this kernel is not BLAS compliant. For BLAS compliance,
* the below call to SETV needs to be removed.
*/
if ( PASTEMAC(c,eq0)(*alpha) )
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
{
// Expert interface of setv is invoked when alpha is zero
scomplex *zero = PASTEMAC(c,0);
scomplex *zero = bli_c0;
/* When alpha is zero all the element in x are set to zero */
PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
@@ -712,6 +762,8 @@ void bli_cscalv_zen_int_avx512
return;
}
dim_t n0 = bli_abs(n);
dim_t i = 0;
scomplex alpha_conj;
float* restrict x0 = (float*) x;
@@ -760,7 +812,7 @@ void bli_cscalv_zen_int_avx512
*/
// Processing 96 scomplex elements (192 floats) per iteration
for ( ; (i + 95) < n; i += 96 )
for ( ; (i + 95) < n0; i += 96 )
{
__m512 xv[12], inter[12];
@@ -776,7 +828,7 @@ void bli_cscalv_zen_int_avx512
}
// Processing 64 scomplex elements (128 floats) per iteration
for ( ; (i + 63) < n; i += 64 )
for ( ; (i + 63) < n0; i += 64 )
{
__m512 xv[8], inter[8];
@@ -790,7 +842,7 @@ void bli_cscalv_zen_int_avx512
}
// Processing 32 scomplex elements (64 floats) per iteration
for ( ; (i + 31) < n; i += 32 )
for ( ; (i + 31) < n0; i += 32 )
{
__m512 xv[4], inter[4];
@@ -802,7 +854,7 @@ void bli_cscalv_zen_int_avx512
}
// Processing 16 scomplex elements (32 floats) per iteration
for ( ; (i + 15) < n; i += 16 )
for ( ; (i + 15) < n0; i += 16 )
{
__m512 xv[2], inter[2];
@@ -842,7 +894,7 @@ void bli_cscalv_zen_int_avx512
}
// Processing 8 scomplex elements (16 floats) per iteration
for ( ; (i + 7) < n; i += 8 )
for ( ; (i + 7) < n0; i += 8 )
{
__m512 xv[1], inter[1];
@@ -877,21 +929,23 @@ void bli_cscalv_zen_int_avx512
}
// Processing remaining elements, if any.
if ( i < n ) {
if ( i < n0 )
{
// Setting the mask bit based on remaining elements.
// Since each scomplex element corresponds to 2 floats,
// we need to load and store 2*(n-i) elements.
// we need to load and store 2*(n0-i) elements.
__mmask16 mask = ( 1 << ( 2 * ( n - i ) ) ) - 1;
__mmask16 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;
__m512 xv, temp;
__m512 xv, inter;
xv = _mm512_maskz_loadu_ps( mask, x0 );
inter = _mm512_permute_ps( xv, 0xB1 );
temp = _mm512_permute_ps( xv, 0xB1 );
inter = _mm512_mul_ps( alphaIv, inter );
temp = _mm512_mul_ps( alphaIv, temp );
xv = _mm512_fmaddsub_ps( alphaRv, xv, inter );
xv = _mm512_fmaddsub_ps( alphaRv, xv, temp );
_mm512_mask_storeu_ps( x0, mask, xv );
}
@@ -902,7 +956,7 @@ void bli_cscalv_zen_int_avx512
const float alphaI = alpha_conj.imag;
float x0R, x0I;
for (; i < n; ++i)
for (; i < n0; ++i)
{
x0R = *(x0);
x0I = *(x0 + 1);
@@ -942,13 +996,14 @@ void bli_cscalv_zen_int_avx512
Deviation from BLAS
--------------------
None
Setv is used when alpha=0 unless a negative value of n is supplied.
This only occurs in calls from BLAS and CBLAS scal APIs.
Undefined behaviour
-------------------
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
is that these are standard BLAS exceptions and should be handled in a higher layer.
None
*/
void bli_zscalv_zen_int_avx512
(
@@ -960,17 +1015,13 @@ void bli_zscalv_zen_int_avx512
)
{
// If the vector dimension is zero, or if alpha is unit, return early.
if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
return;
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
/**
* @note Currently this kernel is not BLAS compliant. For BLAS compliance,
* the below call to SETV needs to be removed.
*/
if (PASTEMAC(z, eq0)(*alpha))
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
if (PASTEMAC(z,eq0)( *alpha ) && n > 0 )
{
// Expert interface of setv is invoked when alpha is zero
dcomplex *zero = PASTEMAC(z, 0);
dcomplex *zero = bli_z0;
/* When alpha is zero all the element in x are set to zero */
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
@@ -985,6 +1036,8 @@ void bli_zscalv_zen_int_avx512
return;
}
dim_t n0 = bli_abs(n);
dim_t i = 0;
dcomplex alpha_conj;
double *restrict x0 = (double *)x;
@@ -1022,7 +1075,7 @@ void bli_zscalv_zen_int_avx512
*/
// Processing 48 dcomplex elements per iteration.
for (; (i + 47) < n; i += 48)
for (; (i + 47) < n0; i += 48)
{
__m512d xv[12], temp[12];
@@ -1116,7 +1169,7 @@ void bli_zscalv_zen_int_avx512
}
// Processing 32 dcomplex elements per iteration.
for (; (i + 31) < n; i += 32)
for (; (i + 31) < n0; i += 32)
{
__m512d xv[8], temp[8];
xv[0] = _mm512_loadu_pd(x0);
@@ -1173,7 +1226,7 @@ void bli_zscalv_zen_int_avx512
}
// Processing 16 dcomplex elements per iteration.
for (; (i + 15) < n; i += 16)
for (; (i + 15) < n0; i += 16)
{
__m512d xv[4], temp[4];
xv[0] = _mm512_loadu_pd(x0);
@@ -1205,7 +1258,7 @@ void bli_zscalv_zen_int_avx512
}
// Processing 8 dcomplex elements per iteration.
for (; (i + 7) < n; i += 8)
for (; (i + 7) < n0; i += 8)
{
__m512d xv[2], temp[2];
xv[0] = _mm512_loadu_pd(x0);
@@ -1227,7 +1280,7 @@ void bli_zscalv_zen_int_avx512
}
// Processing 4 dcomplex elements per iteration.
for (; (i + 3) < n; i += 4)
for (; (i + 3) < n0; i += 4)
{
__m512d xv, temp;
xv = _mm512_loadu_pd(x0);
@@ -1244,23 +1297,24 @@ void bli_zscalv_zen_int_avx512
}
// Processing the remainder elements.
if( i < n )
if( i < n0 )
{
// Setting the mask bit based on remaining elements
// Since each dcomplex elements corresponds to 2 doubles
// we need to load and store 2*(m-i) elements.
__mmask8 mask = (1 << (2 * (n-i)) ) - 1;
// we need to load and store 2*(n0-i) elements.
__mmask8 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;
__m512d xv, temp, zero;
zero = _mm512_setzero_pd();
xv = _mm512_mask_loadu_pd( zero, mask, x0 );
temp = _mm512_permute_pd(xv, 0x55);
temp = _mm512_permute_pd( xv, 0x55 );
temp = _mm512_mul_pd(alphaIv, temp);
temp = _mm512_mul_pd( alphaIv, temp );
xv = _mm512_fmaddsub_pd(alphaRv, xv, temp);
xv = _mm512_fmaddsub_pd( alphaRv, xv, temp );
_mm512_mask_storeu_pd( x0, mask, xv );
}
@@ -1272,7 +1326,7 @@ void bli_zscalv_zen_int_avx512
alphaRv = _mm_loaddup_pd(&alphaR);
alphaIv = _mm_loaddup_pd(&alphaI);
for (; i < n; ++i)
for (; i < n0; ++i)
{
x_vec = _mm_loadu_pd(x0);

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -52,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
\
/* If alpha is zero, use setv. */ \
if ( PASTEMAC(ch,eq0)( *alpha ) ) \
if ( PASTEMAC(ch,eq0)( *alpha ) && n > 0) \
{ \
ctype* zero = PASTEMAC(ch,0); \
\
@@ -70,6 +71,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
); \
return; \
} \
\
dim_t n0 = bli_abs(n); \
\
ctype alpha_conj; \
\
@@ -78,14 +81,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
if ( incx == 1 ) \
{ \
PRAGMA_SIMD \
for ( dim_t i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n0; ++i ) \
{ \
PASTEMAC(ch,scals)( alpha_conj, x[i] ); \
} \
} \
else \
{ \
for ( dim_t i = 0; i < n; ++i ) \
for ( dim_t i = 0; i < n0; ++i ) \
{ \
PASTEMAC(ch,scals)( alpha_conj, *x ); \
\