mirror of
https://github.com/amd/blis.git
synced 2026-07-03 05:37:51 +00:00
SCALV alpha=zero BLAS compliance
SCALV is used directly by BLAS, CBLAS and BLIS scal{v} APIs but
also within many other APIs to handle special cases. In general
it is preferred to use SETV when alpha=0, but BLAS and CBLAS
continue to multiply all vector elements by alpha. This has
different behaviour for propagating NaNs or Infs.
Changes in this commit:
- Standardize early returns from SCALV reference and optimized
kernels.
- User supplied N<0 is handled at the top level API layer. Use
negative values of N in kernel calls to signify that SETV
should _not_ be used when alpha=0. This should only be
required in SCALV.
- Include serial threshold in zdscal (as in dscal) to reduce
overhead for small problem sizes.
- Code tidying to make different variants more consistent.
- More standardization of tests in SCALV gtestsuite programs.
- Remove scalv_extreme_cases.cpp as it is now redundant.
AMD-Internal: [CPUPL-4415]
Change-Id: I42e98875ceaea224cc98d0cdfe0133c9abc3edae
(cherry picked from commit a07e041b1f)
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -40,7 +40,7 @@
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#undef GENTFUNCSCAL
|
||||
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
|
||||
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
|
||||
\
|
||||
void PASTEF772S(chx,cha,blasname) \
|
||||
( \
|
||||
@@ -50,44 +50,42 @@ void PASTEF772S(chx,cha,blasname) \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
dim_t n0; \
|
||||
ftype_x* x0; \
|
||||
inc_t incx0; \
|
||||
ftype_x alpha_cast; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
if (*n == 0 || alpha == NULL) { \
|
||||
dim_t n0 = (dim_t)(*n); \
|
||||
ftype_x *x0 = x; \
|
||||
inc_t incx0 = (inc_t)(*incx); \
|
||||
\
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
return ; \
|
||||
} \
|
||||
\
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
|
||||
\
|
||||
/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
|
||||
that is, we just always sub-optimally implement those cases
|
||||
by casting alpha to ctype_x (potentially the complex domain) and
|
||||
using the homogeneous datatype instance according to that type. */ \
|
||||
ftype_x alpha_cast; \
|
||||
PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
|
||||
PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
-n0, \
|
||||
&alpha_cast, \
|
||||
x0, incx0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}\
|
||||
|
||||
@@ -50,13 +50,16 @@
|
||||
|
||||
1. When alpha == NaN - Propagate the NaN to the vector
|
||||
2. When alpha == 0 - Perform the SCALV operation completely and don't use setv.
|
||||
As SCALV kernels are used in many other BLAS APIs where we want setv to be
|
||||
used in this scenario, here we call the kernels with n=-n to signify that
|
||||
setv should not be used.
|
||||
*/
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
#undef GENTFUNCSCAL
|
||||
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
|
||||
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, chau, blasname, blisname ) \
|
||||
\
|
||||
void PASTEF772S(chx,cha,blasname) \
|
||||
( \
|
||||
@@ -66,55 +69,42 @@ void PASTEF772S(chx,cha,blasname) \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
dim_t n0; \
|
||||
ftype_x* x0; \
|
||||
inc_t incx0; \
|
||||
ftype_x alpha_cast; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
if (*n == 0 || alpha == NULL) { \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
return ; \
|
||||
} \
|
||||
dim_t n0 = (dim_t)(*n); \
|
||||
ftype_x *x0 = x; \
|
||||
inc_t incx0 = (inc_t)(*incx); \
|
||||
\
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* If the input increments are less than or equal to zero, return. */ \
|
||||
if ( (*incx) <= 0 ) { \
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
return ; \
|
||||
} else { \
|
||||
incx0 = ( inc_t )(*incx); \
|
||||
x0 = (x); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS.
|
||||
that is, we just always sub-optimally implement those cases
|
||||
by casting alpha to ctype_x (potentially the complex domain) and
|
||||
using the homogeneous datatype instance according to that type. */ \
|
||||
ftype_x alpha_cast; \
|
||||
PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \
|
||||
\
|
||||
/* If alpha is a unit scalar, return early. */ \
|
||||
if ( PASTEMAC(c, eq1)(alpha_cast) ) { \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
return ; \
|
||||
} \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
/* Pass size as negative to stipulate don't use SETV when alpha=0 */ \
|
||||
PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
-n0, \
|
||||
&alpha_cast, \
|
||||
x0, incx0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}\
|
||||
@@ -139,82 +129,72 @@ void sscal_blis_impl
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', (void *) alpha, *n, *incx );
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
inc_t incx0;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
//bli_init_auto();
|
||||
|
||||
if ((*n) <= 0 || alpha == NULL || bli_seq1(*alpha))
|
||||
dim_t n0 = (dim_t)(*n);
|
||||
float *x0 = x;
|
||||
inc_t incx0 = (inc_t)(*incx);
|
||||
|
||||
/*
|
||||
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
|
||||
Return early when alpha pointer is NULL - BLIS exception
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are less than or equal to zero, return. */
|
||||
if ( (*incx) <= 0 )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return ;
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
// Definition of function pointer
|
||||
sscalv_ker_ft scalv_ker_ptr;
|
||||
|
||||
cntx_t *cntx = NULL;
|
||||
|
||||
// Query the architecture ID
|
||||
arch_t id = bli_arch_query_id();
|
||||
|
||||
/*
|
||||
Function pointer declaration for the function
|
||||
that will be used by this API
|
||||
*/
|
||||
sscalv_ker_ft scalv_ker_ptr; // DSCALV
|
||||
|
||||
// Pick the kernel based on the architecture ID
|
||||
switch (id)
|
||||
{
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
scalv_ker_ptr = bli_sscalv_zen_int_avx512;
|
||||
|
||||
break;
|
||||
scalv_ker_ptr = bli_sscalv_zen_int_avx512;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
scalv_ker_ptr = bli_sscalv_zen_int10;
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
scalv_ker_ptr = bli_sscalv_zen_int10;
|
||||
break;
|
||||
|
||||
break;
|
||||
default:
|
||||
default:
|
||||
|
||||
// For non-Zen architectures, query the context
|
||||
cntx = bli_gks_query_cntx();
|
||||
// For non-Zen architectures, query the context
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
// Query the context for the kernel function pointers for sscalv
|
||||
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
|
||||
// Query the context for the kernel function pointers for sscalv
|
||||
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
|
||||
}
|
||||
|
||||
// Invoke the function based on the kernel function pointer
|
||||
// Pass size as negative to stipulate don't use SETV when alpha=0
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
-n0,
|
||||
(float *)alpha,
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
}
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void sscal_
|
||||
@@ -224,7 +204,7 @@ void sscal_
|
||||
float* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
sscal_blis_impl( n, alpha, x, incx );
|
||||
sscal_blis_impl( n, alpha, x, incx );
|
||||
}
|
||||
#endif
|
||||
void dscal_blis_impl
|
||||
@@ -236,65 +216,54 @@ void dscal_blis_impl
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
|
||||
dim_t n_elem;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
dim_t ST_THRESH = 30000;
|
||||
#endif
|
||||
double* x0;
|
||||
inc_t incx0;
|
||||
|
||||
/* Initialize BLIS */
|
||||
/* Initialize BLIS. */
|
||||
//bli_init_auto();
|
||||
|
||||
/* Convert typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n_elem = ( dim_t )0;
|
||||
else n_elem = ( dim_t )(*n);
|
||||
dim_t n0 = (dim_t)(*n);
|
||||
double *x0 = x;
|
||||
inc_t incx0 = (inc_t)(*incx);
|
||||
|
||||
/*
|
||||
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
|
||||
Return early when alpha pointer is NULL - BLIS exception
|
||||
*/
|
||||
if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha))
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
/* If the input increments are less than or equal to zero, return. */
|
||||
if ( (*incx) <= 0 )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return ;
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = (x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// Definition of function pointer
|
||||
// Definition of function pointer
|
||||
dscalv_ker_ft scalv_ker_ptr;
|
||||
|
||||
cntx_t *cntx = NULL;
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
dim_t ST_THRESH = 30000;
|
||||
#endif
|
||||
|
||||
// Query the architecture ID
|
||||
arch_t arch_id_local = bli_arch_query_id();
|
||||
arch_t id = bli_arch_query_id();
|
||||
|
||||
// Pick the kernel based on the architecture ID
|
||||
switch (arch_id_local)
|
||||
switch (id)
|
||||
{
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_dscalv_zen_int_avx512;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
ST_THRESH = 30000;
|
||||
ST_THRESH = 30000;
|
||||
#endif
|
||||
break;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
scalv_ker_ptr = bli_dscalv_zen_int10;
|
||||
@@ -303,9 +272,9 @@ void dscal_blis_impl
|
||||
#endif
|
||||
break;
|
||||
|
||||
default:
|
||||
default:
|
||||
|
||||
// Query the context
|
||||
// For non-Zen architectures, query the context
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
// Query the function pointer using the context
|
||||
@@ -315,25 +284,28 @@ void dscal_blis_impl
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
/*
|
||||
If the optimial number of threads is 1, the OpenMP and
|
||||
'bli_nthreads_l1'overheads are avoided by calling the
|
||||
If the optimal number of threads is 1, the OpenMP and
|
||||
'bli_nthreads_l1' overheads are avoided by calling the
|
||||
function directly. This ensures that performance of dscalv
|
||||
does not drop for single thread when OpenMP is enabled.
|
||||
*/
|
||||
if (n_elem <= ST_THRESH)
|
||||
if (n0 <= ST_THRESH)
|
||||
{
|
||||
#endif
|
||||
// Invoke the function based on the kernel function pointer
|
||||
// Pass size as negative to stipulate don't use SETV when alpha=0
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
-n0,
|
||||
(double *)alpha,
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
return;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
}
|
||||
@@ -354,14 +326,14 @@ void dscal_blis_impl
|
||||
BLIS_SCALV_KER,
|
||||
BLIS_DOUBLE,
|
||||
BLIS_DOUBLE,
|
||||
arch_id_local,
|
||||
n_elem,
|
||||
id,
|
||||
n0,
|
||||
&nt
|
||||
);
|
||||
|
||||
_Pragma("omp parallel num_threads(nt)")
|
||||
{
|
||||
dim_t start, end, length;
|
||||
dim_t start, end, length;
|
||||
thrinfo_t thrinfo_vec;
|
||||
|
||||
// The block size is the minimum factor, whose multiple will ensure that only
|
||||
@@ -383,7 +355,7 @@ void dscal_blis_impl
|
||||
bli_thread_range_sub
|
||||
(
|
||||
&thrinfo_vec,
|
||||
n_elem,
|
||||
n0,
|
||||
block_size,
|
||||
FALSE,
|
||||
&start,
|
||||
@@ -396,22 +368,21 @@ void dscal_blis_impl
|
||||
double *x_thread_local = x0 + (start * incx0);
|
||||
|
||||
// Invoke the function based on the kernel function pointer
|
||||
// Pass size as negative to stipulate don't use SETV when alpha=0
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
length,
|
||||
-length,
|
||||
(double *)alpha,
|
||||
x_thread_local, incx0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
#endif
|
||||
|
||||
}
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void dscal_
|
||||
@@ -421,7 +392,7 @@ void dscal_
|
||||
double* x, const f77_int* incx
|
||||
)
|
||||
{
|
||||
dscal_blis_impl( n, alpha, x, incx );
|
||||
dscal_blis_impl( n, alpha, x, incx );
|
||||
}
|
||||
#endif
|
||||
void zdscal_blis_impl
|
||||
@@ -433,19 +404,23 @@ void zdscal_blis_impl
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *) alpha, *n, *incx );
|
||||
dim_t n_elem = (dim_t)(*n);
|
||||
dcomplex* x0 = x;
|
||||
inc_t incx0 = (inc_t)(*incx);
|
||||
|
||||
/* Initialize BLIS. */
|
||||
//bli_init_auto();
|
||||
|
||||
dim_t n0 = (dim_t)(*n);
|
||||
dcomplex* x0 = x;
|
||||
inc_t incx0 = (inc_t)(*incx);
|
||||
|
||||
/*
|
||||
When n is zero or the alpha pointer passed is null
|
||||
or the incx is zero or alpha is 1, return early.
|
||||
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
|
||||
Return early when alpha pointer is NULL - BLIS exception
|
||||
*/
|
||||
if ((n_elem <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -458,30 +433,34 @@ void zdscal_blis_impl
|
||||
|
||||
cntx_t *cntx = NULL;
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
dim_t ST_THRESH = 10000;
|
||||
#endif
|
||||
|
||||
// Query the architecture ID
|
||||
arch_t arch_id_local = bli_arch_query_id();
|
||||
arch_t id = bli_arch_query_id();
|
||||
|
||||
// Pick the kernel based on the architecture ID
|
||||
switch (arch_id_local)
|
||||
switch (id)
|
||||
{
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_zdscalv_zen_int_avx512;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
scalv_ker_ptr = bli_zdscalv_zen_int10;
|
||||
break;
|
||||
|
||||
default:
|
||||
default:
|
||||
|
||||
// Query the context
|
||||
// For non-Zen architectures, query the context
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
// Query the function pointer using the context
|
||||
@@ -489,6 +468,32 @@ void zdscal_blis_impl
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
/*
|
||||
If the optimal number of threads is 1, the OpenMP and
|
||||
'bli_nthreads_l1' overheads are avoided by calling the
|
||||
function directly. This ensures that performance of zdscalv
|
||||
does not drop for single thread when OpenMP is enabled.
|
||||
*/
|
||||
if (n0 <= ST_THRESH)
|
||||
{
|
||||
#endif
|
||||
// Invoke the function based on the kernel function pointer
|
||||
// Pass size as negative to stipulate don't use SETV when alpha=0
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
-n0,
|
||||
(dcomplex *)&alpha_cast,
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
return;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
}
|
||||
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
@@ -506,33 +511,11 @@ void zdscal_blis_impl
|
||||
BLIS_SCALV_KER,
|
||||
BLIS_DCOMPLEX,
|
||||
BLIS_DOUBLE,
|
||||
arch_id_local,
|
||||
n_elem,
|
||||
id,
|
||||
n0,
|
||||
&nt
|
||||
);
|
||||
|
||||
/*
|
||||
If the number of optimum threads is 1, the OpenMP overhead
|
||||
is avoided by calling the function directly
|
||||
*/
|
||||
if (nt == 1)
|
||||
{
|
||||
#endif
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
(dcomplex *)&alpha_cast,
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
|
||||
return;
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
}
|
||||
|
||||
_Pragma("omp parallel num_threads(nt)")
|
||||
{
|
||||
dim_t start, length;
|
||||
@@ -549,7 +532,7 @@ void zdscal_blis_impl
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
(
|
||||
n_elem,
|
||||
n0,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
@@ -559,18 +542,21 @@ void zdscal_blis_impl
|
||||
dcomplex *x_thread_local = x0 + (start * incx0);
|
||||
|
||||
// Invoke the function based on the kernel function pointer
|
||||
// Pass size as negative to stipulate don't use SETV when alpha=0
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
length,
|
||||
-length,
|
||||
(dcomplex *)&alpha_cast,
|
||||
x_thread_local, incx0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
#endif
|
||||
}
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void zdscal_
|
||||
@@ -594,22 +580,27 @@ void cscal_blis_impl
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'C', (void *)alpha, *n, *incx);
|
||||
|
||||
/* Initialize BLIS. */
|
||||
//bli_init_auto();
|
||||
|
||||
dim_t n0 = (dim_t)(*n);
|
||||
scomplex *x0 = x;
|
||||
inc_t incx0 = (inc_t)(*incx);
|
||||
|
||||
/*
|
||||
When n is zero or the alpha pointer passed is null
|
||||
or the incx is zero or alpha is 1, return early.
|
||||
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
|
||||
Return early when alpha pointer is NULL - BLIS exception
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
// Definition of function pointer
|
||||
cscalv_ker_ft scalv_fun_ptr;
|
||||
cscalv_ker_ft scalv_ker_ptr;
|
||||
|
||||
cntx_t* cntx = NULL;
|
||||
|
||||
@@ -622,40 +613,42 @@ void cscal_blis_impl
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
scalv_fun_ptr = bli_cscalv_zen_int_avx512;
|
||||
break;
|
||||
// AVX512 Kernel
|
||||
scalv_ker_ptr = bli_cscalv_zen_int_avx512;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
scalv_fun_ptr = bli_cscalv_zen_int;
|
||||
break;
|
||||
// AVX2 Kernel
|
||||
scalv_ker_ptr = bli_cscalv_zen_int;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
// Query the context
|
||||
// For non-Zen architectures, query the context
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
// Query the function pointer using the context
|
||||
scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
|
||||
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_SCOMPLEX, BLIS_SCALV_KER, cntx);
|
||||
}
|
||||
|
||||
// Call the function based on the function pointer assigned above
|
||||
scalv_fun_ptr
|
||||
// Invoke the function based on the kernel function pointer
|
||||
// Pass size as negative to stipulate don't use SETV when alpha=0
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
-n0,
|
||||
(scomplex*) alpha,
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void cscal_
|
||||
(
|
||||
@@ -678,22 +671,27 @@ void zscal_blis_impl
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx);
|
||||
|
||||
/* Initialize BLIS. */
|
||||
//bli_init_auto();
|
||||
|
||||
dim_t n0 = (dim_t)(*n);
|
||||
dcomplex *x0 = x;
|
||||
inc_t incx0 = (inc_t)(*incx);
|
||||
|
||||
/*
|
||||
When n is zero or the alpha pointer passed is null
|
||||
or the incx is zero or alpha is 1, return early.
|
||||
Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
|
||||
Return early when alpha pointer is NULL - BLIS exception
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
// Definition of function pointer
|
||||
zscalv_ker_ft scalv_fun_ptr;
|
||||
zscalv_ker_ft scalv_ker_ptr;
|
||||
|
||||
cntx_t* cntx = NULL;
|
||||
|
||||
@@ -707,7 +705,7 @@ void zscal_blis_impl
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
// AVX512 Kernel
|
||||
scalv_fun_ptr = bli_zscalv_zen_int_avx512;
|
||||
scalv_ker_ptr = bli_zscalv_zen_int_avx512;
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
@@ -715,29 +713,32 @@ void zscal_blis_impl
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
// AVX2 Kernel
|
||||
scalv_fun_ptr = bli_zscalv_zen_int;
|
||||
scalv_ker_ptr = bli_zscalv_zen_int;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
// Query the context
|
||||
// For non-Zen architectures, query the context
|
||||
cntx = bli_gks_query_cntx();
|
||||
|
||||
// Query the function pointer using the context
|
||||
scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
|
||||
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
|
||||
}
|
||||
|
||||
// Call the function based on the function pointer assigned above
|
||||
scalv_fun_ptr
|
||||
// Invoke the function based on the kernel function pointer
|
||||
// Pass size as negative to stipulate don't use SETV when alpha=0
|
||||
scalv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
-n0,
|
||||
(dcomplex*) alpha,
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
//bli_finalize_auto();
|
||||
}
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void zscal_
|
||||
@@ -751,4 +752,4 @@ void zscal_
|
||||
}
|
||||
#endif
|
||||
|
||||
GENTFUNCSCAL( scomplex, float, c, s, scal, scalv )
|
||||
GENTFUNCSCAL( scomplex, float, c, s, s, scal, scalv )
|
||||
|
||||
@@ -174,12 +174,12 @@ GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname )
|
||||
|
||||
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
|
||||
\
|
||||
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
|
||||
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
|
||||
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
|
||||
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
|
||||
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
|
||||
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )
|
||||
GENTFUNCSCAL( float, float, s, , s, blasname, blisname ) \
|
||||
GENTFUNCSCAL( double, double, d, , d, blasname, blisname ) \
|
||||
GENTFUNCSCAL( scomplex, scomplex, c, , c, blasname, blisname ) \
|
||||
GENTFUNCSCAL( dcomplex, dcomplex, z, , z, blasname, blisname ) \
|
||||
GENTFUNCSCAL( scomplex, float, c, s, s, blasname, blisname ) \
|
||||
GENTFUNCSCAL( dcomplex, double, z, d, d, blasname, blisname )
|
||||
|
||||
// --GEMMT specific kernels ----------------------------------------------------
|
||||
|
||||
|
||||
@@ -36,10 +36,10 @@
|
||||
#include "test_scalv.h"
|
||||
|
||||
class cscalvGeneric :
|
||||
public ::testing::TestWithParam<std::tuple<char,
|
||||
gtint_t,
|
||||
gtint_t,
|
||||
scomplex>> {};
|
||||
public ::testing::TestWithParam<std::tuple<char, // conj_alpha
|
||||
gtint_t, // n
|
||||
gtint_t, // incx
|
||||
scomplex>> {}; // alpha
|
||||
|
||||
|
||||
// Tests using random integers as vector elements.
|
||||
@@ -78,42 +78,140 @@ TEST_P( cscalvGeneric, API )
|
||||
test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
|
||||
}
|
||||
|
||||
// Black box testing for generic and main use of cscal.
|
||||
// Black box testing for generic use of cscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
Blackbox,
|
||||
unitPositiveIncrementSmall,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
#endif
|
||||
), // n: use x, c: use conj(x)
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(1)), // stride size for x
|
||||
::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}) // alpha
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
scomplex{-5.1, -7.3},
|
||||
scomplex{ 1.0, 1.0},
|
||||
scomplex{ 7.3, 5.1}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<scomplex, scomplex>())
|
||||
);
|
||||
|
||||
// Black box testing for generic use of cscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrementLarge,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
scomplex{-5.1, -7.3},
|
||||
scomplex{ 1.0, 1.0},
|
||||
scomplex{ 7.3, 5.1}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<scomplex, scomplex>())
|
||||
);
|
||||
|
||||
// Test for non-unit increments.
|
||||
// Only test very few cases as sanity check.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementSmall,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
scomplex{-5.1, -7.3},
|
||||
scomplex{ 1.0, 1.0},
|
||||
scomplex{ 7.3, 5.1}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<scomplex, scomplex>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementLarge,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
scomplex{-5.1, -7.3},
|
||||
scomplex{ 1.0, 1.0},
|
||||
scomplex{ 7.3, 5.1}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<scomplex, scomplex>())
|
||||
);
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// alpha=0 testing only for BLAS and CBLAS as
|
||||
// BLIS uses setv and won't propagate Inf and NaNs
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
alphaZero,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
scomplex{ 0.0, 0.0}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<scomplex, scomplex>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
|
||||
// Only test very few cases as sanity check since conj(x) = x for real types.
|
||||
// We can modify the values using implementation details.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NonUnitPositiveIncrements,
|
||||
conjalpha,
|
||||
cscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
#endif
|
||||
), // n: use x, c: use conj(x)
|
||||
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x
|
||||
::testing::Values(scomplex{4.0, 3.1}) // alpha
|
||||
::testing::Values('c'), // c: use conjugate
|
||||
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(1)), // stride size for x
|
||||
::testing::Values(scomplex{ 7.3, 5.1}) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<scomplex, scomplex>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// Test for negative increments.
|
||||
@@ -126,7 +224,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('n'), // n: use x, c: use conj(x)
|
||||
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
|
||||
::testing::Values(scomplex{4.0, 3.1}) // alpha
|
||||
::testing::Values(scomplex{ 7.3, 5.1}) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<scomplex, scomplex>())
|
||||
);
|
||||
|
||||
219
gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp
Normal file
219
gtestsuite/testsuite/level1/scalv/csscalv_generic.cpp
Normal file
@@ -0,0 +1,219 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "test_scalv.h"
|
||||
|
||||
class csscalvGeneric :
|
||||
public ::testing::TestWithParam<std::tuple<char, // conj_alpha
|
||||
gtint_t, // n
|
||||
gtint_t, // incx
|
||||
float>> {}; // alpha
|
||||
|
||||
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csscalvGeneric);
|
||||
|
||||
// Tests using random integers as vector elements.
|
||||
TEST_P( csscalvGeneric, API )
|
||||
{
|
||||
using T = scomplex;
|
||||
using U = float;
|
||||
//----------------------------------------------------------
|
||||
// Initialize values from the parameters passed through
|
||||
// test suite instantiation (INSTANTIATE_TEST_SUITE_P).
|
||||
//----------------------------------------------------------
|
||||
// denotes whether alpha or conj(alpha) will be used:
|
||||
char conj_alpha = std::get<0>(GetParam());
|
||||
// vector length:
|
||||
gtint_t n = std::get<1>(GetParam());
|
||||
// stride size for x:
|
||||
gtint_t incx = std::get<2>(GetParam());
|
||||
// alpha
|
||||
U alpha = std::get<3>(GetParam());
|
||||
|
||||
// Set the threshold for the errors:
|
||||
// Check gtestsuite scalv.h or netlib source code for reminder of the
|
||||
// functionality from which we estimate operation count per element
|
||||
// of output, and hence the multipler for epsilon.
|
||||
// No adjustment applied yet for complex data.
|
||||
double thresh;
|
||||
if (n == 0)
|
||||
thresh = 0.0;
|
||||
else if (alpha == testinghelpers::ZERO<U>() || alpha == testinghelpers::ONE<U>())
|
||||
thresh = 0.0;
|
||||
else
|
||||
thresh = testinghelpers::getEpsilon<T>();
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Call generic test body using those parameters
|
||||
//----------------------------------------------------------
|
||||
test_scalv<T, U>( conj_alpha, n, incx, alpha, thresh );
|
||||
}
|
||||
|
||||
// bli_csscal not present in BLIS
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
|
||||
// Black box testing for generic use of dscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrementSmall,
|
||||
csscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
float( 7.0),
|
||||
float(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
// Black box testing for generic use of dscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrementLarge,
|
||||
csscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
float( 7.0),
|
||||
float(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
// Non-unit positive strides with small vector lengths (1 to 8).
INSTANTIATE_TEST_SUITE_P(
        nonUnitPositiveIncrementSmall,
        csscalvGeneric,
        ::testing::Combine(
            // conj(alpha): always 'n' (no_conjugate); alpha is real.
            ::testing::Values('n'),
            // n: size of vector (1, 2, ..., 8).
            ::testing::Range(gtint_t(1), gtint_t(9), 1),
            // incx: stride of x vector.
            ::testing::Values(gtint_t(2), gtint_t(41)),
            // alpha: value of scalar.
            ::testing::Values(float( 7.0), float(-3.0))
        ),
        (::scalvGenericPrint<float, float>())
    );
|
||||
|
||||
// Non-unit positive strides with a few larger vector lengths.
INSTANTIATE_TEST_SUITE_P(
        nonUnitPositiveIncrementLarge,
        csscalvGeneric,
        ::testing::Combine(
            // conj(alpha): always 'n' (no_conjugate); alpha is real.
            ::testing::Values('n'),
            // n: size of vector.
            ::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
            // incx: stride of x vector.
            ::testing::Values(gtint_t(2), gtint_t(41)),
            // alpha: value of scalar.
            ::testing::Values(float( 7.0), float(-3.0))
        ),
        (::scalvGenericPrint<float, float>())
    );
|
||||
|
||||
// alpha=0 testing only for BLAS and CBLAS as
|
||||
// BLIS uses setv and won't propagate Inf and NaNs
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
alphaZero,
|
||||
csscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 0.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
// Test for negative increments.
|
||||
// Only test very few cases as sanity check.
|
||||
// We can modify the values using implementantion details.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NegativeIncrements,
|
||||
csscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'), // n: use x, c: use conj(x)
|
||||
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
|
||||
::testing::Values(3) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
#endif // not TEST_BLIS_TYPED
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -79,20 +79,41 @@ TEST_P( dscalvGeneric, API )
|
||||
|
||||
// Black box testing for generic use of dscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrement,
|
||||
unitPositiveIncrementSmall,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10),
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
// Black box testing for generic use of dscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrementLarge,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 0.0),
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
@@ -101,21 +122,20 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrement,
|
||||
nonUnitPositiveIncrementSmall,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10),
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(3)
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 0.0),
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
@@ -123,6 +143,54 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementLarge,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// alpha=0 testing only for BLAS and CBLAS as
|
||||
// BLIS uses setv and won't propagate Inf and NaNs
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
alphaZero,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 0.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
|
||||
// Only test very few cases as sanity check since conj(x) = x for real types.
|
||||
@@ -140,6 +208,23 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// Test for negative increments.
|
||||
// Only test very few cases as sanity check.
|
||||
// We can modify the values using implementation details.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NegativeIncrements,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'), // n: use x, c: use conj(x)
|
||||
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
|
||||
::testing::Values(3) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
AOCLDynamic,
|
||||
@@ -151,6 +236,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values(
|
||||
gtint_t( 30000), // nt_ideal = 1
|
||||
gtint_t( 100000), // nt_ideal = 2
|
||||
gtint_t( 486919), // nt_ideal = 8
|
||||
gtint_t( 500000), // nt_ideal = 8
|
||||
gtint_t( 2500000), // nt_ideal = 12
|
||||
gtint_t( 4000000), // nt_ideal = 16
|
||||
@@ -160,7 +246,8 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
gtint_t(1),
|
||||
gtint_t(3)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
@@ -169,4 +256,34 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// alpha=0 testing only for BLAS and CBLAS as
|
||||
// BLIS uses setv and won't propagate Inf and NaNs
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
AOCLDynamicAlphaZero,
|
||||
dscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(
|
||||
gtint_t( 89), // nt_ideal = 8
|
||||
gtint_t( 486919), // nt_ideal = 8
|
||||
gtint_t(25000000) // nt_ideal = max_available
|
||||
),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(3)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 0.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#endif // BLIS_ENABLE_OPENMP && AOCL_DYNAMIC
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "test_scalv.h"
|
||||
|
||||
template <typename T>
|
||||
class scalv_EIC : public ::testing::Test {};
|
||||
typedef ::testing::Types<float, double> TypeParam;
|
||||
TYPED_TEST_SUITE(scalv_EIC, TypeParam);
|
||||
|
||||
TYPED_TEST(scalv_EIC, zero_alpha_x_fp)
|
||||
{
|
||||
using T = TypeParam;
|
||||
gtint_t n = 10, incx = 1;
|
||||
std::vector<T> x(n);
|
||||
// Initialize x with random numbers.
|
||||
testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
|
||||
std::vector<T> x_ref(x);
|
||||
T alpha = T{0};
|
||||
|
||||
testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
|
||||
//----------------------------------------------------------
|
||||
// Call BLIS function.
|
||||
//----------------------------------------------------------
|
||||
scalv<T>('n', n, alpha, x.data(), incx);
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Compute component-wise error.
|
||||
//----------------------------------------------------------
|
||||
// Set the threshold for the errors:
|
||||
// Check gtestsuite scalv.h or netlib source code for reminder of the
|
||||
// functionality from which we estimate operation count per element
|
||||
// of output, and hence the multipler for epsilon.
|
||||
double thresh;
|
||||
if (n == 0)
|
||||
thresh = 0.0;
|
||||
else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
|
||||
thresh = 0.0;
|
||||
else
|
||||
thresh = testinghelpers::getEpsilon<T>();
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Call generic test body using those parameters
|
||||
//----------------------------------------------------------
|
||||
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
|
||||
}
|
||||
|
||||
TYPED_TEST(scalv_EIC, zero_alpha_x_inf)
|
||||
{
|
||||
using T = TypeParam;
|
||||
gtint_t n = 10, incx = 1;
|
||||
std::vector<T> x(n);
|
||||
// Initialize x with random numbers.
|
||||
testinghelpers::datagenerators::randomgenerators<T>( -10, 10, n, incx, x.data() );
|
||||
x[3] = 1.0/0.0;
|
||||
std::vector<T> x_ref(x);
|
||||
T alpha = T{0};
|
||||
testinghelpers::ref_scalv<T, T>('n', n, alpha, x_ref.data(), incx);
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Call BLIS function.
|
||||
//----------------------------------------------------------
|
||||
scalv<T>('n', n, alpha, x.data(), incx);
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Compute component-wise error.
|
||||
//----------------------------------------------------------
|
||||
// Set the threshold for the errors:
|
||||
// Check gtestsuite scalv.h or netlib source code for reminder of the
|
||||
// functionality from which we estimate operation count per element
|
||||
// of output, and hence the multipler for epsilon.
|
||||
// No adjustment applied yet for complex data.
|
||||
double thresh;
|
||||
if (n == 0)
|
||||
thresh = 0.0;
|
||||
else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
|
||||
thresh = 0.0;
|
||||
else
|
||||
thresh = testinghelpers::getEpsilon<T>();
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Call generic test body using those parameters
|
||||
//----------------------------------------------------------
|
||||
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
|
||||
}
|
||||
@@ -63,7 +63,7 @@ TEST_P( sscalvGeneric, API )
|
||||
// Check gtestsuite scalv.h or netlib source code for reminder of the
|
||||
// functionality from which we estimate operation count per element
|
||||
// of output, and hence the multiplier for epsilon.
|
||||
double thresh;
|
||||
float thresh;
|
||||
if (n == 0)
|
||||
thresh = 0.0;
|
||||
else if (alpha == testinghelpers::ZERO<T>() || alpha == testinghelpers::ONE<T>())
|
||||
@@ -77,19 +77,120 @@ TEST_P( sscalvGeneric, API )
|
||||
test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
|
||||
}
|
||||
|
||||
// Black box testing for generic and main use of sscal.
|
||||
// Black box testing for generic use of sscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
Blackbox,
|
||||
unitPositiveIncrementSmall,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'), // n: use x, not conj(x) (since it is real)
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(1)), // stride size for x
|
||||
::testing::Values(float(3.0), float(-5.0)) // alpha
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
float( 7.0),
|
||||
float(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
// Black box testing for generic use of sscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrementLarge,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
float( 7.0),
|
||||
float(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementSmall,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(17), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
float( 7.0),
|
||||
float(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementLarge,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
float( 7.0),
|
||||
float(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// alpha=0 testing only for BLAS and CBLAS as
|
||||
// BLIS uses setv and won't propagate Inf and NaNs
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
alphaZero,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
float( 0.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
|
||||
// Only test very few cases as sanity check since conj(x) = x for real types.
|
||||
@@ -101,28 +202,12 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Values('c'), // c: use conjugate
|
||||
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(1)), // stride size for x
|
||||
::testing::Values(float(9.0)) // alpha
|
||||
::testing::Values(float(-3.0)) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
#endif
|
||||
|
||||
// Test for non-unit increments.
|
||||
// Only test very few cases as sanity check.
|
||||
// We can modify the values using implementantion details.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NonUnitPositiveIncrements,
|
||||
sscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'), // n: use x
|
||||
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(2), gtint_t(11)), //(gtint_t(-5), gtint_t(-17)) // stride size for x
|
||||
::testing::Values(float(2.0)) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<float, float>())
|
||||
);
|
||||
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// Test for negative increments.
|
||||
// Only test very few cases as sanity check.
|
||||
|
||||
@@ -48,6 +48,8 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
|
||||
// Initialize vector with random numbers.
|
||||
//----------------------------------------------------------
|
||||
std::vector<T> x = testinghelpers::get_random_vector<T>( -10, 10, n, incx );
|
||||
if (alpha == testinghelpers::ZERO<U>())
|
||||
testinghelpers::set_vector( n, incx, x.data(), testinghelpers::aocl_extreme<T>() );
|
||||
|
||||
//----------------------------------------------------------
|
||||
// Call reference implementation to get ref results.
|
||||
@@ -64,7 +66,7 @@ static void test_scalv( char conja_alpha, gtint_t n, gtint_t incx, U alpha, doub
|
||||
//----------------------------------------------------------
|
||||
// Compute component-wise error.
|
||||
//----------------------------------------------------------
|
||||
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh );
|
||||
computediff<T>( "x", n, x.data(), x_ref.data(), incx, thresh, true );
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -82,59 +82,187 @@ TEST_P( zdscalvGeneric, API )
|
||||
|
||||
// bli_zdscal not present in BLIS
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// Black box testing for zdscal.
|
||||
// Tests with unit-positive increment.
|
||||
|
||||
// Black box testing for generic use of zdscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrement,
|
||||
unitPositiveIncrementSmall,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
#endif
|
||||
),
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10),
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(gtint_t(1)),
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double(-5.1),
|
||||
double( 0.0),
|
||||
double( 7.3)
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, double>())
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
|
||||
// Tests for non-unit increments.
|
||||
// Black box testing for generic use of zdscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrement,
|
||||
unitPositiveIncrementLarge,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
#endif
|
||||
),
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10),
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementSmall,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementLarge,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 7.0),
|
||||
double(-3.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
// alpha=0 testing only for BLAS and CBLAS as
|
||||
// BLIS uses setv and won't propagate Inf and NaNs
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
alphaZero,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 0.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
// Test for negative increments.
|
||||
// Only test very few cases as sanity check.
|
||||
// We can modify the values using implementation details.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NegativeIncrements,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'), // n: use x, c: use conj(x)
|
||||
::testing::Range(gtint_t(10), gtint_t(31), 10), // m size of vector takes values from 10 to 100 with step size of 10.
|
||||
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
|
||||
::testing::Values(3) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
AOCLDynamic,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(
|
||||
gtint_t( 10000), // nt_ideal = 1
|
||||
gtint_t( 20000), // nt_ideal = 4
|
||||
gtint_t( 486919), // nt_ideal = 8
|
||||
gtint_t( 1000000), // nt_ideal = 8
|
||||
gtint_t( 2500000), // nt_ideal = 12
|
||||
gtint_t( 5000000), // nt_ideal = 32
|
||||
gtint_t( 7000000) // nt_ideal = max_available
|
||||
),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(3)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double(-5.1),
|
||||
double( 0.0),
|
||||
double( 7.3)
|
||||
double( 7.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, double>())
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
AOCLDynamicAlphaZero,
|
||||
zdscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(
|
||||
gtint_t( 486919), // nt_ideal = 8
|
||||
gtint_t( 7000000) // nt_ideal = max_available
|
||||
),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(3)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
double( 0.0)
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<double, double>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#endif // not TEST_BLIS_TYPED
|
||||
|
||||
@@ -78,26 +78,22 @@ TEST_P( zscalvGeneric, API )
|
||||
test_scalv<T>( conj_alpha, n, incx, alpha, thresh );
|
||||
}
|
||||
|
||||
// Black box testing for zscal.
|
||||
// Tests with unit-positive increment.
|
||||
// Black box testing for generic use of zscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
unitPositiveIncrement,
|
||||
unitPositiveIncrementSmall,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
#endif
|
||||
),
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10),
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(gtint_t(1)),
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
dcomplex{-5.1, -7.3},
|
||||
dcomplex{ 0.0, 0.0},
|
||||
dcomplex{ 1.0, 1.0},
|
||||
dcomplex{ 7.3, 5.1}
|
||||
)
|
||||
@@ -105,32 +101,131 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
(::scalvGenericPrint<dcomplex, dcomplex>())
|
||||
);
|
||||
|
||||
|
||||
// Test for non-unit increments.
|
||||
// Black box testing for generic use of zscal.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrement,
|
||||
unitPositiveIncrementLarge,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
, 'c' // this option is BLIS-api specific.
|
||||
#endif
|
||||
),
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(10), gtint_t(101), 10),
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
dcomplex{-5.1, -7.3},
|
||||
dcomplex{ 1.0, 1.0},
|
||||
dcomplex{ 7.3, 5.1}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, dcomplex>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementSmall,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(9), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(3)
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
dcomplex{-5.1, -7.3},
|
||||
dcomplex{ 0.0, 0.0},
|
||||
dcomplex{ 1.0, 1.0},
|
||||
dcomplex{ 7.3, 5.1}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, dcomplex>())
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
nonUnitPositiveIncrementLarge,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Values(gtint_t(111), gtint_t(193), gtint_t(403)),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
dcomplex{-5.1, -7.3},
|
||||
dcomplex{ 1.0, 1.0},
|
||||
dcomplex{ 7.3, 5.1}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, dcomplex>())
|
||||
);
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// alpha=0 testing only for BLAS and CBLAS as
|
||||
// BLIS uses setv and won't propagate Inf and NaNs
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
alphaZero,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
// conj(alpha): uses n (no_conjugate) since it is real.
|
||||
::testing::Values('n'),
|
||||
// m: size of vector.
|
||||
::testing::Range(gtint_t(1), gtint_t(101), 1),
|
||||
// incx: stride of x vector.
|
||||
::testing::Values(
|
||||
gtint_t(1),
|
||||
gtint_t(2),
|
||||
gtint_t(41)
|
||||
),
|
||||
// alpha: value of scalar.
|
||||
::testing::Values(
|
||||
dcomplex{ 0.0, 0.0}
|
||||
)
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, dcomplex>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifdef TEST_BLIS_TYPED
|
||||
// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
|
||||
// Only test very few cases as sanity check since conj(x) = x for real types.
|
||||
// We can modify the values using implementation details.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
conjalpha,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('c'), // c: use conjugate
|
||||
::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)), // m: size of vector takes the values 3, 30 and 112.
|
||||
::testing::Values(gtint_t(1)), // stride size for x
|
||||
::testing::Values(dcomplex{ 7.3, 5.1}) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, dcomplex>())
|
||||
);
|
||||
#endif
|
||||
|
||||
#ifndef TEST_BLIS_TYPED
|
||||
// Test for negative increments.
|
||||
// Only test very few cases as sanity check.
|
||||
// We can modify the values using implementation details.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NegativeIncrements,
|
||||
zscalvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('n'), // n: use x, c: use conj(x)
|
||||
::testing::Range(gtint_t(10), gtint_t(31), 10), // m: size of vector takes values from 10 to 30 with step size of 10.
|
||||
::testing::Values(gtint_t(-2), gtint_t(-1)), // stride size for x
|
||||
::testing::Values(dcomplex{ 7.3, 5.1}) // alpha
|
||||
),
|
||||
(::scalvGenericPrint<dcomplex, dcomplex>())
|
||||
);
|
||||
#endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2017 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -80,9 +80,11 @@ void bli_sscalv_zen_int
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv (in case y contains NaN or Inf).
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) )
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
float* zero = bli_s0;
|
||||
if (cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
@@ -96,10 +98,12 @@ void bli_sscalv_zen_int
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
@@ -107,7 +111,7 @@ void bli_sscalv_zen_int
|
||||
if ( incx != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
n_left = n0;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
@@ -178,10 +182,11 @@ void bli_dscalv_zen_int
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv (in case y contains NaN or Inf).
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) )
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
double* zero = bli_d0;
|
||||
if (cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
@@ -195,10 +200,12 @@ void bli_dscalv_zen_int
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
n_viter = ( n0 ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n0 ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
@@ -206,7 +213,7 @@ void bli_dscalv_zen_int
|
||||
if ( incx != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
n_left = n0;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
|
||||
@@ -60,8 +60,8 @@ void bli_sscalv_zen_int10
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv.
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) )
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
float* zero = bli_s0;
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
@@ -78,6 +78,8 @@ void bli_sscalv_zen_int10
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
|
||||
@@ -88,11 +90,11 @@ void bli_sscalv_zen_int10
|
||||
dim_t option;
|
||||
|
||||
// Unroll and the loop used is picked based on the input size.
|
||||
if( n < 300)
|
||||
if( n0 < 300)
|
||||
{
|
||||
option = 2;
|
||||
}
|
||||
else if( n < 500)
|
||||
else if( n0 < 500)
|
||||
{
|
||||
option = 1;
|
||||
}
|
||||
@@ -105,7 +107,7 @@ void bli_sscalv_zen_int10
|
||||
{
|
||||
case 0:
|
||||
|
||||
for ( ; (i + 127) < n; i += 128 )
|
||||
for ( ; (i + 127) < n0; i += 128 )
|
||||
{
|
||||
//Load the input values
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
@@ -175,7 +177,7 @@ void bli_sscalv_zen_int10
|
||||
|
||||
case 1 :
|
||||
|
||||
for ( ; (i + 95) < n; i += 96 )
|
||||
for ( ; (i + 95) < n0; i += 96 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
@@ -227,7 +229,7 @@ void bli_sscalv_zen_int10
|
||||
|
||||
case 2:
|
||||
|
||||
for ( ; (i + 47) < n; i += 48 )
|
||||
for ( ; (i + 47) < n0; i += 48 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
@@ -256,7 +258,7 @@ void bli_sscalv_zen_int10
|
||||
x0 += 6*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 23) < n; i += 24 )
|
||||
for ( ; (i + 23) < n0; i += 24 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
@@ -273,7 +275,7 @@ void bli_sscalv_zen_int10
|
||||
x0 += 3*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
for ( ; (i + 7) < n0; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
|
||||
@@ -284,7 +286,7 @@ void bli_sscalv_zen_int10
|
||||
x0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
for ( ; (i + 0) < n0; i += 1 )
|
||||
{
|
||||
*x0 *= *alpha;
|
||||
|
||||
@@ -296,7 +298,7 @@ void bli_sscalv_zen_int10
|
||||
{
|
||||
const float alphac = *alpha;
|
||||
|
||||
for ( ; i < n; ++i )
|
||||
for ( ; i < n0; ++i )
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
@@ -329,8 +331,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv.
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) )
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
double* zero = bli_d0;
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
@@ -348,6 +350,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
|
||||
@@ -358,11 +362,11 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
dim_t option;
|
||||
|
||||
// Unroll and the loop used is picked based on the input size.
|
||||
if(n < 200)
|
||||
if(n0 < 200)
|
||||
{
|
||||
option = 2;
|
||||
}
|
||||
else if(n < 500)
|
||||
else if(n0 < 500)
|
||||
{
|
||||
option = 1;
|
||||
}
|
||||
@@ -375,7 +379,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (; (i + 63) < n; i += 64 )
|
||||
for (; (i + 63) < n0; i += 64 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
@@ -440,7 +444,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
x0 += 16*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 47) < n; i += 48 )
|
||||
for (; (i + 47) < n0; i += 48 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
@@ -492,7 +496,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
|
||||
case 1:
|
||||
|
||||
for (; (i + 31) < n; i += 32 )
|
||||
for (; (i + 31) < n0; i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
@@ -529,7 +533,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
|
||||
case 2:
|
||||
|
||||
for ( ; (i + 11) < n; i += 12 )
|
||||
for ( ; (i + 11) < n0; i += 12 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
@@ -546,7 +550,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
x0 += 3*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 3) < n; i += 4 )
|
||||
for ( ; (i + 3) < n0; i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
@@ -557,7 +561,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
x0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
for ( ; (i + 0) < n0; i += 1 )
|
||||
{
|
||||
*x0 *= *alpha;
|
||||
|
||||
@@ -569,7 +573,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int10
|
||||
{
|
||||
const double alphac = *alpha;
|
||||
|
||||
for ( ; i < n; ++i )
|
||||
for ( ; i < n0; ++i )
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
@@ -587,6 +591,30 @@ void bli_zdscalv_zen_int10
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha )) return;
|
||||
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
// Expert interface of setv is invoked when alpha is zero
|
||||
dcomplex *zero = bli_z0;
|
||||
|
||||
/* When alpha is zero all the elements in x are set to zero */
|
||||
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
dim_t i = 0;
|
||||
const dim_t n_elem_per_reg = 4; // number of elements per register
|
||||
|
||||
@@ -607,7 +635,7 @@ void bli_zdscalv_zen_int10
|
||||
|
||||
alphav = _mm256_broadcast_sd( &alphac );
|
||||
|
||||
for ( ; ( i + 29 ) < n; i += 30 )
|
||||
for ( ; ( i + 29 ) < n0; i += 30 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 );
|
||||
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
|
||||
@@ -660,7 +688,7 @@ void bli_zdscalv_zen_int10
|
||||
x0 += 15 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 23 ) < n; i += 24 )
|
||||
for ( ; ( i + 23 ) < n0; i += 24 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 );
|
||||
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
|
||||
@@ -704,7 +732,7 @@ void bli_zdscalv_zen_int10
|
||||
x0 += 12 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 15 ) < n; i += 16 )
|
||||
for ( ; ( i + 15 ) < n0; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 );
|
||||
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
|
||||
@@ -736,7 +764,7 @@ void bli_zdscalv_zen_int10
|
||||
x0 += 8 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 7 ) < n; i += 8 )
|
||||
for ( ; ( i + 7 ) < n0; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 );
|
||||
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
|
||||
@@ -756,7 +784,7 @@ void bli_zdscalv_zen_int10
|
||||
x0 += 4 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 3 ) < n; i += 4 )
|
||||
for ( ; ( i + 3 ) < n0; i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 );
|
||||
xv[1] = _mm256_loadu_pd( x0 + n_elem_per_reg );
|
||||
@@ -770,7 +798,7 @@ void bli_zdscalv_zen_int10
|
||||
x0 += 2 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 1 ) < n; i += 2 )
|
||||
for ( ; ( i + 1 ) < n0; i += 2 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 );
|
||||
|
||||
@@ -795,7 +823,7 @@ void bli_zdscalv_zen_int10
|
||||
|
||||
alpha_reg = _mm_set1_pd((*alpha).real);
|
||||
|
||||
for (; i < n; ++i)
|
||||
for (; i < n0; ++i)
|
||||
{
|
||||
x_vec = _mm_loadu_pd(x0);
|
||||
|
||||
@@ -816,24 +844,14 @@ void bli_cscalv_zen_int
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
/*
|
||||
Undefined behaviour
|
||||
-------------------
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;
|
||||
|
||||
1. This layer is not BLAS complaint and the kernel results in
|
||||
undefined behaviour when n <= 0 and incx <= 1. The expectation
|
||||
is that the application/higher-layer invoking this layer should
|
||||
the arg checks.
|
||||
*/
|
||||
// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
|
||||
// return;
|
||||
|
||||
// To Do: This call to SETV needs to be removed for BLAS compliance
|
||||
// Currently removing this is resulting in ZHERK failures
|
||||
if (PASTEMAC(c, eq0)(*alpha))
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
// Expert interface of setv is invoked when alpha is zero
|
||||
scomplex *zero = PASTEMAC(c, 0);
|
||||
scomplex *zero = bli_c0;
|
||||
|
||||
/* When alpha is zero all the elements in x are set to zero */
|
||||
PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
|
||||
@@ -848,6 +866,8 @@ void bli_cscalv_zen_int
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
dim_t i = 0;
|
||||
scomplex alpha_conj;
|
||||
float *x0 = (float *)x;
|
||||
@@ -897,7 +917,7 @@ void bli_cscalv_zen_int
|
||||
and then store
|
||||
*/
|
||||
|
||||
for (; (i + 15) < n; i += 16)
|
||||
for (; (i + 15) < n0; i += 16)
|
||||
{
|
||||
x_vec_ymm[0] = _mm256_loadu_ps(x0);
|
||||
x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
|
||||
@@ -927,7 +947,7 @@ void bli_cscalv_zen_int
|
||||
x0 += 4 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 7) < n; i += 8)
|
||||
for (; (i + 7) < n0; i += 8)
|
||||
{
|
||||
x_vec_ymm[0] = _mm256_loadu_ps(x0);
|
||||
x_vec_ymm[1] = _mm256_loadu_ps(x0 + n_elem_per_reg);
|
||||
@@ -947,7 +967,7 @@ void bli_cscalv_zen_int
|
||||
x0 += 2 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 3) < n; i += 4)
|
||||
for (; (i + 3) < n0; i += 4)
|
||||
{
|
||||
x_vec_ymm[0] = _mm256_loadu_ps(x0);
|
||||
|
||||
@@ -969,7 +989,7 @@ void bli_cscalv_zen_int
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
for (; i < n; i++)
|
||||
for (; i < n0; i++)
|
||||
{
|
||||
float x_real, x_imag;
|
||||
x_real = real * (*x0) - imag * (*(x0 + 1));
|
||||
@@ -991,24 +1011,14 @@ void bli_zscalv_zen_int
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
/*
|
||||
Undefined behaviour
|
||||
-------------------
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
|
||||
|
||||
1. This layer is not BLAS complaint and the kernel results in
|
||||
undefined behaviour when n <= 0 and incx <= 1. The expectation
|
||||
is that the application/higher-layer invoking this layer should
|
||||
the arg checks.
|
||||
*/
|
||||
// if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
|
||||
// return;
|
||||
|
||||
// To Do: This call to SETV needs to be removed for BLAS compliance
|
||||
// Currently removing this is resulting in ZHERK failures
|
||||
if (PASTEMAC(z, eq0)(*alpha))
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
// Expert interface of setv is invoked when alpha is zero
|
||||
dcomplex *zero = PASTEMAC(z, 0);
|
||||
dcomplex *zero = bli_z0;
|
||||
|
||||
/* When alpha is zero all the elements in x are set to zero */
|
||||
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
|
||||
@@ -1023,6 +1033,8 @@ void bli_zscalv_zen_int
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
dim_t i = 0;
|
||||
dcomplex alpha_conj;
|
||||
double *x0 = (double *)x;
|
||||
@@ -1033,8 +1045,8 @@ void bli_zscalv_zen_int
|
||||
double real = alpha_conj.real;
|
||||
double imag = alpha_conj.imag;
|
||||
|
||||
/*When incx is 1 and n >= 2 it is possible to use AVX2 instructions*/
|
||||
if (incx == 1 && n >= 2)
|
||||
/*When incx is 1 and n0 >= 2 it is possible to use AVX2 instructions*/
|
||||
if (incx == 1 && n0 >= 2)
|
||||
{
|
||||
dim_t const n_elem_per_reg = 4;
|
||||
|
||||
@@ -1072,7 +1084,7 @@ void bli_zscalv_zen_int
|
||||
and then store
|
||||
*/
|
||||
|
||||
for (; (i + 7) < n; i += 8)
|
||||
for (; (i + 7) < n0; i += 8)
|
||||
{
|
||||
x_vec_ymm[0] = _mm256_loadu_pd(x0);
|
||||
x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
|
||||
@@ -1106,7 +1118,7 @@ void bli_zscalv_zen_int
|
||||
x0 += 4 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 3) < n; i += 4)
|
||||
for (; (i + 3) < n0; i += 4)
|
||||
{
|
||||
x_vec_ymm[0] = _mm256_loadu_pd(x0);
|
||||
x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
|
||||
@@ -1126,7 +1138,7 @@ void bli_zscalv_zen_int
|
||||
x0 += 2 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 1) < n; i += 2)
|
||||
for (; (i + 1) < n0; i += 2)
|
||||
{
|
||||
x_vec_ymm[0] = _mm256_loadu_pd(x0);
|
||||
|
||||
@@ -1155,7 +1167,7 @@ void bli_zscalv_zen_int
|
||||
alpha_real_xmm = _mm_set1_pd(real);
|
||||
alpha_imag_xmm = _mm_set1_pd(imag);
|
||||
|
||||
for (; i < n; i++)
|
||||
for (; i < n0; i++)
|
||||
{
|
||||
x_vec_xmm = _mm_loadu_pd(x0);
|
||||
|
||||
|
||||
@@ -61,13 +61,14 @@
|
||||
Deviation from BLAS
|
||||
--------------------
|
||||
|
||||
None
|
||||
Setv is used when alpha=0 unless a negative value of n is supplied.
|
||||
A negative n is passed only by the BLAS and CBLAS scal APIs.
|
||||
|
||||
Undefined behaviour
|
||||
-------------------
|
||||
|
||||
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
|
||||
is that these are standard BLAS exceptions and should be handled in a higher layer.
|
||||
None
|
||||
|
||||
*/
|
||||
void bli_sscalv_zen_int_avx512
|
||||
(
|
||||
@@ -78,6 +79,30 @@ void bli_sscalv_zen_int_avx512
|
||||
cntx_t *restrict cntx
|
||||
)
|
||||
{
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
float *zero = bli_s0;
|
||||
if (cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SETV_KER, cntx);
|
||||
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
dim_t i = 0;
|
||||
float *restrict x0 = x;
|
||||
|
||||
@@ -89,7 +114,7 @@ void bli_sscalv_zen_int_avx512
|
||||
__m512 xv[8], alphav;
|
||||
alphav = _mm512_set1_ps(*alpha);
|
||||
|
||||
for (i = 0; (i + 127) < n; i += 128)
|
||||
for (i = 0; (i + 127) < n0; i += 128)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
|
||||
@@ -125,7 +150,7 @@ void bli_sscalv_zen_int_avx512
|
||||
x0 += 8 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 63) < n; i += 64)
|
||||
for (; (i + 63) < n0; i += 64)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
|
||||
@@ -147,7 +172,7 @@ void bli_sscalv_zen_int_avx512
|
||||
x0 += 4 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 31) < n; i += 32)
|
||||
for (; (i + 31) < n0; i += 32)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
|
||||
@@ -163,7 +188,7 @@ void bli_sscalv_zen_int_avx512
|
||||
x0 += 2 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 15) < n; i += 16)
|
||||
for (; (i + 15) < n0; i += 16)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
|
||||
@@ -176,7 +201,7 @@ void bli_sscalv_zen_int_avx512
|
||||
x0 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 7) < n; i += 8)
|
||||
for (; (i + 7) < n0; i += 8)
|
||||
{
|
||||
// Loading the input values
|
||||
__m256 x_vec = _mm256_loadu_ps(x0);
|
||||
@@ -198,7 +223,7 @@ void bli_sscalv_zen_int_avx512
|
||||
*/
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (; (i + 3) < n; i += 4)
|
||||
for (; (i + 3) < n0; i += 4)
|
||||
{
|
||||
// Loading the input values
|
||||
__m128 x_vec = _mm_loadu_ps(x0);
|
||||
@@ -215,7 +240,7 @@ void bli_sscalv_zen_int_avx512
|
||||
|
||||
const float alphac = *alpha;
|
||||
|
||||
for (; i < n; ++i)
|
||||
for (; i < n0; ++i)
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
@@ -252,13 +277,14 @@ void bli_sscalv_zen_int_avx512
|
||||
Deviation from BLAS
|
||||
--------------------
|
||||
|
||||
None
|
||||
Setv is used when alpha=0 unless a negative value of n is supplied.
|
||||
A negative n is passed only by the BLAS and CBLAS scal APIs.
|
||||
|
||||
Undefined behaviour
|
||||
-------------------
|
||||
|
||||
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
|
||||
is that these are standard BLAS exceptions and should be handled in a higher layer.
|
||||
None
|
||||
|
||||
*/
|
||||
BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
(
|
||||
@@ -270,11 +296,10 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
)
|
||||
{
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if (bli_zero_dim1(n) || PASTEMAC(d, eq1)(*alpha))
|
||||
return;
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv.
|
||||
if (PASTEMAC(d, eq0)(*alpha))
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
double *zero = bli_d0;
|
||||
if (cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
@@ -292,6 +317,8 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
dim_t i = 0;
|
||||
double *restrict x0;
|
||||
|
||||
@@ -307,7 +334,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
alphav = _mm512_set1_pd(*alpha);
|
||||
__m512d xv[8];
|
||||
|
||||
for (i = 0; (i + 63) < n; i += 64)
|
||||
for (i = 0; (i + 63) < n0; i += 64)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
|
||||
@@ -343,7 +370,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
x0 += 8 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 31) < n; i += 32)
|
||||
for (; (i + 31) < n0; i += 32)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
|
||||
@@ -365,7 +392,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
x0 += 4 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 15) < n; i += 16)
|
||||
for (; (i + 15) < n0; i += 16)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
|
||||
@@ -381,7 +408,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
x0 += 2 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 7) < n; i += 8)
|
||||
for (; (i + 7) < n0; i += 8)
|
||||
{
|
||||
// Loading the input values
|
||||
xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
|
||||
@@ -394,7 +421,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
x0 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 3) < n; i += 4)
|
||||
for (; (i + 3) < n0; i += 4)
|
||||
{
|
||||
// Loading the input values
|
||||
__m256d x_vec = _mm256_loadu_pd(x0);
|
||||
@@ -416,7 +443,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
*/
|
||||
_mm256_zeroupper();
|
||||
|
||||
for (; (i + 1) < n; i += 2)
|
||||
for (; (i + 1) < n0; i += 2)
|
||||
{
|
||||
// Loading the input values
|
||||
__m128d x_vec = _mm_loadu_pd(x0);
|
||||
@@ -433,7 +460,7 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
|
||||
const double alphac = *alpha;
|
||||
|
||||
for (; i < n; ++i)
|
||||
for (; i < n0; ++i)
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
@@ -468,13 +495,14 @@ BLIS_EXPORT_BLIS void bli_dscalv_zen_int_avx512
|
||||
Deviation from BLAS
|
||||
--------------------
|
||||
|
||||
None
|
||||
Setv is used when alpha=0 unless a negative value of n is supplied.
|
||||
A negative n is passed only by the BLAS and CBLAS scal APIs.
|
||||
|
||||
Undefined behaviour
|
||||
-------------------
|
||||
|
||||
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
|
||||
is that these are standard BLAS exceptions and should be handled in a higher layer.
|
||||
None
|
||||
|
||||
*/
|
||||
void bli_zdscalv_zen_int_avx512
|
||||
(
|
||||
@@ -491,6 +519,31 @@ void bli_zdscalv_zen_int_avx512
|
||||
alpha is passed as double complex to adhere
|
||||
to function pointer definition in BLIS
|
||||
*/
|
||||
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(z,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
// Expert interface of setv is invoked when alpha is zero
|
||||
dcomplex *zero = bli_z0;
|
||||
|
||||
/* When alpha is zero, all the elements in x are set to zero */
|
||||
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx,
|
||||
NULL);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
const double alphac = (*alpha).real;
|
||||
|
||||
dim_t i = 0;
|
||||
@@ -504,7 +557,7 @@ void bli_zdscalv_zen_int_avx512
|
||||
|
||||
alphav = _mm512_set1_pd(alphac);
|
||||
|
||||
for (; (i + 15) < n; i += 16)
|
||||
for (; (i + 15) < n0; i += 16)
|
||||
{
|
||||
xv[0] = _mm512_loadu_pd(x0);
|
||||
xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
|
||||
@@ -524,7 +577,7 @@ void bli_zdscalv_zen_int_avx512
|
||||
x0 += 4 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 7) < n; i += 8)
|
||||
for (; (i + 7) < n0; i += 8)
|
||||
{
|
||||
xv[0] = _mm512_loadu_pd(x0);
|
||||
xv[1] = _mm512_loadu_pd(x0 + n_elem_per_reg);
|
||||
@@ -538,7 +591,7 @@ void bli_zdscalv_zen_int_avx512
|
||||
x0 += 2 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 3) < n; i += 4)
|
||||
for (; (i + 3) < n0; i += 4)
|
||||
{
|
||||
xv[0] = _mm512_loadu_pd(x0);
|
||||
|
||||
@@ -549,7 +602,7 @@ void bli_zdscalv_zen_int_avx512
|
||||
x0 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
for (; (i + 1) < n; i += 2)
|
||||
for (; (i + 1) < n0; i += 2)
|
||||
{
|
||||
__m256d xv = _mm256_loadu_pd(x0);
|
||||
|
||||
@@ -576,7 +629,7 @@ void bli_zdscalv_zen_int_avx512
|
||||
|
||||
alpha_reg = _mm_set1_pd((*alpha).real);
|
||||
|
||||
for (; i < n; ++i)
|
||||
for (; i < n0; ++i)
|
||||
{
|
||||
x_vec = _mm_loadu_pd(x0);
|
||||
|
||||
@@ -674,8 +727,8 @@ void bli_zdscalv_zen_int_avx512
|
||||
Undefined behaviour
|
||||
-------------------
|
||||
|
||||
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
|
||||
is that these are standard BLAS exceptions and should be handled in a higher layer.
|
||||
None
|
||||
|
||||
*/
|
||||
void bli_cscalv_zen_int_avx512
|
||||
(
|
||||
@@ -689,14 +742,11 @@ void bli_cscalv_zen_int_avx512
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(c,eq1)( *alpha ) ) return;
|
||||
|
||||
/**
|
||||
* @note Currently this kernel is not BLAS compliant. For BLAS compliance,
|
||||
* the below call to SETV needs to be removed.
|
||||
*/
|
||||
if ( PASTEMAC(c,eq0)(*alpha) )
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if ( PASTEMAC(c,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
// Expert interface of setv is invoked when alpha is zero
|
||||
scomplex *zero = PASTEMAC(c,0);
|
||||
scomplex *zero = bli_c0;
|
||||
|
||||
/* When alpha is zero, all the elements in x are set to zero */
|
||||
PASTEMAC2(c, setv, BLIS_TAPI_EX_SUF)
|
||||
@@ -712,6 +762,8 @@ void bli_cscalv_zen_int_avx512
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
dim_t i = 0;
|
||||
scomplex alpha_conj;
|
||||
float* restrict x0 = (float*) x;
|
||||
@@ -760,7 +812,7 @@ void bli_cscalv_zen_int_avx512
|
||||
*/
|
||||
|
||||
// Processing 96 scomplex elements (192 floats) per iteration
|
||||
for ( ; (i + 95) < n; i += 96 )
|
||||
for ( ; (i + 95) < n0; i += 96 )
|
||||
{
|
||||
__m512 xv[12], inter[12];
|
||||
|
||||
@@ -776,7 +828,7 @@ void bli_cscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 64 scomplex elements (128 floats) per iteration
|
||||
for ( ; (i + 63) < n; i += 64 )
|
||||
for ( ; (i + 63) < n0; i += 64 )
|
||||
{
|
||||
__m512 xv[8], inter[8];
|
||||
|
||||
@@ -790,7 +842,7 @@ void bli_cscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 32 scomplex elements (64 floats) per iteration
|
||||
for ( ; (i + 31) < n; i += 32 )
|
||||
for ( ; (i + 31) < n0; i += 32 )
|
||||
{
|
||||
__m512 xv[4], inter[4];
|
||||
|
||||
@@ -802,7 +854,7 @@ void bli_cscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 16 scomplex elements (32 floats) per iteration
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
for ( ; (i + 15) < n0; i += 16 )
|
||||
{
|
||||
__m512 xv[2], inter[2];
|
||||
|
||||
@@ -842,7 +894,7 @@ void bli_cscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 8 scomplex elements (16 floats) per iteration
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
for ( ; (i + 7) < n0; i += 8 )
|
||||
{
|
||||
__m512 xv[1], inter[1];
|
||||
|
||||
@@ -877,21 +929,23 @@ void bli_cscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing remaining elements, if any.
|
||||
if ( i < n ) {
|
||||
if ( i < n0 )
|
||||
{
|
||||
// Setting the mask bit based on remaining elements.
|
||||
// Since each scomplex element corresponds to 2 floats,
|
||||
// we need to load and store 2*(n-i) elements.
|
||||
// we need to load and store 2*(n0-i) elements.
|
||||
|
||||
__mmask16 mask = ( 1 << ( 2 * ( n - i ) ) ) - 1;
|
||||
__mmask16 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;
|
||||
|
||||
__m512 xv, temp;
|
||||
|
||||
__m512 xv, inter;
|
||||
xv = _mm512_maskz_loadu_ps( mask, x0 );
|
||||
|
||||
inter = _mm512_permute_ps( xv, 0xB1 );
|
||||
temp = _mm512_permute_ps( xv, 0xB1 );
|
||||
|
||||
inter = _mm512_mul_ps( alphaIv, inter );
|
||||
temp = _mm512_mul_ps( alphaIv, temp );
|
||||
|
||||
xv = _mm512_fmaddsub_ps( alphaRv, xv, inter );
|
||||
xv = _mm512_fmaddsub_ps( alphaRv, xv, temp );
|
||||
|
||||
_mm512_mask_storeu_ps( x0, mask, xv );
|
||||
}
|
||||
@@ -902,7 +956,7 @@ void bli_cscalv_zen_int_avx512
|
||||
const float alphaI = alpha_conj.imag;
|
||||
|
||||
float x0R, x0I;
|
||||
for (; i < n; ++i)
|
||||
for (; i < n0; ++i)
|
||||
{
|
||||
x0R = *(x0);
|
||||
x0I = *(x0 + 1);
|
||||
@@ -942,13 +996,14 @@ void bli_cscalv_zen_int_avx512
|
||||
Deviation from BLAS
|
||||
--------------------
|
||||
|
||||
None
|
||||
Setv is used when alpha=0 unless a negative value of n is supplied.
|
||||
This only occurs in calls from BLAS and CBLAS scal APIs.
|
||||
|
||||
Undefined behaviour
|
||||
-------------------
|
||||
|
||||
1. The kernel results in undefined behaviour when n <= 0 and incx <= 1. The expectation
|
||||
is that these are standard BLAS exceptions and should be handled in a higher layer.
|
||||
None
|
||||
|
||||
*/
|
||||
void bli_zscalv_zen_int_avx512
|
||||
(
|
||||
@@ -960,17 +1015,13 @@ void bli_zscalv_zen_int_avx512
|
||||
)
|
||||
{
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
|
||||
return;
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(z,eq1)( *alpha ) ) return;
|
||||
|
||||
/**
|
||||
* @note Currently this kernel is not BLAS compliant. For BLAS compliance,
|
||||
* the below call to SETV needs to be removed.
|
||||
*/
|
||||
if (PASTEMAC(z, eq0)(*alpha))
|
||||
// If alpha is zero, use setv if not called from BLAS scal itself (indicated by n being negative).
|
||||
if (PASTEMAC(z,eq0)( *alpha ) && n > 0 )
|
||||
{
|
||||
// Expert interface of setv is invoked when alpha is zero
|
||||
dcomplex *zero = PASTEMAC(z, 0);
|
||||
dcomplex *zero = bli_z0;
|
||||
|
||||
/* When alpha is zero, all the elements in x are set to zero */
|
||||
PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
|
||||
@@ -985,6 +1036,8 @@ void bli_zscalv_zen_int_avx512
|
||||
return;
|
||||
}
|
||||
|
||||
dim_t n0 = bli_abs(n);
|
||||
|
||||
dim_t i = 0;
|
||||
dcomplex alpha_conj;
|
||||
double *restrict x0 = (double *)x;
|
||||
@@ -1022,7 +1075,7 @@ void bli_zscalv_zen_int_avx512
|
||||
*/
|
||||
|
||||
// Processing 48 dcomplex elements per iteration.
|
||||
for (; (i + 47) < n; i += 48)
|
||||
for (; (i + 47) < n0; i += 48)
|
||||
{
|
||||
__m512d xv[12], temp[12];
|
||||
|
||||
@@ -1116,7 +1169,7 @@ void bli_zscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 32 dcomplex elements per iteration.
|
||||
for (; (i + 31) < n; i += 32)
|
||||
for (; (i + 31) < n0; i += 32)
|
||||
{
|
||||
__m512d xv[8], temp[8];
|
||||
xv[0] = _mm512_loadu_pd(x0);
|
||||
@@ -1173,7 +1226,7 @@ void bli_zscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 16 dcomplex elements per iteration.
|
||||
for (; (i + 15) < n; i += 16)
|
||||
for (; (i + 15) < n0; i += 16)
|
||||
{
|
||||
__m512d xv[4], temp[4];
|
||||
xv[0] = _mm512_loadu_pd(x0);
|
||||
@@ -1205,7 +1258,7 @@ void bli_zscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 8 dcomplex elements per iteration.
|
||||
for (; (i + 7) < n; i += 8)
|
||||
for (; (i + 7) < n0; i += 8)
|
||||
{
|
||||
__m512d xv[2], temp[2];
|
||||
xv[0] = _mm512_loadu_pd(x0);
|
||||
@@ -1227,7 +1280,7 @@ void bli_zscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing 4 dcomplex elements per iteration.
|
||||
for (; (i + 3) < n; i += 4)
|
||||
for (; (i + 3) < n0; i += 4)
|
||||
{
|
||||
__m512d xv, temp;
|
||||
xv = _mm512_loadu_pd(x0);
|
||||
@@ -1244,23 +1297,24 @@ void bli_zscalv_zen_int_avx512
|
||||
}
|
||||
|
||||
// Processing the remainder elements.
|
||||
if( i < n )
|
||||
if( i < n0 )
|
||||
{
|
||||
// Setting the mask bit based on remaining elements
|
||||
// Since each dcomplex element corresponds to 2 doubles,
|
||||
// we need to load and store 2*(m-i) elements.
|
||||
__mmask8 mask = (1 << (2 * (n-i)) ) - 1;
|
||||
// we need to load and store 2*(n0-i) elements.
|
||||
|
||||
__mmask8 mask = ( 1 << ( 2 * ( n0 - i ) ) ) - 1;
|
||||
|
||||
__m512d xv, temp, zero;
|
||||
zero = _mm512_setzero_pd();
|
||||
|
||||
xv = _mm512_mask_loadu_pd( zero, mask, x0 );
|
||||
|
||||
temp = _mm512_permute_pd(xv, 0x55);
|
||||
temp = _mm512_permute_pd( xv, 0x55 );
|
||||
|
||||
temp = _mm512_mul_pd(alphaIv, temp);
|
||||
temp = _mm512_mul_pd( alphaIv, temp );
|
||||
|
||||
xv = _mm512_fmaddsub_pd(alphaRv, xv, temp);
|
||||
xv = _mm512_fmaddsub_pd( alphaRv, xv, temp );
|
||||
|
||||
_mm512_mask_storeu_pd( x0, mask, xv );
|
||||
}
|
||||
@@ -1272,7 +1326,7 @@ void bli_zscalv_zen_int_avx512
|
||||
alphaRv = _mm_loaddup_pd(&alphaR);
|
||||
alphaIv = _mm_loaddup_pd(&alphaI);
|
||||
|
||||
for (; i < n; ++i)
|
||||
for (; i < n0; ++i)
|
||||
{
|
||||
x_vec = _mm_loadu_pd(x0);
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -52,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \
|
||||
\
|
||||
/* If alpha is zero, use setv unless n is negative (i.e. called from BLAS scal itself). */ \
|
||||
if ( PASTEMAC(ch,eq0)( *alpha ) ) \
|
||||
if ( PASTEMAC(ch,eq0)( *alpha ) && n > 0) \
|
||||
{ \
|
||||
ctype* zero = PASTEMAC(ch,0); \
|
||||
\
|
||||
@@ -70,6 +71,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
dim_t n0 = bli_abs(n); \
|
||||
\
|
||||
ctype alpha_conj; \
|
||||
\
|
||||
@@ -78,14 +81,14 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
if ( incx == 1 ) \
|
||||
{ \
|
||||
PRAGMA_SIMD \
|
||||
for ( dim_t i = 0; i < n; ++i ) \
|
||||
for ( dim_t i = 0; i < n0; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scals)( alpha_conj, x[i] ); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( dim_t i = 0; i < n; ++i ) \
|
||||
for ( dim_t i = 0; i < n0; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scals)( alpha_conj, *x ); \
|
||||
\
|
||||
|
||||
Reference in New Issue
Block a user