Optimized DGEMV kernel and changed BLAS interface call

Details:
- Optimized daxpyf kernel with fuse_factor=5 and iter_unroll=2.
- Modified framework files of dgemv to remove dependency on cntx variable.
- Updated cntx_init file of zen2 to choose optimized kernels.
- Modified BLAS interface call for DGEMV to reduce framework overhread.
- Currently these changes are applicable for zen2 configuration.
  They will be enabled for zen family processors in future.
- Changed naming convention for new BLAS macros to indicate their use.
- Added new optimized kernel for axpyf under zen2 folder.
- Implemented basic GEMV kernel without using axpyv or axpyf.
  This kernel is chosen for small sizes.

Change-Id: I4278d37e494854879c71499b8b9da8c5dbe3bf5b
Signed-off-by: Meghana Vankadari <Meghana.Vankadari@amd.com>
AMD-Internal: [CPUPL-885]
This commit is contained in:
Meghana Vankadari
2020-05-14 11:04:05 +05:30
parent af1ad806f2
commit 4fcc4e499d
21 changed files with 1138 additions and 100 deletions

View File

@@ -5,6 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2020, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -153,6 +154,7 @@ get-aocldtl-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
$(call load-var-for,CKVECFLAGS,$(1)) \
$(call get-noopt-cflags-for,$(1)) \
$(COMPSIMDFLAGS) \
$(BUILD_CPPFLAGS) \
$(BUILD_SYMFLAGS) \
)

View File

@@ -69,7 +69,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
@@ -128,7 +128,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 5, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache

50
frame/2/bli_l2_ker.h Normal file
View File

@@ -0,0 +1,50 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-2 kernels.
//
// Note: Instead of defining function prototype macro templates and then
// instantiating those macros to define the individual function prototypes,
// we simply alias the official operations' prototypes as defined in
// bli_l2_ker_prot.h.
#undef GENTPROT
#define GENTPROT GEMV_KER_PROT
INSERT_GENTPROT_BASIC0( gemv_ker_name )

54
frame/2/bli_l2_ker_prot.h Normal file
View File

@@ -0,0 +1,54 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-1f kernels.
//
#define GEMV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t m, \
dim_t n, \
ctype* restrict alpha, \
ctype* restrict a, inc_t lda, \
ctype* restrict x, inc_t incx, \
ctype* restrict beta, \
ctype* restrict y, inc_t incy, \
cntx_t* restrict cntx \
);

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \
cntx_t* cntx \
) \
{ \
bli_init_once(); \
\
if( cntx == NULL) cntx = bli_gks_query_cntx(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
ctype* A1; \
@@ -100,5 +105,72 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
#ifdef BLIS_CONFIG_ZEN2
void bli_dgemv_unf_var1
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
double* A1;
double* x1;
double* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_init_once();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
bli_ddotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
}
INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var1 )
#else
INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \
cntx_t* cntx \
) \
{ \
bli_init_once(); \
\
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
ctype* zero = PASTEMAC(ch,0); \
@@ -127,5 +132,79 @@ void PASTEMAC(ch,varname) \
} \
}
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
#ifdef BLIS_CONFIG_ZEN2
void bli_dgemv_unf_var2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
double* A1;
double* x1;
double* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is hadled by scalv internally */
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 5;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_daxpyf_zen_int_5
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
NULL
);
}
}
INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var2 )
#else
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -48,16 +48,11 @@ void bli_trsm_front
)
{
bli_init_once();
int i, j;
obj_t a_local;
obj_t b_local;
obj_t c_local;
//int m = bli_obj_length(*b);
//int n = bli_obj_width(*b);
//float *L = a->buffer;
// float *B = b->buffer;
#ifdef PRINT_SMALL_TRSM_INFO
printf("Side:: %c\n", side ? 'R' : 'L');
if (bli_obj_datatype(*a) == BLIS_FLOAT)

View File

@@ -238,7 +238,7 @@ f77_int idamax_
return f77_index;
}
INSERT_GENTFUNC_BLAS_ZEN2( amax, amaxv )
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
#else
INSERT_GENTFUNC_BLAS( amax, amaxv )
#endif

View File

@@ -224,7 +224,7 @@ void daxpy_
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_ZEN2( axpy, axpyv )
INSERT_GENTFUNC_BLAS_CZ( axpy, axpyv )
#else
INSERT_GENTFUNC_BLAS( axpy, axpyv )

View File

@@ -236,7 +236,7 @@ void dcopy_
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv)
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
#else
INSERT_GENTFUNC_BLAS(copy, copyv)
#endif

View File

@@ -247,7 +247,7 @@ double ddot_
return rho;
}
INSERT_GENTFUNCDOT_BLAS_ZEN2( dot, dotv )
INSERT_GENTFUNCDOT_BLAS_CZ( dot, dotv )
#else
INSERT_GENTFUNCDOT_BLAS( dot, dotv )
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -53,90 +54,396 @@ void PASTEF77(ch,blasname) \
ftype* y, const f77_int* incy \
) \
{ \
trans_t blis_transa; \
dim_t m0, n0; \
dim_t m_y, n_x; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
inc_t rs_a, cs_a; \
trans_t blis_transa; \
dim_t m0, n0; \
dim_t m_y, n_x; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
inc_t rs_a, cs_a; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
m, \
n, \
lda, \
incx, \
incy \
); \
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
m, \
n, \
lda, \
incx, \
incy \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
/* Convert/typecast negative values of m and n to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
/* Convert/typecast negative values of m and n to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
\
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/ \
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/ \
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */ \
if ( m_y > 0 && n_x == 0 ) \
{ \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */ \
if ( m_y > 0 && n_x == 0 ) \
{ \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
\
return; \
} \
return; \
} \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
/* Set the row and column strides of A. */ \
rs_a = 1; \
cs_a = *lda; \
/* Set the row and column strides of A. */ \
rs_a = 1; \
cs_a = *lda; \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_transa, \
BLIS_NO_CONJUGATE, \
m0, \
n0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
x0, incx0, \
(ftype*)beta, \
y0, incy0, \
NULL, \
NULL \
); \
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_transa, \
BLIS_NO_CONJUGATE, \
m0, \
n0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
x0, incx0, \
(ftype*)beta, \
y0, incy0, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
#ifdef BLIS_CONFIG_ZEN2
void dgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const double* alpha,
const double* a, const f77_int* lda,
const double* x, const f77_int* incx,
const double* beta,
double* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(d),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((double*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((double*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((double*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((double*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
if((m0 <1200) && (n0 < 1200))
{
bli_dgemv_zen_basic
(
m0,
n0,
(double*) alpha,
(double*)a, cs_a,
x0, incx0,
(double*) beta,
y0, incy0,
NULL
);
}
else
{
//variant_2 is chosen for column-storage
// and uses axpyf-based implementation
bli_dgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
}
else
{
//var_1 is chosen for row-storage
//and uses dotxf-based implementation
bli_dgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(double*)alpha,
(double*)a, rs_a, cs_a,
x0, incx0,
(double*)beta,
y0, incy0,
NULL
);
}
}
void sgemv_
(
const f77_char* transa,
const f77_int* m,
const f77_int* n,
const float* alpha,
const float* a, const f77_int* lda,
const float* x, const f77_int* incx,
const float* beta,
float* y, const f77_int* incy
)
{
trans_t blis_transa;
dim_t m0, n0;
dim_t m_y, n_x;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
inc_t rs_a, cs_a;
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemv)
(
MKSTR(s),
MKSTR(gemv),
transa,
m,
n,
lda,
incx,
incy
);
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.
//bli_check_error_code( BLIS_INVALID_TRANS );
blis_transa = BLIS_NO_TRANSPOSE;
}
/* Convert/typecast negative values of m and n to zero. */
if ( *m < 0 ) m0 = ( dim_t )0;
else m0 = ( dim_t )(*m);
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/
if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; }
else { m_y = n0; n_x = m0; }
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
in a peculiar way. In these situations, BLAS returns without performing
any action, even though most sane interpretations of gemv would have the
the operation reduce to y := beta * y. Here, we catch those cases that
BLAS would normally mishandle and emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be as much of an issue if it weren't for the
fact that some BLAS test suites actually test for these cases. Also, it
should be emphasized that BLIS, if called natively, does NOT exhibit
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
/* Finalize BLIS. */
// bli_finalize_auto();
return;
}
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
x0 = ((float*)x) + (n_x-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((float*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((float*)y) + (m_y-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((float*)y);
incy0 = ( inc_t )(*incy);
}
/* Set the row and column strides of A. */
rs_a = 1;
cs_a = *lda;
/* Call variants based on transpose value. */
if(bli_does_notrans(blis_transa))
{
bli_sgemv_unf_var2
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
else
{
bli_sgemv_unf_var1
(
blis_transa,
BLIS_NO_CONJUGATE,
m0,
n0,
(float*)alpha,
(float*)a, rs_a, cs_a,
x0, incx0,
(float*)beta,
y0, incy0,
NULL
);
}
}
INSERT_GENTFUNC_BLAS_CZ( gemv, gemv )
#else
INSERT_GENTFUNC_BLAS( gemv, gemv )
#endif
#endif

View File

@@ -203,7 +203,7 @@ void dscal_
// bli_finalize_auto();
}
INSERT_GENTFUNCSCAL_BLAS_ZEN2( scal, scalv )
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
#else
INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
#endif

View File

@@ -229,7 +229,7 @@ void dswap_
// bli_finalize_auto();
}
INSERT_GENTFUNC_BLAS_ZEN2( swap, swapv )
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
#else
INSERT_GENTFUNC_BLAS( swap, swapv )

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -244,9 +244,9 @@ CNTX_INIT_PROTS( generic )
// -- AMD64 architectures --
//#ifdef BLIS_KERNELS_ZEN2
//#include "bli_kernels_zen2.h"
//#endif
#ifdef BLIS_KERNELS_ZEN2
#include "bli_kernels_zen2.h"
#endif
#ifdef BLIS_KERNELS_ZEN
#include "bli_kernels_zen.h"
#endif

View File

@@ -57,7 +57,7 @@ GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )
#define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \
#define INSERT_GENTFUNC_BLAS_CZ( blasname, blisname ) \
\
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )
@@ -82,7 +82,7 @@ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )
// -- Basic one-operand macro with conjugation (used only for dot, ger) --
#define INSERT_GENTFUNCDOT_BLAS_ZEN2( blasname, blisname ) \
#define INSERT_GENTFUNCDOT_BLAS_CZ( blasname, blisname ) \
\
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
@@ -125,7 +125,7 @@ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )
// -- Extended two-operand macro (used only for scal) --
#define INSERT_GENTFUNCSCAL_BLAS_ZEN2( blasname, blisname ) \
#define INSERT_GENTFUNCSCAL_BLAS_CZ( blasname, blisname ) \
\
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
@@ -157,6 +157,14 @@ GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )
#define INSERT_GENTFUNC_BASIC0_SCZ( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )
// -- (one auxiliary argument) --
#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -87,6 +87,7 @@ extern "C" {
#include "bli_l1v_ker_prot.h"
#include "bli_l1f_ker_prot.h"
#include "bli_l1m_ker_prot.h"
#include "bli_l2_ker_prot.h"
#include "bli_l3_ukr_prot.h"
#include "bli_l3_sup_ker_prot.h"

View File

@@ -1,4 +0,0 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore

View File

@@ -0,0 +1,326 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
/* Union data structure to access AVX registers
One 256-bit AVX register holds 8 SP elements. */
typedef union
{
__m256 v;
float f[8] __attribute__((aligned(64)));
} v8sf_t;
/* Union data structure to access AVX registers
* One 256-bit AVX register holds 4 DP elements. */
typedef union
{
__m256d v;
double d[4] __attribute__((aligned(64)));
} v4df_t;
// -----------------------------------------------------------------------------
void bli_daxpyf_zen_int_5
(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
cntx_t* restrict cntx
)
{
const dim_t fuse_fac = 5;
const dim_t n_elem_per_reg = 4;
const dim_t n_iter_unroll = 2;
dim_t i;
double* restrict a0;
double* restrict a1;
double* restrict a2;
double* restrict a3;
double* restrict a4;
double* restrict y0;
v4df_t chi0v, chi1v, chi2v, chi3v;
v4df_t chi4v;
v4df_t a00v, a01v, a02v, a03v;
v4df_t a04v;
v4df_t a10v, a11v, a12v, a13v;
v4df_t a14v;
v4df_t y0v, y1v;
double chi0, chi1, chi2, chi3;
double chi4;
// If either dimension is zero, or if alpha is zero, return early.
if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
// If b_n is not equal to the fusing factor, then perform the entire
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_ZEN2
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
PASTEMAC(d,scals)( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
{
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
PASTEMAC(d,scals)( *alpha, alpha_chi1 );
f
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#endif
return;
}
// At this point, we know that b_n is exactly equal to the fusing factor.
a0 = a + 0*lda;
a1 = a + 1*lda;
a2 = a + 2*lda;
a3 = a + 3*lda;
a4 = a + 4*lda;
y0 = y;
chi0 = *( x + 0*incx );
chi1 = *( x + 1*incx );
chi2 = *( x + 2*incx );
chi3 = *( x + 3*incx );
chi4 = *( x + 4*incx );
// Scale each chi scalar by alpha.
PASTEMAC(d,scals)( *alpha, chi0 );
PASTEMAC(d,scals)( *alpha, chi1 );
PASTEMAC(d,scals)( *alpha, chi2 );
PASTEMAC(d,scals)( *alpha, chi3 );
PASTEMAC(d,scals)( *alpha, chi4 );
// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
chi0v.v = _mm256_broadcast_sd( &chi0 );
chi1v.v = _mm256_broadcast_sd( &chi1 );
chi2v.v = _mm256_broadcast_sd( &chi2 );
chi3v.v = _mm256_broadcast_sd( &chi3 );
chi4v.v = _mm256_broadcast_sd( &chi4 );
// If there are vectorized iterations, perform them with vector
// instructions.
if ( inca == 1 && incy == 1 )
{
for ( i = 0; (i + 7) < m; i += 8 )
{
// Load the input values.
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
// perform : y += alpha * x;
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
// Store the output.
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
y0 += n_iter_unroll * n_elem_per_reg;
a0 += n_iter_unroll * n_elem_per_reg;
a1 += n_iter_unroll * n_elem_per_reg;
a2 += n_iter_unroll * n_elem_per_reg;
a3 += n_iter_unroll * n_elem_per_reg;
a4 += n_iter_unroll * n_elem_per_reg;
}
for( ; (i + 3) < m; i += 4 )
{
// Load the input values.
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
// perform : y += alpha * x;
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
// Store the output.
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
y0 += n_elem_per_reg;
a0 += n_elem_per_reg;
a1 += n_elem_per_reg;
a2 += n_elem_per_reg;
a3 += n_elem_per_reg;
a4 += n_elem_per_reg;
}
// If there are leftover iterations, perform them with scalar code.
for ( ; (i + 0) < m ; ++i )
{
double y0c = *y0;
const double a0c = *a0;
const double a1c = *a1;
const double a2c = *a2;
const double a3c = *a3;
const double a4c = *a4;
y0c += chi0 * a0c;
y0c += chi1 * a1c;
y0c += chi2 * a2c;
y0c += chi3 * a3c;
y0c += chi4 * a4c;
*y0 = y0c;
a0 += 1;
a1 += 1;
a2 += 1;
a3 += 1;
a4 += 1;
y0 += 1;
}
}
else
{
for ( i = 0; (i + 0) < m ; ++i )
{
double y0c = *y0;
const double a0c = *a0;
const double a1c = *a1;
const double a2c = *a2;
const double a3c = *a3;
const double a4c = *a4;
y0c += chi0 * a0c;
y0c += chi1 * a1c;
y0c += chi2 * a2c;
y0c += chi3 * a3c;
y0c += chi4 * a4c;
*y0 = y0c;
a0 += inca;
a1 += inca;
a2 += inca;
a3 += inca;
a4 += inca;
y0 += incy;
}
}
}

View File

@@ -0,0 +1,103 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
/* This routine assumes that the matrix is stored in column-major order
* and does not have transpose.
*/
void bli_dgemv_zen_ref_c
(
dim_t m,
dim_t n,
double* restrict alpha,
double* restrict a, inc_t lda,
double* restrict x, inc_t incx,
double* restrict beta,
double* restrict y, inc_t incy,
cntx_t* restrict cntx
)
{
double* restrict x0 = x;
double* restrict y0 = y;
double* restrict a0 = a;
dim_t i,j;
if((incy == 1) && (incx == 1))
{
/* n==0 case will be handled in the framework,
where it returns immediately.
* For the first column of A, execute Y = alpha*A*X + beta*Y
*/
double x0_val = (x0[0]);
PRAGMA_SIMD
for(i = 0; i < m; i++)
{
(y0[i]) = (a0[i]) * (x0_val) * (*alpha) + y0[i] * (*beta);
}
a0 += lda;
/* For remaining columns of A, execute, Y = alpha*A*X + Y; */
for(j = 1; j < n; j++)
{
const double xp = (x0[j]);
PRAGMA_SIMD
for(i = 0; i < m; i++)
{
(y0[i]) += (a0[i]) * xp * (*alpha);
}
a0 += lda;
}
}
else
{
for(j = 0; j < m; j++)
PASTEMAC(d,scals)(*beta, *(y0+j*incy))
for(j = 0; j < n; j++)
{
const double xp = *(x0+j*incx);
for(i = 0; i < m; i++)
{
*(y0 + i*incy) += (a0[j*lda+i]) * xp * (*alpha);
}
}
}
return;
}

View File

@@ -0,0 +1,45 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- level-1f --
AXPYF_KER_PROT( double, d, axpyf_zen_int_5 )
// -- level-2 --
//gemv(scalar code)
GEMV_KER_PROT( double, d, gemv_zen_basic )