mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Optimized DGEMV kernel and changed BLAS interface call
Details: - Optimized daxpyf kernel with fuse_factor=5 and iter_unroll=2. - Modified framework files of dgemv to remove dependency on cntx variable. - Updated cntx_init file of zen2 to choose optimized kernels. - Modified BLAS interface call for DGEMV to reduce framework overhread. - Currently these changes are applicable for zen2 configuration. They will be enabled for zen family processors in future. - Changed naming convention for new BLAS macros to indicate their use. - Added new optimized kernel for axpyf under zen2 folder. - Implemented basic GEMV kernel without using axpyv or axpyf. This kernel is chosen for small sizes. Change-Id: I4278d37e494854879c71499b8b9da8c5dbe3bf5b Signed-off-by: Meghana Vankadari <Meghana.Vankadari@amd.com> AMD-Internal: [CPUPL-885]
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -153,6 +154,7 @@ get-aocldtl-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
|
||||
get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \
|
||||
$(call load-var-for,CKVECFLAGS,$(1)) \
|
||||
$(call get-noopt-cflags-for,$(1)) \
|
||||
$(COMPSIMDFLAGS) \
|
||||
$(BUILD_CPPFLAGS) \
|
||||
$(BUILD_SYMFLAGS) \
|
||||
)
|
||||
|
||||
@@ -69,7 +69,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
4,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
@@ -128,7 +128,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
#endif
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 5, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
|
||||
50
frame/2/bli_l2_ker.h
Normal file
50
frame/2/bli_l2_ker.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Define template prototypes for level-2 kernels.
|
||||
//
|
||||
|
||||
// Note: Instead of defining function prototype macro templates and then
|
||||
// instantiating those macros to define the individual function prototypes,
|
||||
// we simply alias the official operations' prototypes as defined in
|
||||
// bli_l2_ker_prot.h.
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT GEMV_KER_PROT
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemv_ker_name )
|
||||
|
||||
54
frame/2/bli_l2_ker_prot.h
Normal file
54
frame/2/bli_l2_ker_prot.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Define template prototypes for level-1f kernels.
|
||||
//
|
||||
|
||||
#define GEMV_KER_PROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, inc_t lda, \
|
||||
ctype* restrict x, inc_t incx, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict y, inc_t incy, \
|
||||
cntx_t* restrict cntx \
|
||||
);
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
bli_init_once(); \
|
||||
\
|
||||
if( cntx == NULL) cntx = bli_gks_query_cntx(); \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* A1; \
|
||||
@@ -100,5 +105,72 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
void bli_dgemv_unf_var1
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* A1;
|
||||
double* x1;
|
||||
double* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_init_once();
|
||||
|
||||
if( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 8;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
bli_ddotxf_zen_int_8
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var1 )
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
bli_init_once(); \
|
||||
\
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* zero = PASTEMAC(ch,0); \
|
||||
@@ -127,5 +132,79 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
|
||||
void bli_dgemv_unf_var2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* A1;
|
||||
double* x1;
|
||||
double* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 5;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
bli_daxpyf_zen_int_5
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var2 )
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
|
||||
#endif
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -48,16 +48,11 @@ void bli_trsm_front
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
int i, j;
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
//int m = bli_obj_length(*b);
|
||||
//int n = bli_obj_width(*b);
|
||||
//float *L = a->buffer;
|
||||
// float *B = b->buffer;
|
||||
|
||||
#ifdef PRINT_SMALL_TRSM_INFO
|
||||
printf("Side:: %c\n", side ? 'R' : 'L');
|
||||
if (bli_obj_datatype(*a) == BLIS_FLOAT)
|
||||
|
||||
@@ -238,7 +238,7 @@ f77_int idamax_
|
||||
return f77_index;
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_ZEN2( amax, amaxv )
|
||||
INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( amax, amaxv )
|
||||
#endif
|
||||
|
||||
@@ -224,7 +224,7 @@ void daxpy_
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_ZEN2( axpy, axpyv )
|
||||
INSERT_GENTFUNC_BLAS_CZ( axpy, axpyv )
|
||||
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( axpy, axpyv )
|
||||
|
||||
@@ -236,7 +236,7 @@ void dcopy_
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv)
|
||||
INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS(copy, copyv)
|
||||
#endif
|
||||
|
||||
@@ -247,7 +247,7 @@ double ddot_
|
||||
return rho;
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCDOT_BLAS_ZEN2( dot, dotv )
|
||||
INSERT_GENTFUNCDOT_BLAS_CZ( dot, dotv )
|
||||
#else
|
||||
INSERT_GENTFUNCDOT_BLAS( dot, dotv )
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -53,90 +54,396 @@ void PASTEF77(ch,blasname) \
|
||||
ftype* y, const f77_int* incy \
|
||||
) \
|
||||
{ \
|
||||
trans_t blis_transa; \
|
||||
dim_t m0, n0; \
|
||||
dim_t m_y, n_x; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
inc_t rs_a, cs_a; \
|
||||
trans_t blis_transa; \
|
||||
dim_t m0, n0; \
|
||||
dim_t m_y, n_x; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
inc_t rs_a, cs_a; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
/* Perform BLAS parameter checking. */ \
|
||||
PASTEBLACHK(blasname) \
|
||||
( \
|
||||
MKSTR(ch), \
|
||||
MKSTR(blasname), \
|
||||
transa, \
|
||||
m, \
|
||||
n, \
|
||||
lda, \
|
||||
incx, \
|
||||
incy \
|
||||
); \
|
||||
/* Perform BLAS parameter checking. */ \
|
||||
PASTEBLACHK(blasname) \
|
||||
( \
|
||||
MKSTR(ch), \
|
||||
MKSTR(blasname), \
|
||||
transa, \
|
||||
m, \
|
||||
n, \
|
||||
lda, \
|
||||
incx, \
|
||||
incy \
|
||||
); \
|
||||
\
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
|
||||
\
|
||||
/* Convert/typecast negative values of m and n to zero. */ \
|
||||
bli_convert_blas_dim1( *m, m0 ); \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
/* Convert/typecast negative values of m and n to zero. */ \
|
||||
bli_convert_blas_dim1( *m, m0 ); \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* Determine the dimensions of x and y so we can adjust the increments,
|
||||
if necessary.*/ \
|
||||
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
|
||||
/* Determine the dimensions of x and y so we can adjust the increments,
|
||||
if necessary.*/ \
|
||||
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
|
||||
\
|
||||
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
|
||||
in a peculiar way. In these situations, BLAS returns without performing
|
||||
any action, even though most sane interpretations of gemv would have the
|
||||
the operation reduce to y := beta * y. Here, we catch those cases that
|
||||
BLAS would normally mishandle and emulate the BLAS exactly so as to
|
||||
provide "bug-for-bug" compatibility. Note that this extreme level of
|
||||
compatibility would not be as much of an issue if it weren't for the
|
||||
fact that some BLAS test suites actually test for these cases. Also, it
|
||||
should be emphasized that BLIS, if called natively, does NOT exhibit
|
||||
this quirky behavior; it will scale y by beta, as one would expect. */ \
|
||||
if ( m_y > 0 && n_x == 0 ) \
|
||||
{ \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
|
||||
in a peculiar way. In these situations, BLAS returns without performing
|
||||
any action, even though most sane interpretations of gemv would have the
|
||||
the operation reduce to y := beta * y. Here, we catch those cases that
|
||||
BLAS would normally mishandle and emulate the BLAS exactly so as to
|
||||
provide "bug-for-bug" compatibility. Note that this extreme level of
|
||||
compatibility would not be as much of an issue if it weren't for the
|
||||
fact that some BLAS test suites actually test for these cases. Also, it
|
||||
should be emphasized that BLIS, if called natively, does NOT exhibit
|
||||
this quirky behavior; it will scale y by beta, as one would expect. */ \
|
||||
if ( m_y > 0 && n_x == 0 ) \
|
||||
{ \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
\
|
||||
return; \
|
||||
} \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
|
||||
\
|
||||
/* Set the row and column strides of A. */ \
|
||||
rs_a = 1; \
|
||||
cs_a = *lda; \
|
||||
/* Set the row and column strides of A. */ \
|
||||
rs_a = 1; \
|
||||
cs_a = *lda; \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
blis_transa, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
m0, \
|
||||
n0, \
|
||||
(ftype*)alpha, \
|
||||
(ftype*)a, rs_a, cs_a, \
|
||||
x0, incx0, \
|
||||
(ftype*)beta, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
blis_transa, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
m0, \
|
||||
n0, \
|
||||
(ftype*)alpha, \
|
||||
(ftype*)a, rs_a, cs_a, \
|
||||
x0, incx0, \
|
||||
(ftype*)beta, \
|
||||
y0, incy0, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
}
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
void dgemv_
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const double* alpha,
|
||||
const double* a, const f77_int* lda,
|
||||
const double* x, const f77_int* incx,
|
||||
const double* beta,
|
||||
double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
trans_t blis_transa;
|
||||
dim_t m0, n0;
|
||||
dim_t m_y, n_x;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
inc_t rs_a, cs_a;
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemv)
|
||||
(
|
||||
MKSTR(d),
|
||||
MKSTR(gemv),
|
||||
transa,
|
||||
m,
|
||||
n,
|
||||
lda,
|
||||
incx,
|
||||
incy
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
|
||||
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
|
||||
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
|
||||
else
|
||||
{
|
||||
// See comment for bli_param_map_netlib_to_blis_side() above.
|
||||
//bli_check_error_code( BLIS_INVALID_TRANS );
|
||||
blis_transa = BLIS_NO_TRANSPOSE;
|
||||
}
|
||||
|
||||
/* Convert/typecast negative values of m and n to zero. */
|
||||
if ( *m < 0 ) m0 = ( dim_t )0;
|
||||
else m0 = ( dim_t )(*m);
|
||||
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* Determine the dimensions of x and y so we can adjust the increments,
|
||||
if necessary.*/
|
||||
if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; }
|
||||
else { m_y = n0; n_x = m0; }
|
||||
|
||||
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
|
||||
in a peculiar way. In these situations, BLAS returns without performing
|
||||
any action, even though most sane interpretations of gemv would have the
|
||||
the operation reduce to y := beta * y. Here, we catch those cases that
|
||||
BLAS would normally mishandle and emulate the BLAS exactly so as to
|
||||
provide "bug-for-bug" compatibility. Note that this extreme level of
|
||||
compatibility would not be as much of an issue if it weren't for the
|
||||
fact that some BLAS test suites actually test for these cases. Also, it
|
||||
should be emphasized that BLIS, if called natively, does NOT exhibit
|
||||
this quirky behavior; it will scale y by beta, as one would expect. */
|
||||
if ( m_y > 0 && n_x == 0 )
|
||||
{
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
x0 = ((double*)x) + (n_x-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((double*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((double*)y) + (m_y-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((double*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Set the row and column strides of A. */
|
||||
rs_a = 1;
|
||||
cs_a = *lda;
|
||||
|
||||
/* Call variants based on transpose value. */
|
||||
if(bli_does_notrans(blis_transa))
|
||||
{
|
||||
if((m0 <1200) && (n0 < 1200))
|
||||
{
|
||||
bli_dgemv_zen_basic
|
||||
(
|
||||
m0,
|
||||
n0,
|
||||
(double*) alpha,
|
||||
(double*)a, cs_a,
|
||||
x0, incx0,
|
||||
(double*) beta,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
//variant_2 is chosen for column-storage
|
||||
// and uses axpyf-based implementation
|
||||
bli_dgemv_unf_var2
|
||||
(
|
||||
blis_transa,
|
||||
BLIS_NO_CONJUGATE,
|
||||
m0,
|
||||
n0,
|
||||
(double*)alpha,
|
||||
(double*)a, rs_a, cs_a,
|
||||
x0, incx0,
|
||||
(double*)beta,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//var_1 is chosen for row-storage
|
||||
//and uses dotxf-based implementation
|
||||
bli_dgemv_unf_var1
|
||||
(
|
||||
blis_transa,
|
||||
BLIS_NO_CONJUGATE,
|
||||
m0,
|
||||
n0,
|
||||
(double*)alpha,
|
||||
(double*)a, rs_a, cs_a,
|
||||
x0, incx0,
|
||||
(double*)beta,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void sgemv_
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const float* alpha,
|
||||
const float* a, const f77_int* lda,
|
||||
const float* x, const f77_int* incx,
|
||||
const float* beta,
|
||||
float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
trans_t blis_transa;
|
||||
dim_t m0, n0;
|
||||
dim_t m_y, n_x;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
inc_t rs_a, cs_a;
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemv)
|
||||
(
|
||||
MKSTR(s),
|
||||
MKSTR(gemv),
|
||||
transa,
|
||||
m,
|
||||
n,
|
||||
lda,
|
||||
incx,
|
||||
incy
|
||||
);
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE;
|
||||
else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE;
|
||||
else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE;
|
||||
else
|
||||
{
|
||||
// See comment for bli_param_map_netlib_to_blis_side() above.
|
||||
//bli_check_error_code( BLIS_INVALID_TRANS );
|
||||
blis_transa = BLIS_NO_TRANSPOSE;
|
||||
}
|
||||
|
||||
/* Convert/typecast negative values of m and n to zero. */
|
||||
if ( *m < 0 ) m0 = ( dim_t )0;
|
||||
else m0 = ( dim_t )(*m);
|
||||
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* Determine the dimensions of x and y so we can adjust the increments,
|
||||
if necessary.*/
|
||||
if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; }
|
||||
else { m_y = n0; n_x = m0; }
|
||||
|
||||
/* BLAS handles cases where trans(A) has no columns, and x has no elements,
|
||||
in a peculiar way. In these situations, BLAS returns without performing
|
||||
any action, even though most sane interpretations of gemv would have the
|
||||
the operation reduce to y := beta * y. Here, we catch those cases that
|
||||
BLAS would normally mishandle and emulate the BLAS exactly so as to
|
||||
provide "bug-for-bug" compatibility. Note that this extreme level of
|
||||
compatibility would not be as much of an issue if it weren't for the
|
||||
fact that some BLAS test suites actually test for these cases. Also, it
|
||||
should be emphasized that BLIS, if called natively, does NOT exhibit
|
||||
this quirky behavior; it will scale y by beta, as one would expect. */
|
||||
if ( m_y > 0 && n_x == 0 )
|
||||
{
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
x0 = ((float*)x) + (n_x-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((float*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((float*)y) + (m_y-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((float*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Set the row and column strides of A. */
|
||||
rs_a = 1;
|
||||
cs_a = *lda;
|
||||
|
||||
/* Call variants based on transpose value. */
|
||||
if(bli_does_notrans(blis_transa))
|
||||
{
|
||||
bli_sgemv_unf_var2
|
||||
(
|
||||
blis_transa,
|
||||
BLIS_NO_CONJUGATE,
|
||||
m0,
|
||||
n0,
|
||||
(float*)alpha,
|
||||
(float*)a, rs_a, cs_a,
|
||||
x0, incx0,
|
||||
(float*)beta,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_sgemv_unf_var1
|
||||
(
|
||||
blis_transa,
|
||||
BLIS_NO_CONJUGATE,
|
||||
m0,
|
||||
n0,
|
||||
(float*)alpha,
|
||||
(float*)a, rs_a, cs_a,
|
||||
x0, incx0,
|
||||
(float*)beta,
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_CZ( gemv, gemv )
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( gemv, gemv )
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -203,7 +203,7 @@ void dscal_
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCSCAL_BLAS_ZEN2( scal, scalv )
|
||||
INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
|
||||
#else
|
||||
INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
|
||||
#endif
|
||||
|
||||
@@ -229,7 +229,7 @@ void dswap_
|
||||
// bli_finalize_auto();
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BLAS_ZEN2( swap, swapv )
|
||||
INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
|
||||
|
||||
#else
|
||||
INSERT_GENTFUNC_BLAS( swap, swapv )
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -244,9 +244,9 @@ CNTX_INIT_PROTS( generic )
|
||||
|
||||
// -- AMD64 architectures --
|
||||
|
||||
//#ifdef BLIS_KERNELS_ZEN2
|
||||
//#include "bli_kernels_zen2.h"
|
||||
//#endif
|
||||
#ifdef BLIS_KERNELS_ZEN2
|
||||
#include "bli_kernels_zen2.h"
|
||||
#endif
|
||||
#ifdef BLIS_KERNELS_ZEN
|
||||
#include "bli_kernels_zen.h"
|
||||
#endif
|
||||
|
||||
@@ -57,7 +57,7 @@ GENTFUNC( scomplex, c, blasname, blisname ) \
|
||||
GENTFUNC( dcomplex, z, blasname, blisname )
|
||||
|
||||
|
||||
#define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \
|
||||
#define INSERT_GENTFUNC_BLAS_CZ( blasname, blisname ) \
|
||||
\
|
||||
GENTFUNC( scomplex, c, blasname, blisname ) \
|
||||
GENTFUNC( dcomplex, z, blasname, blisname )
|
||||
@@ -82,7 +82,7 @@ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )
|
||||
|
||||
// -- Basic one-operand macro with conjugation (used only for dot, ger) --
|
||||
|
||||
#define INSERT_GENTFUNCDOT_BLAS_ZEN2( blasname, blisname ) \
|
||||
#define INSERT_GENTFUNCDOT_BLAS_CZ( blasname, blisname ) \
|
||||
\
|
||||
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \
|
||||
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
|
||||
@@ -125,7 +125,7 @@ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )
|
||||
// -- Extended two-operand macro (used only for scal) --
|
||||
|
||||
|
||||
#define INSERT_GENTFUNCSCAL_BLAS_ZEN2( blasname, blisname ) \
|
||||
#define INSERT_GENTFUNCSCAL_BLAS_CZ( blasname, blisname ) \
|
||||
\
|
||||
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
|
||||
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
|
||||
@@ -157,6 +157,14 @@ GENTFUNC( double, d, tfuncname ) \
|
||||
GENTFUNC( scomplex, c, tfuncname ) \
|
||||
GENTFUNC( dcomplex, z, tfuncname )
|
||||
|
||||
|
||||
#define INSERT_GENTFUNC_BASIC0_SCZ( tfuncname ) \
|
||||
\
|
||||
GENTFUNC( float, s, tfuncname ) \
|
||||
GENTFUNC( scomplex, c, tfuncname ) \
|
||||
GENTFUNC( dcomplex, z, tfuncname )
|
||||
|
||||
|
||||
// -- (one auxiliary argument) --
|
||||
|
||||
#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -87,6 +87,7 @@ extern "C" {
|
||||
#include "bli_l1v_ker_prot.h"
|
||||
#include "bli_l1f_ker_prot.h"
|
||||
#include "bli_l1m_ker_prot.h"
|
||||
#include "bli_l2_ker_prot.h"
|
||||
#include "bli_l3_ukr_prot.h"
|
||||
#include "bli_l3_sup_ker_prot.h"
|
||||
|
||||
|
||||
4
kernels/zen2/.gitignore
vendored
4
kernels/zen2/.gitignore
vendored
@@ -1,4 +0,0 @@
|
||||
# Ignore everything in this directory
|
||||
*
|
||||
# Except this file
|
||||
!.gitignore
|
||||
326
kernels/zen2/1f/bli_axpyf_zen_int_5.c
Normal file
326
kernels/zen2/1f/bli_axpyf_zen_int_5.c
Normal file
@@ -0,0 +1,326 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_daxpyf_zen_int_5
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t fuse_fac = 5;
|
||||
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 2;
|
||||
|
||||
dim_t i;
|
||||
|
||||
double* restrict a0;
|
||||
double* restrict a1;
|
||||
double* restrict a2;
|
||||
double* restrict a3;
|
||||
double* restrict a4;
|
||||
|
||||
double* restrict y0;
|
||||
|
||||
v4df_t chi0v, chi1v, chi2v, chi3v;
|
||||
v4df_t chi4v;
|
||||
|
||||
v4df_t a00v, a01v, a02v, a03v;
|
||||
v4df_t a04v;
|
||||
|
||||
v4df_t a10v, a11v, a12v, a13v;
|
||||
v4df_t a14v;
|
||||
|
||||
v4df_t y0v, y1v;
|
||||
|
||||
double chi0, chi1, chi2, chi3;
|
||||
double chi4;
|
||||
|
||||
// If either dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
|
||||
|
||||
// If b_n is not equal to the fusing factor, then perform the entire
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
|
||||
PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
|
||||
PASTEMAC(d,scals)( *alpha, alpha_chi1 );
|
||||
|
||||
bli_daxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
|
||||
PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
|
||||
PASTEMAC(d,scals)( *alpha, alpha_chi1 );
|
||||
|
||||
f
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we know that b_n is exactly equal to the fusing factor.
|
||||
|
||||
a0 = a + 0*lda;
|
||||
a1 = a + 1*lda;
|
||||
a2 = a + 2*lda;
|
||||
a3 = a + 3*lda;
|
||||
a4 = a + 4*lda;
|
||||
y0 = y;
|
||||
|
||||
chi0 = *( x + 0*incx );
|
||||
chi1 = *( x + 1*incx );
|
||||
chi2 = *( x + 2*incx );
|
||||
chi3 = *( x + 3*incx );
|
||||
chi4 = *( x + 4*incx );
|
||||
|
||||
|
||||
// Scale each chi scalar by alpha.
|
||||
PASTEMAC(d,scals)( *alpha, chi0 );
|
||||
PASTEMAC(d,scals)( *alpha, chi1 );
|
||||
PASTEMAC(d,scals)( *alpha, chi2 );
|
||||
PASTEMAC(d,scals)( *alpha, chi3 );
|
||||
PASTEMAC(d,scals)( *alpha, chi4 );
|
||||
|
||||
// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
|
||||
chi0v.v = _mm256_broadcast_sd( &chi0 );
|
||||
chi1v.v = _mm256_broadcast_sd( &chi1 );
|
||||
chi2v.v = _mm256_broadcast_sd( &chi2 );
|
||||
chi3v.v = _mm256_broadcast_sd( &chi3 );
|
||||
chi4v.v = _mm256_broadcast_sd( &chi4 );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
if ( inca == 1 && incy == 1 )
|
||||
{
|
||||
for ( i = 0; (i + 7) < m; i += 8 )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
|
||||
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
|
||||
a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
|
||||
|
||||
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
|
||||
a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
|
||||
|
||||
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
|
||||
a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
|
||||
|
||||
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
|
||||
a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
|
||||
|
||||
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
|
||||
a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
|
||||
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
|
||||
|
||||
y0 += n_iter_unroll * n_elem_per_reg;
|
||||
a0 += n_iter_unroll * n_elem_per_reg;
|
||||
a1 += n_iter_unroll * n_elem_per_reg;
|
||||
a2 += n_iter_unroll * n_elem_per_reg;
|
||||
a3 += n_iter_unroll * n_elem_per_reg;
|
||||
a4 += n_iter_unroll * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for( ; (i + 3) < m; i += 4 )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
|
||||
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
|
||||
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
|
||||
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
|
||||
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
|
||||
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
|
||||
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
|
||||
y0 += n_elem_per_reg;
|
||||
a0 += n_elem_per_reg;
|
||||
a1 += n_elem_per_reg;
|
||||
a2 += n_elem_per_reg;
|
||||
a3 += n_elem_per_reg;
|
||||
a4 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( ; (i + 0) < m ; ++i )
|
||||
{
|
||||
double y0c = *y0;
|
||||
|
||||
const double a0c = *a0;
|
||||
const double a1c = *a1;
|
||||
const double a2c = *a2;
|
||||
const double a3c = *a3;
|
||||
const double a4c = *a4;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += 1;
|
||||
a1 += 1;
|
||||
a2 += 1;
|
||||
a3 += 1;
|
||||
a4 += 1;
|
||||
y0 += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; (i + 0) < m ; ++i )
|
||||
{
|
||||
double y0c = *y0;
|
||||
|
||||
const double a0c = *a0;
|
||||
const double a1c = *a1;
|
||||
const double a2c = *a2;
|
||||
const double a3c = *a3;
|
||||
const double a4c = *a4;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += inca;
|
||||
a1 += inca;
|
||||
a2 += inca;
|
||||
a3 += inca;
|
||||
a4 += inca;
|
||||
y0 += incy;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
103
kernels/zen2/2/bli_gemv_zen_ref.c
Normal file
103
kernels/zen2/2/bli_gemv_zen_ref.c
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
/* This routine assumes that the matrix is stored in column-major order
|
||||
* and does not have transpose.
|
||||
*/
|
||||
void bli_dgemv_zen_ref_c
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* restrict x0 = x;
|
||||
double* restrict y0 = y;
|
||||
double* restrict a0 = a;
|
||||
|
||||
dim_t i,j;
|
||||
|
||||
if((incy == 1) && (incx == 1))
|
||||
{
|
||||
|
||||
/* n==0 case will be handled in the framework,
|
||||
where it returns immediately.
|
||||
* For the first column of A, execute Y = alpha*A*X + beta*Y
|
||||
*/
|
||||
double x0_val = (x0[0]);
|
||||
PRAGMA_SIMD
|
||||
for(i = 0; i < m; i++)
|
||||
{
|
||||
(y0[i]) = (a0[i]) * (x0_val) * (*alpha) + y0[i] * (*beta);
|
||||
}
|
||||
a0 += lda;
|
||||
|
||||
/* For remaining columns of A, execute, Y = alpha*A*X + Y; */
|
||||
for(j = 1; j < n; j++)
|
||||
{
|
||||
const double xp = (x0[j]);
|
||||
PRAGMA_SIMD
|
||||
for(i = 0; i < m; i++)
|
||||
{
|
||||
(y0[i]) += (a0[i]) * xp * (*alpha);
|
||||
}
|
||||
a0 += lda;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
for(j = 0; j < m; j++)
|
||||
PASTEMAC(d,scals)(*beta, *(y0+j*incy))
|
||||
|
||||
for(j = 0; j < n; j++)
|
||||
{
|
||||
const double xp = *(x0+j*incx);
|
||||
for(i = 0; i < m; i++)
|
||||
{
|
||||
*(y0 + i*incy) += (a0[j*lda+i]) * xp * (*alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
45
kernels/zen2/bli_kernels_zen2.h
Normal file
45
kernels/zen2/bli_kernels_zen2.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// -- level-1f --
|
||||
AXPYF_KER_PROT( double, d, axpyf_zen_int_5 )
|
||||
|
||||
// -- level-2 --
|
||||
|
||||
//gemv(scalar code)
|
||||
GEMV_KER_PROT( double, d, gemv_zen_basic )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user