DGEMV Optimizations for NO_TRANSPOSE Cases

- AVX512 specific DGEMV native kernels are added for Zen4/5
  architectures to handle the NO_TRANSPOSE cases and are independent of
  the AXPYF fused kernels.
- The following set of kernels biased towards the n-dimension perform
  beta scaling of y vector within the kernel itself and handle cases
  where n is less than 5:
    - bli_dgemv_n_zen_int_32x8n_avx512( ... )
    - bli_dgemv_n_zen_int_32x4n_avx512( ... )
    - bli_dgemv_n_zen_int_32x2n_avx512( ... )
    - bli_dgemv_n_zen_int_32x1n_avx512( ... )
- The bli_dgemv_n_zen_int_16mx8_avx512( ... ) kernel is biased towards the
  m-dimension; for this kernel, beta scaling is handled beforehand
  within the framework.
- Added unit-tests for the new kernels.
- The AVX2 path for Zen/2/3 architectures still follows the old approach of
  using the fused kernel, namely AXPYF, to perform the GEMV operation.

AMD-Internal: [CPUPL-5560]
Change-Id: I22bc2a865cd28b9cdcb383e17d1ff38bdd28de79
This commit is contained in:
Arnav Sharma
2024-10-04 14:44:21 +05:30
committed by Arnav Sharma
parent 615789e196
commit 25e59fcbb9
10 changed files with 4633 additions and 103 deletions

3
.gitignore vendored
View File

@@ -65,3 +65,6 @@ bin/*
*.exe
.vscode
# Gtestsuite build files
gtestsuite/build/*

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -37,6 +38,9 @@
// Define function types.
#include "bli_l2_ft_unb.h"
// Define kernel function types for level-2 kernels.
#include "bli_l2_ft_ker.h"
// Prototype object APIs (expert and non-expert).
#include "bli_oapi_ex.h"
#include "bli_l2_oapi.h"

60
frame/2/bli_l2_ft_ker.h Normal file
View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define kernel function types for level-2 kernels.
//
// gemv
//
// GENTDEF( ctype, ch, opname, tsuf ) expands to a typedef of a pointer to a
// function returning void. The type name is built by
// PASTECH3( ch, opname, _ker, tsuf ); PASTECH3 is defined outside this file
// (presumably plain token pasting, giving dgemv_ker_ft for the instantiation
// below -- confirm against the macro definition).
//
// Arguments of the generated function type, in order:
//   conja, conjx   - conj_t flags for A and x.
//   m, n           - dim_t dimensions.
//   alpha          - pointer to a ctype scalar.
//   a, rs_a, cs_a  - pointer to A with its row and column strides.
//   x, incx        - pointer to x with its increment.
//   beta           - pointer to a ctype scalar.
//   y, incy        - pointer to y with its increment.
//   cntx           - restrict-qualified pointer to a cntx_t.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* restrict cntx \
);
// INSERT_GENTDEF( gemv )
// Currently only generating the function type for double datatype.
// The generic INSERT_GENTDEF expansion above is intentionally left commented
// out; only the ( double, d ) instantiation is emitted.
GENTDEF( double, d, gemv, _ft )

View File

@@ -257,35 +257,15 @@ void bli_dgemv_unf_var2
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
double* A1;
double* x1;
dim_t i;
dim_t f, b_fuse;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
// Memory pool declarations for packing vector Y.
mem_t mem_bufY;
rntm_t rntm;
double* y_temp = y;
inc_t temp_incy = incy;
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
/*
Fatbinary config amdzen when run on non-AMD X86 will query for
@@ -294,6 +274,9 @@ void bli_dgemv_unf_var2
*/
arch_t id = bli_arch_query_id();
// b_fuse stores the fusing factor for AXPYF kernel.
dim_t b_fuse;
/*
Function pointer declaration for the functions
that will be used by this API
@@ -303,73 +286,91 @@ void bli_dgemv_unf_var2
dscalv_ker_ft scalv_kr_ptr; // DSCALV
dcopyv_ker_ft copyv_kr_ptr; // DCOPYV
switch (id)
{
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
bli_dgemv_n_avx512(
transa,
conjx,
m,
n,
alpha,
a, rs_a, cs_a,
x, incx,
beta,
y, incy,
cntx
);
return;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
bli_dgemv_n_avx2(
transa,
conjx,
m,
n,
alpha,
a, rs_a, cs_a,
x, incx,
beta,
y, incy,
cntx
);
return;
default:
// For non-Zen architectures, query the context if it is NULL
if (cntx == NULL) cntx = bli_gks_query_cntx();
/*
Query the context for the kernel function pointers for
AXPYF, SCALV, COPYV and corresponding fusing
factor of AXPYF kernel
*/
axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_AXPYF_KER, cntx);
b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx);
scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
}
double* A1;
double* x1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
// Memory pool declarations for packing vector Y.
mem_t mem_bufY;
rntm_t rntm;
double* y_temp = y;
inc_t temp_incy = incy;
/*
Boolean to check if the y has been packed
and memory needs to be freed in the end
*/
bool is_y_temp_buf_created = FALSE;
switch (id)
{
case BLIS_ARCH_ZEN5:
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
/*
Assign the AVX512 based kernel function pointers for
AXPYF, SCALV, COPYV and corresponding fusing
factor of DAXPYF kernel
*/
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_elem, &n_iter, &rs_at, &cs_at );
axpyf_kr_ptr = bli_daxpyf_zen_int_avx512;
b_fuse = 32;
scalv_kr_ptr = bli_dscalv_zen_int_avx512;
copyv_kr_ptr = bli_dcopyv_zen_int;
break;
#endif
case BLIS_ARCH_ZEN:
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
/*
Assign the AVX2 based kernel function pointers for
AXPYF, SCALV, COPYV and corresponding fusing
factor of DAXPYF kernel
*/
axpyf_kr_ptr = bli_daxpyf_zen_int_8;
b_fuse = 8;
scalv_kr_ptr = bli_dscalv_zen_int10;
copyv_kr_ptr = bli_dcopyv_zen_int;
break;
default:
// For non-Zen architectures, query the context if it is NULL
if(cntx == NULL) cntx = bli_gks_query_cntx();
/*
Query the context for the kernel function pointers for
AXPYF, SCALV, COPYV and corresponding fusing
factor of AXPYF kernel
*/
axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_AXPYF_KER, cntx);
b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx);
scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
}
conja = bli_extract_conj( transa );
/*
If alpha is equal to zero, y is only scaled by beta and returned.
In this case, packing and unpacking y will be costly and it is
avoided.
*/
if ( (incy > 1) && (!bli_deq0( *alpha )))
if ( (incy != 1) && (!bli_deq0( *alpha )))
{
/*
Initialize mem pool buffer to NULL and size to 0
@@ -398,13 +399,16 @@ void bli_dgemv_unf_var2
/*acquire a Buffer(n_elem*size(double)) from the memory broker
and save the associated mem_t entry to mem_bufY.*/
bli_pba_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufY);
bli_pba_acquire_m
(
&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufY
);
/*Continue packing Y if buffer memory is allocated*/
if ((bli_mem_is_alloc( &mem_bufY )))
if ( bli_mem_is_alloc( &mem_bufY ) )
{
y_temp = bli_mem_buffer(&mem_bufY);
@@ -455,23 +459,23 @@ void bli_dgemv_unf_var2
for (i = 0; i < n_iter; i += f)
{
f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse);
f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse);
A1 = a + (i * cs_at);
x1 = x + (i * incx);
A1 = a + (i * cs_at);
x1 = x + (i * incx);
axpyf_kr_ptr
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y_temp, temp_incy,
cntx
);
axpyf_kr_ptr
(
conja,
conjx,
n_elem,
f,
alpha,
A1, rs_at, cs_at,
x1, incx,
y_temp, temp_incy,
cntx
);
}
if (is_y_temp_buf_created)
@@ -1108,6 +1112,3 @@ void bli_cgemv_unf_var2
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}

View File

@@ -322,7 +322,7 @@ void dgemv_blis_impl
* If the matrix dimensions are within 8x8 then calculate the result
* using DGEMV Reference kernel.
*/
if ( m0 < 8 && n0 < 8 )
if ( (m0 < 8 && n0 < 8) )
{
bli_dgemv_zen_ref
(

View File

@@ -104,6 +104,54 @@ TEST_P( dgemvGeneric, API )
test_gemv<T>( storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, is_memory_test );
}
// Unit-tests for NO_TRANSPOSE m-biased kernels
// Instantiates the dgemvGeneric fixture under the name dgemv_n_m. m is fixed
// at 47 while n sweeps 1 through 8 plus 16 and 44; every combination of the
// listed storage, transa, alpha, beta, incx, lda increment and memory-test
// values is generated by ::testing::Combine.
// NOTE(review): transa also sweeps 'c' and 't', so this suite exercises more
// than the NO_TRANSPOSE path named above -- confirm that this is intended.
// Row-major storage ('r') is only included when TEST_BLAS_LIKE is not defined.
INSTANTIATE_TEST_SUITE_P(
dgemv_n_m,
dgemvGeneric,
::testing::Combine(
::testing::Values('c'
#ifndef TEST_BLAS_LIKE
,'r'
#endif
), // storage format
::testing::Values('n', 'c', 't'), // transa
::testing::Values('n'), // conjx
::testing::Values( 47 ), // m
::testing::Values( 1, 2, 3, 4, 5, 6, 7, 8, 16, 44 ), // n
::testing::Values( 0, 1, 2 ), // alpha
::testing::Values( 0, 1, 2 ), // beta
::testing::Values(gtint_t(1), gtint_t(3), gtint_t(-1) ), // stride size for x
::testing::Values(gtint_t(1) ), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7) ), // increment to the leading dim of a
::testing::Values(false, true) // is_memory_test
),
::gemvGenericPrint<T>()
);
// Unit-tests for NO_TRANSPOSE n-biased kernels
// Instantiates the dgemvGeneric fixture under the name dgemv_n_n. m is fixed
// at 95 while n takes the small values 1, 2, 3, 4 and additionally 15; every
// combination of the listed storage, transa, alpha, beta, incx, lda increment
// and memory-test values is generated by ::testing::Combine.
// NOTE(review): transa also sweeps 'c' and 't', so this suite exercises more
// than the NO_TRANSPOSE path named above -- confirm that this is intended.
// Row-major storage ('r') is only included when TEST_BLAS_LIKE is not defined.
INSTANTIATE_TEST_SUITE_P(
dgemv_n_n,
dgemvGeneric,
::testing::Combine(
::testing::Values('c'
#ifndef TEST_BLAS_LIKE
,'r'
#endif
), // storage format
::testing::Values('n', 'c', 't'), // transa
::testing::Values('n'), // conjx
::testing::Values( 95 ), // m
::testing::Values( 1, 2, 3, 4, 15 ), // n
::testing::Values( 0, 1, 2 ), // alpha
::testing::Values( 0, 1, 2 ), // beta
::testing::Values(gtint_t(1), gtint_t(3), gtint_t(-1) ), // stride size for x
::testing::Values(gtint_t(1) ), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7) ), // increment to the leading dim of a
::testing::Values(false, true) // is_memory_test
),
::gemvGenericPrint<T>()
);
// Black box testing.
INSTANTIATE_TEST_SUITE_P(
BlackboxSmall,

View File

@@ -0,0 +1,228 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/**
 * bli_dgemv_n_avx2(...) handles cases where op(A) = NO_TRANSPOSE for Zen/2/3
 * architectures and is based on the previous approach of using the fused
 * kernels, namely AXPYF, to perform the GEMV operation.
 *
 * Outline:
 *   1. y is scaled by beta. When incy != 1 and alpha != 0, the scaled values
 *      are written to a unit-stride scratch buffer obtained from the memory
 *      pool (SCAL2V). In every other case, including the case where the
 *      scratch buffer could not be obtained, y is scaled in place (SCALV).
 *   2. If alpha == 0 the routine returns right after the scaling step.
 *   3. The n0 columns of op(A) are processed in groups of at most 8 columns,
 *      each group being passed to the DAXPYF kernel together with the
 *      matching slice of x; the kernel updates y (or the scratch buffer).
 *   4. If the scratch buffer was used, its contents are copied back to y
 *      (COPYV) and the buffer is returned to the pool.
 *
 * Parameters:
 *   transa         - trans/conj descriptor of A; used to derive the effective
 *                    dimensions and strides and the conjugation flag for A.
 *   conjx          - conjugation flag for x, forwarded to the AXPYF kernel.
 *   m, n           - dimensions of A before transa is applied.
 *   alpha, beta    - pointers to the two scalars.
 *   a, rs_a, cs_a  - matrix A with its row and column strides.
 *   x, incx        - vector x with its increment.
 *   y, incy        - vector y (read and written) with its increment.
 *   cntx           - context pointer; may be NULL. It is queried from the
 *                    global kernel structure only on the packing path, where
 *                    the SCAL2V kernel is looked up from it.
 */
void bli_dgemv_n_avx2
     (
       trans_t transa,
       conj_t  conjx,
       dim_t   m,
       dim_t   n,
       double* alpha,
       double* a, inc_t rs_a, inc_t cs_a,
       double* x, inc_t incx,
       double* beta,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_4 );

    double* A1;
    double* x1;
    dim_t   i;
    dim_t   f, b_fuse;
    dim_t   m0, n0;
    inc_t   rs_at, cs_at;
    conj_t  conja;

    // Memory pool declarations for packing vector Y.
    mem_t   mem_bufY;
    rntm_t  rntm;

    // Until packing succeeds, the working vector is y itself.
    double* y_temp    = y;
    inc_t   temp_incy = incy;

    // Boolean to check if y vector is packed and memory needs to be freed.
    bool is_y_temp_buf_created = FALSE;

    // Update dimensions and strides based on op(A).
    bli_set_dims_incs_with_trans( transa,
                                  m, n, rs_a, cs_a,
                                  &m0, &n0, &rs_at, &cs_at );

    conja = bli_extract_conj( transa );

    // Kernels used by this routine. The SCAL2V kernel is looked up from the
    // context on the packing path only, so it is declared there.
    daxpyf_ker_ft axpyf_kr_ptr = bli_daxpyf_zen_int_8; // DAXPYF
    dscalv_ker_ft scalv_kr_ptr = bli_dscalv_zen_int10; // DSCALV
    dcopyv_ker_ft copyv_kr_ptr = bli_dcopyv_zen_int;   // DCOPYV

    // Setting the fuse factor based on bli_daxpyf_zen_int_8 kernel.
    b_fuse = 8;

    /*
      If alpha is equal to zero, y is only scaled by beta and returned.
      In this case, packing and unpacking y will be costly and it is
      avoided.
    */
    if ( ( incy != 1 ) && ( !bli_deq0( *alpha ) ) )
    {
        /*
          Initialize mem pool buffer to NULL and size to 0
          "buf" and "size" fields are assigned once memory
          is allocated from the pool in bli_pba_acquire_m().
          This will ensure bli_mem_is_alloc() will be passed on
          an allocated memory if created or a NULL .
        */
        mem_bufY.pblk.buf        = NULL;
        mem_bufY.pblk.block_size = 0;
        mem_bufY.buf_type        = 0;
        mem_bufY.size            = 0;
        mem_bufY.pool            = NULL;

        /* In order to get the buffer from pool via rntm access to memory broker
           is needed. Following are initializations for rntm */
        bli_rntm_init_from_global( &rntm );
        bli_rntm_set_num_threads_only( 1, &rntm );
        bli_pba_rntm_set_pba( &rntm );

        // Calculate the size required for m0 double elements in vector Y.
        size_t buffer_size = m0 * sizeof( double );

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_dgemv_n_avx2(): get mem pool block\n" );
#endif

        /* Acquire a Buffer(m0*size(double)) from the memory broker
           and save the associated mem_t entry to mem_bufY. */
        bli_pba_acquire_m
        (
          &rntm,
          buffer_size,
          BLIS_BUFFER_FOR_B_PANEL,
          &mem_bufY
        );

        /* Continue packing Y if buffer memory is allocated. */
        if ( bli_mem_is_alloc( &mem_bufY ) )
        {
            y_temp = bli_mem_buffer( &mem_bufY );

            // Stride of vector y_temp
            temp_incy = 1;

            // Query the context if it is NULL. This will be necessary for Zen architectures
            if ( cntx == NULL ) cntx = bli_gks_query_cntx();

            dscal2v_ker_ft scal2v_kr_ptr =
                bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCAL2V_KER, cntx );

            // y_temp := beta * y, packed to unit stride.
            scal2v_kr_ptr
            (
              BLIS_NO_CONJUGATE,
              m0,
              beta,
              y, incy,
              y_temp, temp_incy,
              cntx
            );

            /*
              Set y is packed as the memory allocation was successful
              and contents have been scaled and copied to a temp buffer
            */
            is_y_temp_buf_created = TRUE;
        }
    }

    /*
      Whenever y was not packed (unit stride, alpha == 0, or the scratch
      buffer was not available), scale it in place. Without this the
      fallback for an unavailable buffer would accumulate into an unscaled y.
      Here y_temp/temp_incy still refer to y/incy.
    */
    if ( !is_y_temp_buf_created )
    {
        // Invoke the DSCALV function using the function pointer
        scalv_kr_ptr
        (
          BLIS_NO_CONJUGATE,
          m0,
          beta,
          y_temp, temp_incy,
          cntx
        );
    }

    // Nothing to accumulate when alpha is zero; y already holds beta * y.
    if ( bli_deq0( *alpha ) )
    {
        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_4 );
        return;
    }

    // Walk the columns of op(A) in groups of at most b_fuse columns.
    for ( i = 0; i < n0; i += f )
    {
        f  = bli_determine_blocksize_dim_f( i, n0, b_fuse );
        A1 = a + ( i * cs_at );
        x1 = x + ( i * incx );

        axpyf_kr_ptr
        (
          conja,
          conjx,
          m0,
          f,
          alpha,
          A1, rs_at, cs_at,
          x1, incx,
          y_temp, temp_incy,
          cntx
        );
    }

    // If y was packed into y_temp, copy the contents back to y and free memory.
    if ( is_y_temp_buf_created )
    {
        // Store the result from unit strided y_buf to non-unit strided Y.
        // Invoke the COPYV function using the function pointer.
        copyv_kr_ptr
        (
          BLIS_NO_CONJUGATE,
          m0,
          y_temp, temp_incy,
          y, incy,
          cntx
        );

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_dgemv_n_avx2(): releasing mem pool block\n" );
#endif

        // Return the buffer to pool
        bli_pba_release( &rntm, &mem_bufY );
    }

    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_4 );
}

View File

@@ -499,3 +499,17 @@ void bli_dgemv_zen_ref
double* restrict y, inc_t incy,
cntx_t* restrict cntx
);
// DGEMV entry point for op(A) = NO_TRANSPOSE on Zen/2/3 architectures,
// built on the AXPYF fused kernel.
//   transa         - trans/conj descriptor of A.
//   conjx          - conjugation flag for x.
//   m, n           - dimensions of A.
//   alpha, beta    - pointers to the two scalars.
//   a, rs_a, cs_a  - matrix A with its row and column strides.
//   x, incx        - vector x with its increment.
//   y, incy        - vector y with its increment.
//   cntx           - context pointer.
void bli_dgemv_n_avx2
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
);

File diff suppressed because it is too large Load Diff

View File

@@ -109,6 +109,34 @@ DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_8_avx512 )
DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_4_avx512 )
DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_2_avx512 )
// gemv (intrinsics)
// dgemv_n kernels for handling op(A) = 'n', i.e., transa = 'n' cases.
// Kernels with the "16mx<k>" suffix: the 16mx8 kernel is the m-biased one.
// The 16mx7 .. 16mx1 entries are presumably its counterparts for fewer
// columns -- confirm against the kernel source.
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx8_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx7_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx6_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx5_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx4_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx3_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx2_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx1_avx512 )
// Kernels with the "<r>x<k>n" suffix: the 32x8n, 32x4n, 32x2n and 32x1n
// kernels are the n-biased ones. The 16x, 8x and m_leftx entries in each
// group are presumably used for smaller or leftover row counts -- confirm
// against the kernel source.
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x8n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x8n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x8n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx8n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x4n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x4n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x4n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx4n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x2n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x2n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x2n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx2n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x1n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x1n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x1n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx1n_avx512 )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_16x14)
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_16x14)
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen4_asm_8x24)
@@ -358,3 +386,17 @@ void bli_dynamic_blkszs_zen4
// function for resetting zmm registers after L3 apis
void bli_zero_zmm();
// AVX512 DGEMV entry point for the NO_TRANSPOSE cases. Its definition is not
// part of this header. The parameter list is identical to that of
// bli_dgemv_n_avx2:
//   transa         - trans/conj descriptor of A.
//   conjx          - conjugation flag for x.
//   m, n           - dimensions of A.
//   alpha, beta    - pointers to the two scalars.
//   a, rs_a, cs_a  - matrix A with its row and column strides.
//   x, incx        - vector x with its increment.
//   y, incy        - vector y with its increment.
//   cntx           - context pointer.
void bli_dgemv_n_avx512
(
trans_t transa,
conj_t conjx,
dim_t m,
dim_t n,
double* alpha,
double* a, inc_t rs_a, inc_t cs_a,
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
);