mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
DGEMV Optimizations for NO_TRANSPOSE Cases
- AVX512 specific DGEMV native kernels are added for Zen4/5
architectures to handle the NO_TRANSPOSE cases and are independent of
the AXPYF fused kernels.
- The following set of kernels biased towards the n-dimension perform
beta scaling of y vector within the kernel itself and handle cases
where n is less than 5:
- bli_dgemv_n_zen_int_32x8n_avx512( ... )
- bli_dgemv_n_zen_int_32x4n_avx512( ... )
- bli_dgemv_n_zen_int_32x2n_avx512( ... )
- bli_dgemv_n_zen_int_32x1n_avx512( ... )
- The bli_dgemv_n_zen_int_16mx8_avx512( ... ) is biased towards the
m-dimension and for this kernel beta scaling is handled beforehand
within the framework.
- Added unit-tests for the new kernels.
- The AVX2 path for Zen/2/3 architectures still follows the old approach of
using the fused AXPYF kernel to perform the GEMV operation.
AMD-Internal: [CPUPL-5560]
Change-Id: I22bc2a865cd28b9cdcb383e17d1ff38bdd28de79
This commit is contained in:
committed by
Arnav Sharma
parent
615789e196
commit
25e59fcbb9
3
.gitignore
vendored
3
.gitignore
vendored
@@ -65,3 +65,6 @@ bin/*
|
||||
*.exe
|
||||
|
||||
.vscode
|
||||
|
||||
# Gtestsuite build files
|
||||
gtestsuite/build/*
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -37,6 +38,9 @@
|
||||
// Define function types.
|
||||
#include "bli_l2_ft_unb.h"
|
||||
|
||||
// Define kernel function types for level-2 kernels.
|
||||
#include "bli_l2_ft_ker.h"
|
||||
|
||||
// Prototype object APIs (expert and non-expert).
|
||||
#include "bli_oapi_ex.h"
|
||||
#include "bli_l2_oapi.h"
|
||||
|
||||
60
frame/2/bli_l2_ft_ker.h
Normal file
60
frame/2/bli_l2_ft_ker.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Define kernel function types for level-2 kernels.
|
||||
//
|
||||
|
||||
// gemv
|
||||
// GENTDEF( ctype, ch, opname, tsuf ) expands to a typedef for a pointer to a
// function returning void. The typedef name is built by PASTECH3 from
// ch, opname, the literal _ker and tsuf (presumably plain token pasting,
// e.g. dgemv_ker_ft for the instantiation below -- TODO confirm against the
// PASTECH3 definition).
//
// Parameters of the generated function type, in order:
//   conja, conjx      - conj_t flags for A and x.
//   m, n              - dim_t dimensions.
//   alpha             - pointer to a ctype scalar.
//   a, rs_a, cs_a     - pointer to ctype matrix data with row/column strides.
//   x, incx           - pointer to ctype vector data with its increment.
//   beta              - pointer to a ctype scalar.
//   y, incy           - pointer to ctype vector data with its increment.
//   cntx              - restrict-qualified pointer to a cntx_t.
//
// Any previous definition of GENTDEF is discarded first so that this header
// can define its own expansion.
#undef GENTDEF
|
||||
#define GENTDEF( ctype, ch, opname, tsuf ) \
|
||||
\
|
||||
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype* beta, \
|
||||
ctype* y, inc_t incy, \
|
||||
cntx_t* restrict cntx \
|
||||
);
|
||||
|
||||
// INSERT_GENTDEF( gemv )
|
||||
// Currently only generating the function type for double datatype.
// The generic INSERT_GENTDEF( gemv ) form above is left commented out, so
// only the single explicit instantiation below is emitted.
|
||||
GENTDEF( double, d, gemv, _ft )
|
||||
@@ -257,35 +257,15 @@ void bli_dgemv_unf_var2
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
double* A1;
|
||||
double* x1;
|
||||
dim_t i;
|
||||
dim_t f, b_fuse;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
// Memory pool declarations for packing vector Y.
|
||||
mem_t mem_bufY;
|
||||
rntm_t rntm;
|
||||
double* y_temp = y;
|
||||
inc_t temp_incy = incy;
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
/*
|
||||
Fatbinary config amdzen when run on non-AMD X86 will query for
|
||||
@@ -294,6 +274,9 @@ void bli_dgemv_unf_var2
|
||||
*/
|
||||
arch_t id = bli_arch_query_id();
|
||||
|
||||
// b_fuse stores the fusing factor for AXPYF kernel.
|
||||
dim_t b_fuse;
|
||||
|
||||
/*
|
||||
Function pointer declaration for the functions
|
||||
that will be used by this API
|
||||
@@ -303,73 +286,91 @@ void bli_dgemv_unf_var2
|
||||
dscalv_ker_ft scalv_kr_ptr; // DSCALV
|
||||
dcopyv_ker_ft copyv_kr_ptr; // DCOPYV
|
||||
|
||||
switch (id)
|
||||
{
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
bli_dgemv_n_avx512(
|
||||
transa,
|
||||
conjx,
|
||||
m,
|
||||
n,
|
||||
alpha,
|
||||
a, rs_a, cs_a,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
bli_dgemv_n_avx2(
|
||||
transa,
|
||||
conjx,
|
||||
m,
|
||||
n,
|
||||
alpha,
|
||||
a, rs_a, cs_a,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
|
||||
default:
|
||||
// For non-Zen architectures, query the context if it is NULL
|
||||
if (cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
|
||||
/*
|
||||
Query the context for the kernel function pointers for
|
||||
AXPYF, SCALV, COPYV and corresponding fusing
|
||||
factor of AXPYF kernel
|
||||
*/
|
||||
axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_AXPYF_KER, cntx);
|
||||
b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx);
|
||||
|
||||
scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
|
||||
|
||||
copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
|
||||
}
|
||||
|
||||
double* A1;
|
||||
double* x1;
|
||||
dim_t i;
|
||||
dim_t f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
// Memory pool declarations for packing vector Y.
|
||||
mem_t mem_bufY;
|
||||
rntm_t rntm;
|
||||
double* y_temp = y;
|
||||
inc_t temp_incy = incy;
|
||||
|
||||
/*
|
||||
Boolean to check if the y has been packed
|
||||
and memory needs to be freed in the end
|
||||
*/
|
||||
bool is_y_temp_buf_created = FALSE;
|
||||
|
||||
switch (id)
|
||||
{
|
||||
case BLIS_ARCH_ZEN5:
|
||||
case BLIS_ARCH_ZEN4:
|
||||
#if defined(BLIS_KERNELS_ZEN4)
|
||||
/*
|
||||
Assign the AVX512 based kernel function pointers for
|
||||
AXPYF, SCALV, COPYV and corresponding fusing
|
||||
factor of DAXPYF kernel
|
||||
*/
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_elem, &n_iter, &rs_at, &cs_at );
|
||||
|
||||
axpyf_kr_ptr = bli_daxpyf_zen_int_avx512;
|
||||
b_fuse = 32;
|
||||
|
||||
scalv_kr_ptr = bli_dscalv_zen_int_avx512;
|
||||
|
||||
copyv_kr_ptr = bli_dcopyv_zen_int;
|
||||
|
||||
break;
|
||||
#endif
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
/*
|
||||
Assign the AVX2 based kernel function pointers for
|
||||
AXPYF, SCALV, COPYV and corresponding fusing
|
||||
factor of DAXPYF kernel
|
||||
*/
|
||||
|
||||
axpyf_kr_ptr = bli_daxpyf_zen_int_8;
|
||||
b_fuse = 8;
|
||||
|
||||
scalv_kr_ptr = bli_dscalv_zen_int10;
|
||||
|
||||
copyv_kr_ptr = bli_dcopyv_zen_int;
|
||||
|
||||
break;
|
||||
default:
|
||||
// For non-Zen architectures, query the context if it is NULL
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
|
||||
/*
|
||||
Query the context for the kernel function pointers for
|
||||
AXPYF, SCALV, COPYV and corresponding fusing
|
||||
factor of AXPYF kernel
|
||||
*/
|
||||
axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DOUBLE, BLIS_AXPYF_KER, cntx);
|
||||
b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DOUBLE, BLIS_AF, cntx);
|
||||
|
||||
scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
|
||||
|
||||
copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
|
||||
}
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
/*
|
||||
If alpha is equal to zero, y is only scaled by beta and returned.
|
||||
In this case, packing and unpacking y will be costly and it is
|
||||
avoided.
|
||||
*/
|
||||
if ( (incy > 1) && (!bli_deq0( *alpha )))
|
||||
if ( (incy != 1) && (!bli_deq0( *alpha )))
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
@@ -398,13 +399,16 @@ void bli_dgemv_unf_var2
|
||||
|
||||
/*acquire a Buffer(n_elem*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufY.*/
|
||||
bli_pba_acquire_m(&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufY);
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufY
|
||||
);
|
||||
|
||||
/*Continue packing Y if buffer memory is allocated*/
|
||||
if ((bli_mem_is_alloc( &mem_bufY )))
|
||||
if ( bli_mem_is_alloc( &mem_bufY ) )
|
||||
{
|
||||
y_temp = bli_mem_buffer(&mem_bufY);
|
||||
|
||||
@@ -455,23 +459,23 @@ void bli_dgemv_unf_var2
|
||||
|
||||
for (i = 0; i < n_iter; i += f)
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse);
|
||||
f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse);
|
||||
|
||||
A1 = a + (i * cs_at);
|
||||
x1 = x + (i * incx);
|
||||
A1 = a + (i * cs_at);
|
||||
x1 = x + (i * incx);
|
||||
|
||||
axpyf_kr_ptr
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y_temp, temp_incy,
|
||||
cntx
|
||||
);
|
||||
axpyf_kr_ptr
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y_temp, temp_incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
if (is_y_temp_buf_created)
|
||||
@@ -1108,6 +1112,3 @@ void bli_cgemv_unf_var2
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -322,7 +322,7 @@ void dgemv_blis_impl
|
||||
* If the matrix dimensions are within 8x8 then calculate the result
|
||||
* using DGEMV Reference kernel.
|
||||
*/
|
||||
if ( m0 < 8 && n0 < 8 )
|
||||
if ( (m0 < 8 && n0 < 8) )
|
||||
{
|
||||
bli_dgemv_zen_ref
|
||||
(
|
||||
|
||||
@@ -104,6 +104,54 @@ TEST_P( dgemvGeneric, API )
|
||||
test_gemv<T>( storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, is_memory_test );
|
||||
}
|
||||
|
||||
// Unit-tests for NO_TRANSPOSE m-biased kernels
// Instantiates the dgemvGeneric fixture under the name dgemv_n_m.
// m is held at 47 while n sweeps 1..8 plus 16 and 44. alpha and beta each
// take 0, 1 and 2, x uses strides 1, 3 and -1, and y is unit-stride only.
// Row-major storage ('r') is included only when TEST_BLAS_LIKE is not defined.
// Note that transa also covers 'c' and 't', so this suite exercises more than
// the NO_TRANSPOSE path named in the heading.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_m,
|
||||
dgemvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('c'
|
||||
#ifndef TEST_BLAS_LIKE
|
||||
,'r'
|
||||
#endif
|
||||
), // storage format
|
||||
::testing::Values('n', 'c', 't'), // transa
|
||||
::testing::Values('n'), // conjx
|
||||
::testing::Values( 47 ), // m
|
||||
::testing::Values( 1, 2, 3, 4, 5, 6, 7, 8, 16, 44 ), // n
|
||||
::testing::Values( 0, 1, 2 ), // alpha
|
||||
::testing::Values( 0, 1, 2 ), // beta
|
||||
::testing::Values(gtint_t(1), gtint_t(3), gtint_t(-1) ), // stride size for x
|
||||
::testing::Values(gtint_t(1) ), // stride size for y
|
||||
::testing::Values(gtint_t(0), gtint_t(7) ), // increment to the leading dim of a
|
||||
::testing::Values(false, true) // is_memory_test
|
||||
),
|
||||
::gemvGenericPrint<T>()
|
||||
);
|
||||
|
||||
// Unit-tests for NO_TRANSPOSE n-biased kernels
// Instantiates the dgemvGeneric fixture under the name dgemv_n_n.
// m is held at 95 while n takes the small values 1, 2, 3, 4 and also 15.
// alpha and beta each take 0, 1 and 2, x uses strides 1, 3 and -1, and y is
// unit-stride only. Row-major storage ('r') is included only when
// TEST_BLAS_LIKE is not defined.
// Note that transa also covers 'c' and 't', so this suite exercises more than
// the NO_TRANSPOSE path named in the heading.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
dgemv_n_n,
|
||||
dgemvGeneric,
|
||||
::testing::Combine(
|
||||
::testing::Values('c'
|
||||
#ifndef TEST_BLAS_LIKE
|
||||
,'r'
|
||||
#endif
|
||||
), // storage format
|
||||
::testing::Values('n', 'c', 't'), // transa
|
||||
::testing::Values('n'), // conjx
|
||||
::testing::Values( 95 ), // m
|
||||
::testing::Values( 1, 2, 3, 4, 15 ), // n
|
||||
::testing::Values( 0, 1, 2 ), // alpha
|
||||
::testing::Values( 0, 1, 2 ), // beta
|
||||
::testing::Values(gtint_t(1), gtint_t(3), gtint_t(-1) ), // stride size for x
|
||||
::testing::Values(gtint_t(1) ), // stride size for y
|
||||
::testing::Values(gtint_t(0), gtint_t(7) ), // increment to the leading dim of a
|
||||
::testing::Values(false, true) // is_memory_test
|
||||
),
|
||||
::gemvGenericPrint<T>()
|
||||
);
|
||||
|
||||
// Black box testing.
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
BlackboxSmall,
|
||||
|
||||
228
kernels/zen/2/bli_gemv_avx2.c
Normal file
228
kernels/zen/2/bli_gemv_avx2.c
Normal file
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/**
|
||||
* bli_dgemv_n_avx2(...) handles cases where op(A) = NO_TRANSPOSE for Zen/2/3
|
||||
* architectures and is based on the previous approach of using the fused
|
||||
* kernels, namely AXPYF, to perform the GEMV operation.
|
||||
*/
|
||||
void bli_dgemv_n_avx2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_4 );
|
||||
double* A1;
|
||||
double* x1;
|
||||
dim_t i;
|
||||
dim_t f, b_fuse;
|
||||
dim_t m0, n0;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
// Memory pool declarations for packing vector Y.
|
||||
mem_t mem_bufY;
|
||||
rntm_t rntm;
|
||||
double* y_temp = y;
|
||||
inc_t temp_incy = incy;
|
||||
|
||||
// Boolean to check if y vector is packed and memory needs to be freed.
|
||||
bool is_y_temp_buf_created = FALSE;
|
||||
|
||||
// Update dimensions and strides based on op(A).
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&m0, &n0, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// Function pointer declaration for the functions that will be used.
|
||||
daxpyf_ker_ft axpyf_kr_ptr; // DAXPYF
|
||||
dscal2v_ker_ft scal2v_kr_ptr; // DSCAL2V
|
||||
dscalv_ker_ft scalv_kr_ptr; // DSCALV
|
||||
dcopyv_ker_ft copyv_kr_ptr; // DCOPYV
|
||||
|
||||
// Setting the fuse factor based on bli_daxpyf_zen_int_8 kernel.
|
||||
b_fuse = 8;
|
||||
axpyf_kr_ptr = bli_daxpyf_zen_int_8; // DAXPYF
|
||||
scal2v_kr_ptr = bli_dscal2v_zen_int; // DSCAL2V
|
||||
scalv_kr_ptr = bli_dscalv_zen_int10; // DSCALV
|
||||
copyv_kr_ptr = bli_dcopyv_zen_int; // DCOPYV
|
||||
|
||||
/*
|
||||
If alpha is equal to zero, y is only scaled by beta and returned.
|
||||
In this case, packing and unpacking y will be costly and it is
|
||||
avoided.
|
||||
*/
|
||||
if ( (incy != 1) && (!bli_deq0( *alpha )))
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
"buf" and "size" fields are assigned once memory
|
||||
is allocated from the pool in bli_pba_acquire_m().
|
||||
This will ensure bli_mem_is_alloc() will be passed on
|
||||
an allocated memory if created or a NULL .
|
||||
*/
|
||||
mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0;
|
||||
mem_bufY.buf_type = 0; mem_bufY.size = 0;
|
||||
mem_bufY.pool = NULL;
|
||||
|
||||
/* In order to get the buffer from pool via rntm access to memory broker
|
||||
is needed. Following are initializations for rntm */
|
||||
bli_rntm_init_from_global( &rntm );
|
||||
bli_rntm_set_num_threads_only( 1, &rntm );
|
||||
bli_pba_rntm_set_pba( &rntm );
|
||||
|
||||
//calculate the size required for m0 double elements in vector Y.
|
||||
size_t buffer_size = m0 * sizeof(double);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_n_avx2(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
/* Acquire a Buffer(m0*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufY. */
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufY
|
||||
);
|
||||
|
||||
/* Continue packing Y if buffer memory is allocated. */
|
||||
if ( bli_mem_is_alloc( &mem_bufY ) )
|
||||
{
|
||||
y_temp = bli_mem_buffer(&mem_bufY);
|
||||
|
||||
// Stride of vector y_temp
|
||||
temp_incy = 1;
|
||||
|
||||
// Query the context if it is NULL. This will be necessary for Zen architectures
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx();
|
||||
|
||||
scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCAL2V_KER, cntx);
|
||||
|
||||
// Invoke the SCAL2V function using the function pointer
|
||||
scal2v_kr_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m0,
|
||||
beta,
|
||||
y, incy,
|
||||
y_temp, temp_incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
/*
|
||||
Set y is packed as the memory allocation was successful
|
||||
and contents have been scaled and copied to a temp buffer
|
||||
*/
|
||||
is_y_temp_buf_created = TRUE;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invoke the DSCALV function using the function pointer
|
||||
scalv_kr_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m0,
|
||||
beta,
|
||||
y_temp, temp_incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
if( bli_deq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return;
|
||||
}
|
||||
|
||||
for ( i = 0; i < n0; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f(i, n0, b_fuse);
|
||||
|
||||
A1 = a + (i * cs_at);
|
||||
x1 = x + (i * incx);
|
||||
|
||||
axpyf_kr_ptr
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
m0,
|
||||
f,
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y_temp, temp_incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
// If y was packed into y_temp, copy the contents back to y and free memory.
|
||||
if ( is_y_temp_buf_created )
|
||||
{
|
||||
// Store the result from unit strided y_buf to non-unit strided Y.
|
||||
// Invoke the COPYV function using the function pointer.
|
||||
copyv_kr_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
m0,
|
||||
y_temp, temp_incy,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_n_avx2(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_pba_release( &rntm , &mem_bufY );
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_4 );
|
||||
}
|
||||
@@ -499,3 +499,17 @@ void bli_dgemv_zen_ref
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
);
|
||||
|
||||
// Prototype of the AVX2 DGEMV routine for the NO_TRANSPOSE case, which
// performs the operation through the fused AXPYF kernel. transa and conjx
// describe op(A) and the conjugation of x; m and n are the dimensions of A;
// alpha and beta are passed by pointer; a, x and y carry their strides or
// increments; cntx is the kernel context.
void bli_dgemv_n_avx2
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
4130
kernels/zen4/2/bli_gemv_n_zen_int_avx512.c
Normal file
4130
kernels/zen4/2/bli_gemv_n_zen_int_avx512.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -109,6 +109,34 @@ DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_8_avx512 )
|
||||
DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_4_avx512 )
|
||||
DOTXF_KER_PROT( dcomplex, z, dotxf_zen_int_2_avx512 )
|
||||
|
||||
// gemv (intrinsics)
|
||||
// dgemv_n kernels for handling op(A) = 'n', i.e., transa = 'n' cases.
|
||||
// Kernels named 16mx<k>: the 16mx8 kernel is described as biased towards
// the m-dimension, with beta scaling of y handled by the framework before
// the kernel is called. The 16mx7 .. 16mx1 variants presumably cover the
// narrower column counts -- TODO confirm against the kernel source.
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx8_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx7_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx6_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx5_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx4_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx3_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx2_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16mx1_avx512 )
|
||||
|
||||
// Kernels named <rows>x<k>n: the 32x8n, 32x4n, 32x2n and 32x1n kernels are
// described as biased towards the n-dimension and as performing the beta
// scaling of y inside the kernel. The 16x, 8x and m_leftx variants
// presumably handle progressively smaller row remainders -- TODO confirm
// against the kernel source.
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x8n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x8n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x8n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx8n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x4n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x4n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x4n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx4n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x2n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x2n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x2n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx2n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_32x1n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_16x1n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_8x1n_avx512 )
|
||||
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx1n_avx512 )
|
||||
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_16x14)
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_16x14)
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen4_asm_8x24)
|
||||
@@ -358,3 +386,17 @@ void bli_dynamic_blkszs_zen4
|
||||
|
||||
// function for resetting zmm registers after L3 apis
|
||||
void bli_zero_zmm();
|
||||
|
||||
// Prototype of the AVX512 DGEMV entry point for the NO_TRANSPOSE case.
// The parameter list matches bli_dgemv_n_avx2: transa and conjx describe
// op(A) and the conjugation of x; m and n are the dimensions of A; alpha and
// beta are passed by pointer; a, x and y carry their strides or increments;
// cntx is the kernel context.
// NOTE(review): presumably this routine selects among the
// gemv_n_zen_int_*_avx512 kernels -- its definition is not visible here, so
// confirm against the kernel source.
void bli_dgemv_n_avx512
|
||||
(
|
||||
trans_t transa,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
double* alpha,
|
||||
double* a, inc_t rs_a, inc_t cs_a,
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user