Implemented Multithreading Support and Optimization of DGEMV API (#10)

- Implemented multithreading framework for the DGEMV API on Zen architectures. Architecture specific AOCL-dynamic logic determines the optimal number of threads for improved performance.

- The condition check for the value of beta is optimized by utilizing masked operations. The mask value is set based on the value of beta, and the masked operations are applied when the vector y is loaded or scaled with beta.

AMD-Internal: [CPUPL-6746]
This commit is contained in:
S, Hari Govind
2025-06-17 12:39:48 +05:30
committed by GitHub
parent 26e5c63781
commit e097346658
10 changed files with 1370 additions and 1722 deletions

View File

@@ -216,31 +216,13 @@ void bli_dgemv_unf_var1
inc_t lda = cs_a, inca = rs_a;
conj_t conja;
double *a_buf = a;
double *x_buf = x;
double *y_buf = y;
inc_t buf_incx = incx;
inc_t buf_incy = incy;
// 'bli_dgemv_unf_var1' is dot-based kernel. This kernel is called for the following cases:
//
// When op(A) = n and row-storage( lda = rs_a ), we compute dot product as y[i] = <A(i,:), x>, i = 0:m-1.
// gemv dot kernel always computes dot-product along the columns of A, we interchange m and n. Here m0 = n, n0 = m.
//
// op(A) = n -> lda = rs_a;
// inca = cs_a;
// m0 = n;
// n0 = m;
//
// when op(A) = t and col-storage( lda = cs_a ), we compute dot product as y[i] = <A(:, i), x>, i = 0:n-1. Anyways
// the kernel computes dot along the columns of A, we don't interchange m & n, so here m0 = m and n0 = n.
//
// op(A) = t -> lda = cs_a;
// inca = rs_a;
// m0 = m;
// n0 = n;
//
// Invoking the reference kernel to handle general stride.
if ( ( rs_a != 1 ) && ( cs_a != 1 ) )
{
@@ -261,10 +243,29 @@ void bli_dgemv_unf_var1
return;
}
// 'bli_dgemv_unf_var1' is dot-based kernel. This kernel is called for the following cases:
//
// When op(A) = n and row-storage( lda = rs_a ), we compute dot product as y[i] = <A(i,:), x>, i = 0:m-1.
// gemv dot kernel always computes dot-product along the columns of A, we interchange m and n. Here m0 = n, n0 = m.
//
// op(A) = n -> lda = rs_a;
// inca = cs_a;
// m0 = n;
// n0 = m;
//
// when op(A) = t and col-storage( lda = cs_a ), we compute dot product as y[i] = <A(:, i), x>, i = 0:n-1. Anyways
// the kernel computes dot along the columns of A, we don't interchange m & n, so here m0 = m and n0 = n.
//
// op(A) = t -> lda = cs_a;
// inca = rs_a;
// m0 = m;
// n0 = n;
//
bli_set_dims_incs_with_trans(transa,
m, n, rs_a, cs_a,
&n0, &m0, &lda, &inca);
// Extract the conjugation from transa.
conja = bli_extract_conj(transa);
//memory pool declarations for packing vector X and Y.
@@ -288,21 +289,34 @@ void bli_dgemv_unf_var1
*/
arch_t id = bli_arch_query_id();
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
// Setting the threshold to invoke the fast-path
// The fast-path is intended to directly call the kernel
// in case the criteria for single threaded execution is met.
dim_t fast_path_thresh = 0;
#endif
switch (id)
{
case BLIS_ARCH_ZEN5:
#if defined(BLIS_KERNELS_ZEN5)
gemv_kr_ptr = bli_dgemv_t_zen_int_avx512;
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen5_asm_avx512; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 12000;
#endif
break;
#endif
case BLIS_ARCH_ZEN4:
#if defined(BLIS_KERNELS_ZEN4)
gemv_kr_ptr = bli_dgemv_t_zen_int_avx512;
gemv_kr_ptr = bli_dgemv_t_zen4_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int_avx512; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen4_asm_avx512; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 11000;
#endif
break;
#endif
@@ -310,9 +324,12 @@ void bli_dgemv_unf_var1
case BLIS_ARCH_ZEN2:
case BLIS_ARCH_ZEN3:
gemv_kr_ptr = bli_dgemv_t_zen_int_avx2;
gemv_kr_ptr = bli_dgemv_t_zen_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen_int; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 13000;
#endif
break;
default:
@@ -329,20 +346,22 @@ void bli_dgemv_unf_var1
PASTECH(d,dotxf_ker_ft) kfp_df;
// Query the context for the kernel function pointer and fusing factor.
// Query the context for the ddotxf kernel function pointer and fusing factor.
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );
dim_t b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );
//
for ( i = 0; i < n0; i += f )
{
// Determine the blocksize for the current iteration.
f = bli_determine_blocksize_dim_f( i, n0, b_fuse );
A1 = a + ( i * lda ) + ( 0 * inca );
// Calculate the pointers to the current block of A, x, and y.
A1 = a_buf + ( i * lda ) + ( 0 * inca );
x1 = x_buf;
y1 = y + ( i * incy );
y1 = y_buf + ( i * incy );
// y1 = beta * y1 + alpha * A1 * x;
// kfp_df is a function pointer to the dotxf kernel
kfp_df
(
conja,
@@ -415,7 +434,7 @@ void bli_dgemv_unf_var1
// Using unit-stride for y_temp vector.
buf_incy = 1;
// Invoke the SCAL2V function using the function pointer.
// Invoke the COPYV function using the function pointer.
copyv_kr_ptr
(
BLIS_NO_CONJUGATE,
@@ -493,6 +512,7 @@ void bli_dgemv_unf_var1
// stride of vector x_buf =1
buf_incx = 1;
// Invoke the COPYV function using the function pointer.
copyv_kr_ptr
(
BLIS_NO_CONJUGATE,
@@ -502,27 +522,120 @@ void bli_dgemv_unf_var1
cntx
);
// Set x is packed as the memory allocation was successful
// and contents have been copied to a temp buffer.
is_x_temp_buf_created = TRUE;
}
}
// Calling the selected kernel for the API
// If the increments of x and y are unit stride, we can use the
// optimized kernel path. The optimized kernel does not support
// non-unit stride for x and y.
if ( buf_incx == 1 && buf_incy == 1 )
{
gemv_kr_ptr
(
conja,
conjx,
m0,
n0,
alpha,
a, inca, lda,
x_buf, buf_incx,
beta,
y_buf, buf_incy,
cntx
);
#if defined(BLIS_ENABLE_OPENMP)
// If the problem size is small, we can use a fast-path to avoid
// the overhead of threading.
if ( ((n0 * m0) <= fast_path_thresh) || ((n0 < 100) && (m0 < 100)) )
{
#endif
// Call the DGEMV kernel directly with the packed buffers.
gemv_kr_ptr
(
conja,
conjx,
m0,
n0,
alpha,
a_buf, inca, lda,
x_buf, buf_incx,
beta,
y_buf, buf_incy,
cntx
);
#if defined(BLIS_ENABLE_OPENMP)
}
else
{
// Initializing nt as 1 to avoid compiler warnings
dim_t nt = 1;
/*
For the given problem size and architecture, the function
returns the optimum number of threads with AOCL dynamic enabled
else it returns the number of threads requested by the user.
*/
bli_nthreads_l2
(
BLIS_GEMV_KER,
BLIS_DOUBLE,
BLIS_TRANSPOSE,
id,
n0,
m0,
&nt
);
_Pragma("omp parallel num_threads(nt)")
{
dim_t start, end;
thrinfo_t thread;
// The factor by which the size should be a multiple during thread partition.
// The main loop of the kernel can handle 8 elements at a time hence 8 is selected for block_size.
dim_t block_size = 8;
// Get the thread ID
bli_thrinfo_set_work_id( omp_get_thread_num(), &thread );
// Get the actual number of threads spawned
bli_thrinfo_set_n_way( omp_get_num_threads(), &thread );
/*
Calculate the compute range (start and end) for the current thread
based on the actual number of threads spawned
*/
bli_thread_range_sub
(
&thread,
n0,
block_size,
FALSE,
&start,
&end
);
// Calculating the value of n for the particular thread
dim_t n_thread_local = end - start;
// Calculating thread specific pointers
double *a_thread_local = a_buf + (start * lda);
double *y_thread_local = y_buf + start;
double *x_thread_local = x_buf;
// Call the DGEMV kernel with the thread-local pointers.
gemv_kr_ptr
(
conja,
conjx,
m0,
n_thread_local,
alpha,
a_thread_local, inca, lda,
x_thread_local, buf_incx,
beta,
y_thread_local, buf_incy,
cntx
);
}
}
#endif
}
// If the increments of x and y are not unit stride, we call the reference kernel.
else
{
bli_dgemv_zen_ref

View File

@@ -2784,3 +2784,270 @@ void bli_nthreads_l1f
#endif
}
/*
  Functionality:
  --------------
  Selects the AOCL dynamic thread count for the L2 DGEMV API based on the
  architecture ID and the size of the input matrix.

  Function signature
  -------------------
  This function takes the following input:
  * 'arch_id'  - Architecture ID of the system (copy of BLIS global arch id)
  * 'm_elem'   - Number of rows in the matrix
  * 'n_elem'   - Number of columns in the matrix
  * 'variant'  - Transpose / Non-Transpose variant of the kernel
  * 'nt_ideal' - [out] Ideal number of threads

  Exception
  ----------
  1. For non-Zen architectures, *nt_ideal is set to -1. The expectation is
     that this is handled in the higher layer.
*/
BLIS_INLINE void aocl_dgemv_dynamic
     (
       arch_t  arch_id,
       dim_t   m_elem,
       dim_t   n_elem,
       trans_t variant,
       dim_t*  nt_ideal
     )
{
    // The tuning tables below are keyed on the total element count.
    const dim_t size = n_elem * m_elem;

    // Default: AOCL dynamic makes no recommendation. This covers the
    // non-transpose variant, unknown architectures, and sizes beyond the
    // last tuned threshold.
    *nt_ideal = -1;

    // Tuned thresholds currently exist only for the transpose case.
    if ( variant != BLIS_TRANSPOSE ) return;

    // Pick the AOCL dynamic logic based on the architecture ID. Each table
    // lists exclusive upper size bounds and the matching thread counts;
    // the first bound that exceeds 'size' decides the recommendation.
    switch ( arch_id )
    {
        case BLIS_ARCH_ZEN5:
        {
            static const dim_t bound[] = { 12000, 27500, 758000, 1580000,
                                           3390000, 10140000, 14600000 };
            static const dim_t nthr[]  = { 1, 4, 8, 16, 32, 64, 96 };

            for ( dim_t i = 0; i < 7; ++i )
            {
                if ( size < bound[i] ) { *nt_ideal = nthr[i]; break; }
            }
            break;
        }
        case BLIS_ARCH_ZEN4:
        {
            static const dim_t bound[] = { 11000, 34500, 707000, 1870000,
                                           4800000, 9000000 };
            static const dim_t nthr[]  = { 1, 4, 8, 16, 32, 64 };

            for ( dim_t i = 0; i < 6; ++i )
            {
                if ( size < bound[i] ) { *nt_ideal = nthr[i]; break; }
            }
            break;
        }
        case BLIS_ARCH_ZEN:
        case BLIS_ARCH_ZEN2:
        case BLIS_ARCH_ZEN3:
        {
            static const dim_t bound[] = { 13000, 17300, 300000, 640000,
                                           1700000 };
            static const dim_t nthr[]  = { 1, 4, 8, 16, 32 };

            for ( dim_t i = 0; i < 5; ++i )
            {
                if ( size < bound[i] ) { *nt_ideal = nthr[i]; break; }
            }
            break;
        }
        default:
            // Other architectures: leave *nt_ideal at -1 so the higher
            // layer falls back to the user-requested thread count.
            break;
    }
}
/*
  Functionality:
  --------------
  This function does the following:
  1. Reads the number of threads requested by the user from the local
     runtime (rntm) object.
  2. Acts as the gateway to the AOCL dynamic logic if AOCL dynamic is
     enabled, and alters the thread count accordingly.

  Function signature
  -------------------
  This function takes the following input:
  * 'ker_id'    - ID of kernel invoking this function
  * 'data_type' - Datatype of kernel
  * 'variant'   - Transpose / Non-Transpose variant of the kernel
  * 'arch_id'   - Architecture ID of the system (copy of BLIS global arch id)
  * 'm_elem'    - Number of rows in the matrix
  * 'n_elem'    - Number of columns in the matrix
  * 'nt_ideal'  - [out] Number of threads to spawn; with AOCL dynamic
                  enabled it is clamped to at least 1 so the caller's
                  OpenMP num_threads(nt) clause is always valid.

  Exception
  ----------
  None
*/
void bli_nthreads_l2
     (
       l2kr_t  ker_id,
       num_t   data_type,
       trans_t variant,
       arch_t  arch_id,
       dim_t   m_elem,
       dim_t   n_elem,
       dim_t*  nt_ideal
     )
{
#ifdef AOCL_DYNAMIC
    /*
      This code section dispatches the AOCL dynamic logic kernel for
      L2 APIs based on the kernel ID and the data type.
    */
    // Function pointer to the AOCL Dynamic logic kernel.
    void (*aocl_dynamic_func_l2)( arch_t, dim_t, dim_t, trans_t, dim_t* ) = NULL;

    // Pick the AOCL dynamic thread-decision kernel based on the kernel ID.
    switch ( ker_id )
    {
        case BLIS_GEMV_KER:
            if ( data_type == BLIS_DOUBLE )
            {
                // Function for DGEMV.
                aocl_dynamic_func_l2 = aocl_dgemv_dynamic;
            }
            else
            {
                *nt_ideal = -1;
            }
            break;
        default:
            /*
              For kernels that do not have AOCL dynamic logic,
              use the number of threads requested by the user.
            */
            *nt_ideal = -1;
    }

    /*
      For APIs that do not have AOCL dynamic logic,
      aocl_dynamic_func_l2 will be NULL.
    */
    if ( aocl_dynamic_func_l2 != NULL )
    {
        // Call the AOCL dynamic logic kernel.
        aocl_dynamic_func_l2
        (
          arch_id,
          m_elem,
          n_elem,
          variant,
          nt_ideal
        );

        if ( *nt_ideal == 1 )
        {
            // Return early when the number of threads is 1.
            return;
        }
    }
#endif

    rntm_t rntm_local;

    // Initialize a local runtime object with the global settings.
    bli_rntm_init_from_global( &rntm_local );

    // Query the total number of threads from the rntm_t object.
    dim_t nt_rntm = bli_rntm_num_threads( &rntm_local );

    if ( nt_rntm <= 0 )
    {
        // nt is less than one if BLIS manual setting of parallelism
        // has been used. Parallelism here will be product of values.
        nt_rntm = bli_rntm_calc_num_threads( &rntm_local );
    }

#ifdef AOCL_DYNAMIC
    // Calculate the actual number of threads that will be spawned.
    if ( *nt_ideal != -1 )
    {
        // This branch is taken for all Zen architectures: never exceed
        // the user-requested thread count.
        *nt_ideal = bli_min( nt_rntm, *nt_ideal );
    }
    else
    {
        /*
          For non-Zen architectures and very large sizes,
          spawn the actual number of threads requested.
        */
        *nt_ideal = nt_rntm;
    }

    /*
      When the number of elements to be processed is less
      than the number of threads, spawn n_elem threads instead.
    */
    if ( n_elem < *nt_ideal )
    {
        *nt_ideal = n_elem;
    }

    // Guard against a degenerate (zero-sized) dimension: the clamp above
    // would otherwise request zero threads, and an OpenMP
    // num_threads(0) clause in the caller is invalid.
    if ( *nt_ideal < 1 )
    {
        *nt_ideal = 1;
    }
#else
    // AOCL dynamic disabled: spawn the number of threads requested.
    *nt_ideal = nt_rntm;
#endif
}

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -137,6 +137,17 @@ void bli_nthreads_l1f
dim_t* nt_ideal
);
void bli_nthreads_l2
(
l2kr_t ker_id,
num_t data_type,
trans_t variant,
arch_t arch_id,
dim_t m_elem,
dim_t n_elem,
dim_t* nt_ideal
);
// Runtime object type (defined in bli_type_defs.h)
/*

View File

@@ -668,6 +668,13 @@ typedef enum
#define BLIS_NUM_LEVEL1F_KERS 5
// IDs for level-2 kernels that route through bli_nthreads_l2; used to
// select the matching AOCL dynamic thread-count logic for each kernel.
typedef enum
{
    BLIS_GEMV_KER = 0,
    BLIS_TRSV_KER
} l2kr_t;
#define BLIS_NUM_LEVEL2_KERS 2
typedef enum
{

View File

@@ -76,22 +76,22 @@
#define K_bli_zgemmsup_cd_zen4_asm_8x2 1
#define K_bli_zgemmsup_cd_zen4_asm_8x4 1
#define K_bli_dgemmsup_rv_zen4_asm_24x8m_new 1
#define K_bli_dgemv_t_zen_int_avx2 1
#define K_bli_dgemv_t_zen_int_16x7m_avx2 1
#define K_bli_dgemv_t_zen_int_16x6m_avx2 1
#define K_bli_dgemv_t_zen_int_16x5m_avx2 1
#define K_bli_dgemv_t_zen_int_16x4m_avx2 1
#define K_bli_dgemv_t_zen_int_16x3m_avx2 1
#define K_bli_dgemv_t_zen_int_16x2m_avx2 1
#define K_bli_dgemv_t_zen_int_16x1m_avx2 1
#define K_bli_dgemv_t_zen_int_avx512 1
#define K_bli_dgemv_t_zen_int_32x7m_avx512 1
#define K_bli_dgemv_t_zen_int_32x6m_avx512 1
#define K_bli_dgemv_t_zen_int_32x5m_avx512 1
#define K_bli_dgemv_t_zen_int_32x4m_avx512 1
#define K_bli_dgemv_t_zen_int_32x3m_avx512 1
#define K_bli_dgemv_t_zen_int_32x2m_avx512 1
#define K_bli_dgemv_t_zen_int_32x1m_avx512 1
#define K_bli_dgemv_t_zen_int 1
#define K_bli_dgemv_t_zen_int_16x7m 1
#define K_bli_dgemv_t_zen_int_16x6m 1
#define K_bli_dgemv_t_zen_int_16x5m 1
#define K_bli_dgemv_t_zen_int_16x4m 1
#define K_bli_dgemv_t_zen_int_16x3m 1
#define K_bli_dgemv_t_zen_int_16x2m 1
#define K_bli_dgemv_t_zen_int_16x1m 1
#define K_bli_dgemv_t_zen4_int 1
#define K_bli_dgemv_t_zen4_int_32x7m 1
#define K_bli_dgemv_t_zen4_int_32x6m 1
#define K_bli_dgemv_t_zen4_int_32x5m 1
#define K_bli_dgemv_t_zen4_int_32x4m 1
#define K_bli_dgemv_t_zen4_int_32x3m 1
#define K_bli_dgemv_t_zen4_int_32x2m 1
#define K_bli_dgemv_t_zen4_int_32x1m 1
#define K_bli_ztrsm_small_ZEN5 1
#define K_bli_dgemv_n_zen_int_16mx8_avx512 1
#define K_bli_dgemv_n_zen_int_16mx7_avx512 1

View File

@@ -112,12 +112,12 @@ TEST_P( dgemvGeneric, UKR )
#if defined(BLIS_KERNELS_ZEN) && defined(GTEST_AVX2FMA3)
// Unit-tests
#ifdef K_bli_dgemv_t_zen_int_avx2
#ifdef K_bli_dgemv_t_zen_int
INSTANTIATE_TEST_SUITE_P(
dgemv_t_primary_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_avx2),
::testing::Values(bli_dgemv_t_zen_int),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -133,7 +133,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Range( gtint_t(1), gtint_t(16), gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x (non-unit incx is handled by frame)
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -143,12 +143,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x7m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x7m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx7_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x7m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x7m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -164,7 +164,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(7)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -174,12 +174,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x6m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x6m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx6_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x6m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x6m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -195,7 +195,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(6)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -205,12 +205,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x5m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x5m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx5_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x5m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x5m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -226,7 +226,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(5)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -236,12 +236,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x4m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x4m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx4_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x4m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x4m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -257,7 +257,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(4)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -267,12 +267,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x3m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x3m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx3_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x3m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x3m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -288,7 +288,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(3)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -298,12 +298,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x2m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x2m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx2_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x2m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x2m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -319,7 +319,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(2)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -329,12 +329,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_16x1m_avx2
#ifdef K_bli_dgemv_t_zen_int_16x1m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx1_zen,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_16x1m_avx2),
::testing::Values(bli_dgemv_t_zen_int_16x1m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -350,7 +350,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(95)), // 5 * L16 + L8 + L4 + Lfringe
::testing::Values( gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -363,12 +363,12 @@ INSTANTIATE_TEST_SUITE_P(
#if defined(BLIS_KERNELS_ZEN4) && defined(GTEST_AVX512)
// Unit-tests
#ifdef K_bli_dgemv_t_zen_int_avx512
#ifdef K_bli_dgemv_t_zen4_int
INSTANTIATE_TEST_SUITE_P(
dgemv_t_primary_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_avx512),
::testing::Values(bli_dgemv_t_zen4_int),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -384,7 +384,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Range( gtint_t(1), gtint_t(16), gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x (non-unit incx is handled by frame)
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -394,12 +394,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x7m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x7m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx7_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x7m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x7m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -415,7 +415,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(7)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -425,12 +425,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x6m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x6m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx6_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x6m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x6m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -446,7 +446,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(6)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -456,12 +456,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x5m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x5m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx5_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x5m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x5m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -477,7 +477,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(5)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -487,12 +487,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x4m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x4m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx4_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x4m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x4m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -508,7 +508,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(4)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -518,12 +518,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x3m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x3m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx3_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x3m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x3m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -539,7 +539,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(3)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -549,12 +549,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x2m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x2m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx2_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x2m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x2m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -570,7 +570,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(2)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a
@@ -580,12 +580,12 @@ INSTANTIATE_TEST_SUITE_P(
);
#endif
#ifdef K_bli_dgemv_t_zen_int_32x1m_avx512
#ifdef K_bli_dgemv_t_zen4_int_32x1m
INSTANTIATE_TEST_SUITE_P(
dgemv_t_mx1_zen4,
dgemvGeneric,
::testing::Combine(
::testing::Values(bli_dgemv_t_zen_int_32x1m_avx512),
::testing::Values(bli_dgemv_t_zen4_int_32x1m),
::testing::Values('c'), // storage format
::testing::Values('t'), // transa
::testing::Values('n'), // conjx
@@ -601,7 +601,7 @@ INSTANTIATE_TEST_SUITE_P(
gtint_t(191)), // 5 * L32 + L16 + L8 + Lfringe
::testing::Values( gtint_t(1)), // n
::testing::Values( double(0.0), double(1.0), double(2.0)), // alpha
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values( double(0.0), double(1.0), double(2.0)), // beta
::testing::Values(gtint_t(1)), // stride size for x
::testing::Values(gtint_t(1)), // stride size for y
::testing::Values(gtint_t(0), gtint_t(7)), // increment to the leading dim of a

File diff suppressed because it is too large Load Diff

View File

@@ -160,14 +160,14 @@ GEMV_KER_PROT( scomplex, c, gemv_zen_int_4x4 )
GEMV_KER_PROT( dcomplex, z, gemv_zen_int_4x4 )
// gemv (intrinsics)
GEMV_KER_PROT( double, d, gemv_t_zen_int_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x7m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x6m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x5m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x4m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x3m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x2m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x1m_avx2 )
GEMV_KER_PROT( double, d, gemv_t_zen_int )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x7m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x6m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x5m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x4m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x3m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x2m )
GEMV_KER_PROT( double, d, gemv_t_zen_int_16x1m )
// her (intrinsics)
HER_KER_PROT( dcomplex, z, her_zen_int_var1 )

File diff suppressed because it is too large Load Diff

View File

@@ -146,14 +146,14 @@ GEMV_KER_PROT( double, d, gemv_n_zen_int_8x1n_avx512 )
GEMV_KER_PROT( double, d, gemv_n_zen_int_m_leftx1n_avx512 )
// dgemv_t kernels for handling op(A) = 't', i.e., transa = 't' cases.
GEMV_KER_PROT( double, d, gemv_t_zen_int_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x7m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x6m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x5m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x4m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x3m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x2m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen_int_32x1m_avx512 )
GEMV_KER_PROT( double, d, gemv_t_zen4_int )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x7m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x6m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x5m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x4m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x3m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x2m )
GEMV_KER_PROT( double, d, gemv_t_zen4_int_32x1m )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_16x14)
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_16x14)