Multithreaded SGEMV var 1 with smart threading

- Implemented an OpenMP based stand alone SGEMV kernel for
	  row-major (var 1) for multithread scenarios
	- Smart threading is enabled when AOCL DYNAMIC is defined
	- The number of threads is decided based on the input dims
	  using smart threading

AMD-Internal: [CPUPL-1984]
Change-Id: I9b191e965ba7468e95aabcce21b35a533017502e
This commit is contained in:
S, HariharaSudhan
2022-03-29 18:05:59 +05:30
committed by Dipal M Zambare
parent 16de63c818
commit a8bc55c373
2 changed files with 522 additions and 1 deletions

View File

@@ -332,6 +332,92 @@ void bli_dgemv_unf_var1
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
// Returns the optimal number of threads for the given input sizes and fuse factor
void bli_sgemv_var1_smart_threading
     (
       dim_t m, dim_t n,
       dim_t fuse,
       dim_t* nt, dim_t nt_max
     )
{
	// Amount of data processed per fused iteration. Promote to double
	// BEFORE multiplying so the product cannot overflow dim_t, and
	// before dividing so the ratio is not truncated by integer
	// division (m/n in integer arithmetic is 0 whenever m < n, which
	// silently forced the wrong heuristic regime).
	dim_t  n_per_loop    = n / fuse;
	double data_per_iter = (double)n_per_loop * (double)m;
	double m_n_ratio     = (double)m / (double)n;

	// When n is smaller than the fuse factor there is nothing to split.
	if(n_per_loop < 1)
	{
		*nt = 1;
		return;
	}

	// Two heuristic regimes based on the aspect ratio:
	// when m < n (ratio <= 0.6) thread spawning is less aggressive
	// than in the m > n and m == n cases.
	if(m_n_ratio <= 0.6)
	{
		// Boundary units are the amount of data processed per iteration;
		// this is the variable X in the fitted equations below.
		const double lower_boundary  = 50000;
		const double higher_boundary = 500000;

		if(data_per_iter < lower_boundary)
		{
			// Number of threads = 0.9148 * log(x) - 1.6252
			const double coeff_x  = 0.9148;
			const double constant = -1.6252;
			*nt = ceil(coeff_x * log(data_per_iter) + constant);
		}
		else if(data_per_iter < higher_boundary)
		{
			// Number of threads = 10.23 * log(x) - 82.332
			const double coeff_x  = 10.23;
			const double constant = -82.332;
			*nt = ceil(coeff_x * log(data_per_iter) + constant);
		}
		else
		{
			// Above both boundaries the data is large enough to keep
			// every available thread busy.
			*nt = nt_max;
		}
	}
	else
	{
		// Boundary units are the amount of data processed per iteration;
		// this is the variable X in the fitted equations below.
		const double lower_boundary  = 50000;
		const double higher_boundary = 360000;

		if(data_per_iter < lower_boundary)
		{
			// Number of threads = -2E-09*x^2 + 0.0002 * x + 1.0234
			const double coeff_x2 = -2E-09;
			const double coeff_x  = 0.0002;
			const double constant = 1.0234;
			*nt = ceil(coeff_x2 * (data_per_iter * data_per_iter) + coeff_x * data_per_iter + constant);
		}
		else if(data_per_iter < higher_boundary)
		{
			// Number of threads = 16.917 * log(x) - 164.82
			const double coeff_x  = 16.917;
			const double constant = -164.82;
			*nt = ceil(coeff_x * log(data_per_iter) + constant);
		}
		else
		{
			// Above both boundaries the data is large enough to keep
			// every available thread busy.
			*nt = nt_max;
		}
	}

	// Clamp to the valid range [1, nt_max]: the log models can yield
	// zero or a negative count for very small inputs, and the result
	// must never exceed the user-provided thread count.
	if(*nt < 1)
		*nt = 1;
	if(*nt > nt_max)
		*nt = nt_max;
}
void bli_sgemv_unf_var1
(
trans_t transa,
@@ -407,7 +493,46 @@ void bli_sgemv_unf_var1
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
// If both multithreading and OpenMP are enabled, GEMV will multithread
#if defined(BLIS_ENABLE_MULTITHREADING) && defined(BLIS_ENABLE_OPENMP)
dim_t nt, nt_max;
rntm_t rnmt_obj;
b_fuse = 4;
// Initialize a local runtime with global settings.
bli_rntm_init_from_global( &rnmt_obj );
// Query the total number of threads from the rntm_t object.
nt_max = bli_rntm_num_threads( &rnmt_obj );
// Set the thread count to the maximum number of threads provided
nt = nt_max;
// Enable smart threading when AOCL dynamic is enabled
#ifdef AOCL_DYNAMIC
bli_sgemv_var1_smart_threading(n_elem, n_iter, b_fuse, &nt, nt_max);
#endif
// Pass the input parameters along with the number of threads to be used
bli_multi_sgemv_4x2
(
conja,
conjx,
n_elem,
n_iter,
alpha,
a, cs_at, rs_at,
x, incx,
beta,
y, incy,
cntx,
nt
);
#else
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
@@ -434,6 +559,7 @@ void bli_sgemv_unf_var1
);
}
#endif
}
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )

View File

@@ -35,6 +35,24 @@
#include "immintrin.h"
#include "blis.h"
/* Union data structure to access AVX registers.
   One 256-bit AVX register holds 8 single-precision (SP) elements.
   The union allows the same storage to be read either as a vector
   (for intrinsics) or element-wise through the float array.
   NOTE(review): the 64-byte alignment on the member pads/aligns the
   union to a cache line; instances are used as stack locals only. */
typedef union
{
__m256 v;
float f[8] __attribute__((aligned(64)));
} v8sf_t;
/* Union data structure to access AVX registers.
 * One 128-bit AVX register holds 4 single-precision (SP) elements.
 * Same dual vector/scalar view as v8sf_t, for SSE-width leftovers. */
typedef union
{
__m128 v;
float f[4] __attribute__((aligned(64)));
} v4sf_t;
/*
This implementation uses 512 bits of cache line efficiently for
column stored matrix and vectors.
@@ -477,3 +495,380 @@ void bli_cgemv_zen_int_4x4
}
}
/*
   Performs multithreaded GEMV (A^T * x style, fused over b_n columns)
   for the float datatype. All parameters are identical to the single
   threaded dotxf kernel except n_threads, which specifies the number
   of OpenMP threads to use. Each thread computes b_fuse (= 4)
   independent dot products, so iterations of the parallel loop share
   no output elements and need no synchronization.
*/
void bli_multi_sgemv_4x2
     (
       conj_t conjat,
       conj_t conjx,
       dim_t m,
       dim_t b_n,
       float* restrict alpha,
       float* restrict a, inc_t inca, inc_t lda,
       float* restrict x, inc_t incx,
       float* restrict beta,
       float* restrict y, inc_t incy,
       cntx_t* restrict cntx,
       dim_t n_threads
     )
{
	const dim_t b_fuse = 4;
	const dim_t n_elem_per_reg = 8;
	dim_t total_iteration = 0;

	// If the b_n dimension is zero, y is empty and there is no computation.
	if (bli_zero_dim1(b_n))
		return;

	// If the m dimension is zero, or if alpha is zero, the computation
	// simplifies to updating y.
	if (bli_zero_dim1(m) || PASTEMAC(s, eq0)(*alpha))
	{
		bli_sscalv_zen_int10(
			BLIS_NO_CONJUGATE,
			b_n,
			beta,
			y, incy,
			cntx);
		return;
	}

	// If b_n is smaller than the fusing factor, perform the entire
	// operation as a loop over dotxv.
	if (b_n < b_fuse)
	{
		for (dim_t i = 0; i < b_n; ++i)
		{
			float *a1 = a + (0) * inca + (i)*lda;
			float *x1 = x + (0) * incx;
			float *psi1 = y + (i)*incy;

			bli_sdotxv_zen_int(
				conjat,
				conjx,
				m,
				alpha,
				a1, inca,
				x1, incx,
				beta,
				psi1,
				cntx);
		}
		return;
	}

	// Calculate the total number of multithreaded iterations; each one
	// handles b_fuse columns of A (elements of y).
	total_iteration = b_n / b_fuse;

#pragma omp parallel for num_threads(n_threads)
	for (dim_t j = 0; j < total_iteration; j++)
	{
		float *A1 = a + (b_fuse * j) * lda;
		float *x1 = x;
		float *y1 = y + (b_fuse * j) * incy;

		// Intermediate variables to hold the completed dot products.
		float rho0[4] = {0, 0, 0, 0};

		// If vectorization is possible, perform them with vector
		// instructions.
		if (inca == 1 && incx == 1)
		{
			const dim_t n_iter_unroll = 2;

			// Use the unrolling factor and the number of elements per register
			// to compute the number of vectorized and leftover iterations.
			dim_t l, unroll_inc, m_viter[2], m_left = m;

			unroll_inc = n_elem_per_reg * n_iter_unroll;

			m_viter[0] = m_left / unroll_inc;      // 2x-unrolled AVX iterations
			m_left = m_left % unroll_inc;

			m_viter[1] = m_left / n_elem_per_reg;  // single AVX iterations
			m_left = m_left % n_elem_per_reg;      // < 8 remaining elements

			// Set up pointers for x and the b_n columns of A (rows of A^T).
			float *restrict x0 = x1;
			float *restrict av[4];

			av[0] = A1 + 0 * lda;
			av[1] = A1 + 1 * lda;
			av[2] = A1 + 2 * lda;
			av[3] = A1 + 3 * lda;

			// Initialize b_n rho vector accumulators to zero.
			v8sf_t rhov[4];

			rhov[0].v = _mm256_setzero_ps();
			rhov[1].v = _mm256_setzero_ps();
			rhov[2].v = _mm256_setzero_ps();
			rhov[3].v = _mm256_setzero_ps();

			v8sf_t xv[2];
			v8sf_t a_vec[8];

			// The FMA operation is broken down to mul and add
			// to reduce backend stalls.
			for (l = 0; l < m_viter[0]; ++l)
			{
				xv[0].v = _mm256_loadu_ps(x0);
				x0 += n_elem_per_reg;

				xv[1].v = _mm256_loadu_ps(x0);
				x0 += n_elem_per_reg;

				a_vec[0].v = _mm256_loadu_ps(av[0]);
				a_vec[4].v = _mm256_loadu_ps(av[0] + n_elem_per_reg);

				// perform: rho?v += a?v * x0v;
				a_vec[0].v = _mm256_mul_ps(a_vec[0].v, xv[0].v);
				rhov[0].v = _mm256_fmadd_ps(a_vec[4].v, xv[1].v, rhov[0].v);
				rhov[0].v = _mm256_add_ps(a_vec[0].v, rhov[0].v);

				a_vec[1].v = _mm256_loadu_ps(av[1]);
				a_vec[5].v = _mm256_loadu_ps(av[1] + n_elem_per_reg);

				a_vec[1].v = _mm256_mul_ps(a_vec[1].v, xv[0].v);
				rhov[1].v = _mm256_fmadd_ps(a_vec[5].v, xv[1].v, rhov[1].v);
				rhov[1].v = _mm256_add_ps(a_vec[1].v, rhov[1].v);

				a_vec[2].v = _mm256_loadu_ps(av[2]);
				a_vec[6].v = _mm256_loadu_ps(av[2] + n_elem_per_reg);

				a_vec[2].v = _mm256_mul_ps(a_vec[2].v, xv[0].v);
				rhov[2].v = _mm256_fmadd_ps(a_vec[6].v, xv[1].v, rhov[2].v);
				rhov[2].v = _mm256_add_ps(a_vec[2].v, rhov[2].v);

				a_vec[3].v = _mm256_loadu_ps(av[3]);
				a_vec[7].v = _mm256_loadu_ps(av[3] + n_elem_per_reg);

				a_vec[3].v = _mm256_mul_ps(a_vec[3].v, xv[0].v);
				rhov[3].v = _mm256_fmadd_ps(a_vec[7].v, xv[1].v, rhov[3].v);
				rhov[3].v = _mm256_add_ps(a_vec[3].v, rhov[3].v);

				av[0] += unroll_inc;
				av[1] += unroll_inc;
				av[2] += unroll_inc;
				av[3] += unroll_inc;
			}

			for (l = 0; l < m_viter[1]; ++l)
			{
				// Load the input values.
				xv[0].v = _mm256_loadu_ps(x0);
				x0 += n_elem_per_reg;

				a_vec[0].v = _mm256_loadu_ps(av[0]);
				a_vec[1].v = _mm256_loadu_ps(av[1]);

				rhov[0].v = _mm256_fmadd_ps(a_vec[0].v, xv[0].v, rhov[0].v);
				rhov[1].v = _mm256_fmadd_ps(a_vec[1].v, xv[0].v, rhov[1].v);

				av[0] += n_elem_per_reg;
				av[1] += n_elem_per_reg;

				a_vec[2].v = _mm256_loadu_ps(av[2]);
				a_vec[3].v = _mm256_loadu_ps(av[3]);

				rhov[2].v = _mm256_fmadd_ps(a_vec[2].v, xv[0].v, rhov[2].v);
				rhov[3].v = _mm256_fmadd_ps(a_vec[3].v, xv[0].v, rhov[3].v);

				av[2] += n_elem_per_reg;
				av[3] += n_elem_per_reg;
			}

			// Sum the elements within each vector.
			// Sum the elements of a given rho?v with hadd.
			rhov[0].v = _mm256_hadd_ps(rhov[0].v, rhov[1].v);
			rhov[2].v = _mm256_hadd_ps(rhov[2].v, rhov[3].v);
			rhov[0].v = _mm256_hadd_ps(rhov[0].v, rhov[0].v);
			rhov[2].v = _mm256_hadd_ps(rhov[2].v, rhov[2].v);

			// Manually add the results from above to finish the sum
			// (hadd leaves the partial sums in both 128-bit lanes).
			rho0[0] = rhov[0].f[0] + rhov[0].f[4];
			rho0[1] = rhov[0].f[1] + rhov[0].f[5];
			rho0[2] = rhov[2].f[0] + rhov[2].f[4];
			rho0[3] = rhov[2].f[1] + rhov[2].f[5];

			// If leftover elements are more than 4, consume 4 with SSE.
			if (m_left > 4)
			{
				v4sf_t xv128, a_vec128[4], rhov128[4];

				rhov128[0].v = _mm_set1_ps(0);
				rhov128[1].v = _mm_set1_ps(0);
				rhov128[2].v = _mm_set1_ps(0);
				rhov128[3].v = _mm_set1_ps(0);

				// Load the input values.
				xv128.v = _mm_loadu_ps(x0 + 0 * n_elem_per_reg);
				x0 += 4;
				m_left -= 4;

				a_vec128[0].v = _mm_loadu_ps(av[0]);
				a_vec128[1].v = _mm_loadu_ps(av[1]);

				// perform: rho?v += a?v * x0v;
				rhov128[0].v = _mm_fmadd_ps(a_vec128[0].v, xv128.v, rhov128[0].v);
				rhov128[1].v = _mm_fmadd_ps(a_vec128[1].v, xv128.v, rhov128[1].v);
				rhov128[0].v = _mm_hadd_ps(rhov128[0].v, rhov128[1].v);
				rhov128[0].v = _mm_hadd_ps(rhov128[0].v, rhov128[0].v);

				a_vec128[2].v = _mm_loadu_ps(av[2]);
				a_vec128[3].v = _mm_loadu_ps(av[3]);

				rhov128[2].v = _mm_fmadd_ps(a_vec128[2].v, xv128.v, rhov128[2].v);
				rhov128[3].v = _mm_fmadd_ps(a_vec128[3].v, xv128.v, rhov128[3].v);
				rhov128[2].v = _mm_hadd_ps(rhov128[2].v, rhov128[3].v);
				rhov128[2].v = _mm_hadd_ps(rhov128[2].v, rhov128[2].v);

				rho0[0] += rhov128[0].f[0];
				rho0[1] += rhov128[0].f[1];
				rho0[2] += rhov128[2].f[0];
				rho0[3] += rhov128[2].f[1];

				av[0] += 4;
				av[1] += 4;
				av[2] += 4;
				av[3] += 4;
			}

			// If there are leftover iterations, perform them with scalar code.
			for (l = 0; l < m_left; ++l)
			{
				rho0[0] += *(av[0]) * (*x0);
				rho0[1] += *(av[1]) * (*x0);
				rho0[2] += *(av[2]) * (*x0);
				rho0[3] += *(av[3]) * (*x0);

				x0 += incx;
				av[0] += inca;
				av[1] += inca;
				av[2] += inca;
				av[3] += inca;
			}
		}
		else
		{
			// When vectorization is not possible, perform with scalar code.
			// Initialize pointers for x and the b_n columns of A (rows of A^T).
			float *restrict x0 = x1;
			float *restrict a0 = A1 + 0 * lda;
			float *restrict a1 = A1 + 1 * lda;
			float *restrict a2 = A1 + 2 * lda;
			float *restrict a3 = A1 + 3 * lda;

			for (dim_t l = 0; l < m; ++l)
			{
				const float x0c = *x0;

				const float a0c = *a0;
				const float a1c = *a1;
				const float a2c = *a2;
				const float a3c = *a3;

				rho0[0] += a0c * x0c;
				rho0[1] += a1c * x0c;
				rho0[2] += a2c * x0c;
				rho0[3] += a3c * x0c;

				x0 += incx;
				a0 += inca;
				a1 += inca;
				a2 += inca;
				a3 += inca;
			}
		}

		v4sf_t rho0v, y0v;

		rho0v.v = _mm_loadu_ps(rho0);

		// Broadcast the alpha scalar.
		v4sf_t alphav;
		alphav.v = _mm_broadcast_ss(alpha);

		// We know at this point that alpha is nonzero; however, beta may still
		// be zero. If beta is indeed zero, we must overwrite y rather than scale
		// by beta (in case y contains NaN or Inf).
		if (PASTEMAC(s, eq0)(*beta))
		{
			// Apply alpha to the accumulated dot product in rho:
			// y := alpha * rho
			y0v.v = _mm_mul_ps(alphav.v, rho0v.v);
		}
		else
		{
			// Broadcast the beta scalar.
			v4sf_t betav;
			betav.v = _mm_broadcast_ss(beta);

			// BUGFIX: the contiguous vector load is only valid for unit
			// stride (this previously tested incy == 0, which read 4
			// adjacent floats from a strided/degenerate y — wrong values
			// and a potential out-of-bounds read — while the unit-stride
			// case fell through to the slow scalar loads). This now
			// mirrors the unit-stride test used by the store below.
			if (incy == 1)
			{
				// Load y.
				y0v.v = _mm_loadu_ps(y1 + 0 * n_elem_per_reg);
			}
			else
			{
				// Load y element-wise for non-unit stride.
				y0v.f[0] = *(y1 + 0 * incy);
				y0v.f[1] = *(y1 + 1 * incy);
				y0v.f[2] = *(y1 + 2 * incy);
				y0v.f[3] = *(y1 + 3 * incy);
			}

			// Apply beta to y and alpha to the accumulated dot product in rho:
			// y := beta * y + alpha * rho
			y0v.v = _mm_mul_ps(betav.v, y0v.v);
			y0v.v = _mm_fmadd_ps(alphav.v, rho0v.v, y0v.v);
		}

		// Store the output.
		if (incy == 1)
		{
			_mm_storeu_ps((y1 + 0 * n_elem_per_reg), y0v.v);
		}
		else
		{
			// Store the output element-wise for non-unit stride.
			*(y1 + 0 * incy) = y0v.f[0];
			*(y1 + 1 * incy) = y0v.f[1];
			*(y1 + 2 * incy) = y0v.f[2];
			*(y1 + 3 * incy) = y0v.f[3];
		}
	}

	// Leftover columns (b_n not a multiple of b_fuse) are completed
	// serially using the fused dotxf kernel.
	dim_t start = total_iteration * b_fuse;
	dim_t new_fuse = 8, f;

	for (dim_t i = start; i < b_n; i += f)
	{
		f = bli_determine_blocksize_dim_f(i, b_n, new_fuse);

		float *A1 = a + (i)*lda + (0) * inca;
		float *x1 = x + (0) * incx;
		float *y1 = y + (i)*incy;

		/* y1 = beta * y1 + alpha * A1 * x; */
		bli_sdotxf_zen_int_8(
			conjat,
			conjx,
			m,
			f,
			alpha,
			A1, inca, lda,
			x1, incx,
			beta,
			y1, incy,
			cntx);
	}
}