mirror of
https://github.com/amd/blis.git
synced 2026-05-24 10:24:34 +00:00
AVX2 and AVX512 optimizations for DAXPYV
- Removed some of the unrolling factors that affected the performance of AVX2 DAXPYV kernel. In addition to improving the current performance on sizes compatible to single-threaded runs, this will now perform better for tiny sizes as well since the overhead to reach the computation is less. - Updated the vector partitioning logic, by using bli_thread_range_sub( ... ), which ensures that there is no false sharing among multiple threads. - Updated the AOCL-DYNAMIC logic for the API, to include thresholds or zen4 and zen5 micro-architectures. AMD-Internal: [CPUPL-5514] Change-Id: Iee9edddac685334213cd6694421ab3df3547e930
This commit is contained in:
@@ -1779,7 +1779,45 @@ BLIS_INLINE void aocl_daxpyv_dynamic
|
||||
switch (arch_id)
|
||||
{
|
||||
case BLIS_ARCH_ZEN5:
|
||||
|
||||
if ( n_elem <= 34000 )
|
||||
*nt_ideal = 1;
|
||||
else if ( n_elem <= 82000 )
|
||||
*nt_ideal = 4;
|
||||
else if ( n_elem <= 2330000 )
|
||||
*nt_ideal = 8;
|
||||
else if ( n_elem <= 4250000 )
|
||||
*nt_ideal = 16;
|
||||
else if ( n_elem <= 7000000 )
|
||||
*nt_ideal = 32;
|
||||
else if ( n_elem <= 21300000 )
|
||||
*nt_ideal = 64;
|
||||
else
|
||||
// For sizes in this range, AOCL dynamic does not make any change
|
||||
*nt_ideal = -1;
|
||||
|
||||
break;
|
||||
|
||||
case BLIS_ARCH_ZEN4:
|
||||
|
||||
if ( n_elem <= 11000 )
|
||||
*nt_ideal = 1;
|
||||
else if ( n_elem <= 130000 )
|
||||
*nt_ideal = 4;
|
||||
else if ( n_elem <= 2230000 )
|
||||
*nt_ideal = 8;
|
||||
else if ( n_elem <= 3400000 )
|
||||
*nt_ideal = 16;
|
||||
else if ( n_elem <= 9250000 )
|
||||
*nt_ideal = 32;
|
||||
else if ( n_elem <= 15800000 )
|
||||
*nt_ideal = 64;
|
||||
else
|
||||
// For sizes in this range, AOCL dynamic does not make any change
|
||||
*nt_ideal = -1;
|
||||
|
||||
break;
|
||||
|
||||
case BLIS_ARCH_ZEN:
|
||||
case BLIS_ARCH_ZEN2:
|
||||
case BLIS_ARCH_ZEN3:
|
||||
|
||||
@@ -397,26 +397,37 @@ void daxpy_blis_impl
|
||||
|
||||
_Pragma("omp parallel num_threads(nt)")
|
||||
{
|
||||
dim_t start, length;
|
||||
dim_t start, end, length;
|
||||
thrinfo_t thrinfo_vec;
|
||||
|
||||
// Get the thread ID
|
||||
dim_t thread_id = omp_get_thread_num();
|
||||
// The block size is the minimum factor, whose multiple will ensure that only
|
||||
// the vector code section is executed. Furthermore, for double datatype it corresponds
|
||||
// to one cacheline size.
|
||||
dim_t block_size = 8;
|
||||
|
||||
// Get the actual number of threads spawned
|
||||
dim_t nt_use = omp_get_num_threads();
|
||||
thrinfo_vec.n_way = omp_get_num_threads();
|
||||
|
||||
// Get the thread ID
|
||||
thrinfo_vec.work_id = omp_get_thread_num();
|
||||
|
||||
/*
|
||||
Calculate the compute range for the current thread
|
||||
based on the actual number of threads spawned
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
|
||||
bli_thread_range_sub
|
||||
(
|
||||
&thrinfo_vec,
|
||||
n_elem,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
block_size,
|
||||
FALSE,
|
||||
&start,
|
||||
&end
|
||||
);
|
||||
|
||||
length = end - start;
|
||||
|
||||
// Adjust the local pointer for computation
|
||||
double *x_thread_local = x0 + (start * incx0);
|
||||
double *y_thread_local = y0 + (start * incy0);
|
||||
|
||||
@@ -360,9 +360,9 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
|
||||
double* restrict y0 = y;
|
||||
|
||||
__m256d alphav;
|
||||
__m256d xv[13];
|
||||
__m256d yv[13];
|
||||
__m256d zv[13];
|
||||
__m256d xv[4];
|
||||
__m256d yv[4];
|
||||
__m256d zv[4];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) )
|
||||
@@ -380,151 +380,7 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav = _mm256_broadcast_sd( alpha );
|
||||
|
||||
for (i = 0; (i + 51) < n; i += 52)
|
||||
{
|
||||
// 52 elements will be processed per loop; 13 FMAs will run per loop.
|
||||
xv[0] = _mm256_loadu_pd(x0 + 0 * n_elem_per_reg);
|
||||
xv[1] = _mm256_loadu_pd(x0 + 1 * n_elem_per_reg);
|
||||
xv[2] = _mm256_loadu_pd(x0 + 2 * n_elem_per_reg);
|
||||
xv[3] = _mm256_loadu_pd(x0 + 3 * n_elem_per_reg);
|
||||
xv[4] = _mm256_loadu_pd(x0 + 4 * n_elem_per_reg);
|
||||
xv[5] = _mm256_loadu_pd(x0 + 5 * n_elem_per_reg);
|
||||
xv[6] = _mm256_loadu_pd(x0 + 6 * n_elem_per_reg);
|
||||
xv[7] = _mm256_loadu_pd(x0 + 7 * n_elem_per_reg);
|
||||
xv[8] = _mm256_loadu_pd(x0 + 8 * n_elem_per_reg);
|
||||
xv[9] = _mm256_loadu_pd(x0 + 9 * n_elem_per_reg);
|
||||
xv[10] = _mm256_loadu_pd(x0 + 10 * n_elem_per_reg);
|
||||
xv[11] = _mm256_loadu_pd(x0 + 11 * n_elem_per_reg);
|
||||
xv[12] = _mm256_loadu_pd(x0 + 12 * n_elem_per_reg);
|
||||
|
||||
yv[0] = _mm256_loadu_pd(y0 + 0 * n_elem_per_reg);
|
||||
yv[1] = _mm256_loadu_pd(y0 + 1 * n_elem_per_reg);
|
||||
yv[2] = _mm256_loadu_pd(y0 + 2 * n_elem_per_reg);
|
||||
yv[3] = _mm256_loadu_pd(y0 + 3 * n_elem_per_reg);
|
||||
yv[4] = _mm256_loadu_pd(y0 + 4 * n_elem_per_reg);
|
||||
yv[5] = _mm256_loadu_pd(y0 + 5 * n_elem_per_reg);
|
||||
yv[6] = _mm256_loadu_pd(y0 + 6 * n_elem_per_reg);
|
||||
yv[7] = _mm256_loadu_pd(y0 + 7 * n_elem_per_reg);
|
||||
yv[8] = _mm256_loadu_pd(y0 + 8 * n_elem_per_reg);
|
||||
yv[9] = _mm256_loadu_pd(y0 + 9 * n_elem_per_reg);
|
||||
yv[10] = _mm256_loadu_pd(y0 + 10 * n_elem_per_reg);
|
||||
yv[11] = _mm256_loadu_pd(y0 + 11 * n_elem_per_reg);
|
||||
yv[12] = _mm256_loadu_pd(y0 + 12 * n_elem_per_reg);
|
||||
|
||||
zv[0] = _mm256_fmadd_pd(xv[0], alphav, yv[0]);
|
||||
zv[1] = _mm256_fmadd_pd(xv[1], alphav, yv[1]);
|
||||
zv[2] = _mm256_fmadd_pd(xv[2], alphav, yv[2]);
|
||||
zv[3] = _mm256_fmadd_pd(xv[3], alphav, yv[3]);
|
||||
zv[4] = _mm256_fmadd_pd(xv[4], alphav, yv[4]);
|
||||
zv[5] = _mm256_fmadd_pd(xv[5], alphav, yv[5]);
|
||||
zv[6] = _mm256_fmadd_pd(xv[6], alphav, yv[6]);
|
||||
zv[7] = _mm256_fmadd_pd(xv[7], alphav, yv[7]);
|
||||
zv[8] = _mm256_fmadd_pd(xv[8], alphav, yv[8]);
|
||||
zv[9] = _mm256_fmadd_pd(xv[9], alphav, yv[9]);
|
||||
zv[10] = _mm256_fmadd_pd(xv[10], alphav, yv[10]);
|
||||
zv[11] = _mm256_fmadd_pd(xv[11], alphav, yv[11]);
|
||||
zv[12] = _mm256_fmadd_pd(xv[12], alphav, yv[12]);
|
||||
|
||||
_mm256_storeu_pd((y0 + 0 * n_elem_per_reg), zv[0]);
|
||||
_mm256_storeu_pd((y0 + 1 * n_elem_per_reg), zv[1]);
|
||||
_mm256_storeu_pd((y0 + 2 * n_elem_per_reg), zv[2]);
|
||||
_mm256_storeu_pd((y0 + 3 * n_elem_per_reg), zv[3]);
|
||||
_mm256_storeu_pd((y0 + 4 * n_elem_per_reg), zv[4]);
|
||||
_mm256_storeu_pd((y0 + 5 * n_elem_per_reg), zv[5]);
|
||||
_mm256_storeu_pd((y0 + 6 * n_elem_per_reg), zv[6]);
|
||||
_mm256_storeu_pd((y0 + 7 * n_elem_per_reg), zv[7]);
|
||||
_mm256_storeu_pd((y0 + 8 * n_elem_per_reg), zv[8]);
|
||||
_mm256_storeu_pd((y0 + 9 * n_elem_per_reg), zv[9]);
|
||||
_mm256_storeu_pd((y0 + 10 * n_elem_per_reg), zv[10]);
|
||||
_mm256_storeu_pd((y0 + 11 * n_elem_per_reg), zv[11]);
|
||||
_mm256_storeu_pd((y0 + 12 * n_elem_per_reg), zv[12]);
|
||||
|
||||
x0 += 13 * n_elem_per_reg;
|
||||
y0 += 13 * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 39) < n; i += 40 )
|
||||
{
|
||||
// 40 elements will be processed per loop; 10 FMAs will run per loop.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
|
||||
xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
|
||||
xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
|
||||
yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
|
||||
yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
|
||||
yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
|
||||
yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
|
||||
yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
|
||||
zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
|
||||
zv[5] = _mm256_fmadd_pd( xv[5], alphav, yv[5] );
|
||||
zv[6] = _mm256_fmadd_pd( xv[6], alphav, yv[6] );
|
||||
zv[7] = _mm256_fmadd_pd( xv[7], alphav, yv[7] );
|
||||
zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] );
|
||||
zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
|
||||
_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] );
|
||||
_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] );
|
||||
_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] );
|
||||
_mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] );
|
||||
_mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
y0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 19) < n; i += 20 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
|
||||
zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
y0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
for ( i = 0; ( i + 15 ) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
@@ -550,7 +406,7 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; i + 7 < n; i += 8 )
|
||||
for ( ; ( i + 7 ) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
@@ -568,7 +424,7 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen_int10
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; i + 3 < n; i += 4 )
|
||||
for ( ; ( i + 3 ) < n; i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
|
||||
Reference in New Issue
Block a user