Optimised the dotv kernel using SIMD and by removing framework overhead

Details:
    - For complex float and complex double precision, the BLAS API wrappers now call the kernel directly, avoiding framework overhead.
    - Added SIMD kernels for complex float and complex double, with the main loop unrolled 5 times to improve performance.

AMD-Internal: [CPUPL-1057]

Change-Id: I3b9d202398cacc0168882c9d6da2b450c27466a0
This commit is contained in:
managalv
2020-10-06 20:08:12 +05:30
committed by Dipal M Zambare
parent 1c6cf5c891
commit 5716dd8cf9
7 changed files with 1383 additions and 457 deletions

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
16,
18,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
@@ -99,6 +99,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -88,7 +88,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
16,
18,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
@@ -103,6 +103,8 @@ void bli_cntx_init_zen2( cntx_t* cntx )
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -88,7 +88,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
16,
18,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
@@ -103,6 +103,8 @@ void bli_cntx_init_zen3( cntx_t* cntx )
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -49,41 +49,41 @@ ftype PASTEF772(ch,blasname,chc) \
const ftype* y, const f77_int* incy \
) \
{ \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
ftype rho; \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
ftype rho; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_conjx, \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
&rho, \
NULL, \
NULL \
); \
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_conjx, \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
&rho, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
\
return rho; \
return rho; \
}
#ifdef BLIS_ENABLE_BLAS
@@ -96,41 +96,120 @@ dcomplex zdotc_
const dcomplex* y, const f77_int* incy
)
{
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
/* Initialize BLIS. */
bli_init_auto();
/* Initialize BLIS. */
bli_init_auto();
/* Convert/typecast negative values of n to zero. */
bli_convert_blas_dim1( *n, n0 );
/* Convert/typecast negative values of n to zero. */
bli_convert_blas_dim1( *n, n0 );
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
bli_convert_blas_incv( n0, (dcomplex*)x, *incx, x0, incx0 );
bli_convert_blas_incv( n0, (dcomplex*)y, *incy, y0, incy0 );
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
bli_convert_blas_incv( n0, (dcomplex*)x, *incx, x0, incx0 );
bli_convert_blas_incv( n0, (dcomplex*)y, *incy, y0, incy0 );
/* Call BLIS interface. */
PASTEMAC2(z,dotv,_ex)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
/* Call BLIS interface. */
PASTEMAC2(z,dotv,_ex)
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL,
NULL
);
/* Finalize BLIS. */
bli_finalize_auto();
/* Finalize BLIS. */
bli_finalize_auto();
*ret_val = rho;
return rho;
return rho;
}
#else
dcomplex zdotc_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel. */
bli_zdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
return rho;
}
#endif
@@ -157,8 +236,8 @@ float sdot_
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
@@ -197,22 +276,22 @@ float sdot_
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel. */
bli_sdotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Call BLIS kernel. */
bli_sdotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
/* Finalize BLIS. */
// bli_finalize_auto();
return rho;
return rho;
}
double ddot_
@@ -222,22 +301,22 @@ double ddot_
const double* y, const f77_int* incy
)
{
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
double rho;
dim_t n0;
double* x0;
double* y0;
inc_t incx0;
inc_t incy0;
double rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
@@ -276,28 +355,261 @@ double ddot_
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel. */
bli_ddotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Call BLIS kernel. */
bli_ddotv_zen_int10
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
/* Finalize BLIS. */
// bli_finalize_auto();
return rho;
return rho;
}
scomplex cdotu_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
return rho;
}
dcomplex zdotu_
(
const f77_int* n,
const dcomplex* x, const f77_int* incx,
const dcomplex* y, const f77_int* incy
)
{
dim_t n0;
dcomplex* x0;
dcomplex* y0;
inc_t incx0;
inc_t incy0;
dcomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((dcomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((dcomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel. */
bli_zdotv_zen_int5
(
BLIS_NO_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
return rho;
}
scomplex cdotc_
(
const f77_int* n,
const scomplex* x, const f77_int* incx,
const scomplex* y, const f77_int* incy
)
{
dim_t n0;
scomplex* x0;
scomplex* y0;
inc_t incx0;
inc_t incy0;
scomplex rho;
/* Initialize BLIS. */
// bli_init_auto();
/* Convert/typecast negative values of n to zero. */
if ( *n < 0 ) n0 = ( dim_t )0;
else n0 = ( dim_t )(*n);
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
if ( *incx < 0 )
{
/* The semantics of negative stride in BLAS are that the vector
operand be traversed in reverse order. (Another way to think
of this is that negative strides effectively reverse the order
of the vector, but without any explicit data movements.) This
is also how BLIS interprets negative strides. The differences
is that with BLAS, the caller *always* passes in the 0th (i.e.,
top-most or left-most) element of the vector, even when the
stride is negative. By contrast, in BLIS, negative strides are
used *relative* to the vector address as it is given. Thus, in
BLIS, if this backwards traversal is desired, the caller *must*
pass in the address to the (n-1)th (i.e., the bottom-most or
right-most) element along with a negative stride. */
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
incx0 = ( inc_t )(*incx);
}
else
{
x0 = ((scomplex*)x);
incx0 = ( inc_t )(*incx);
}
if ( *incy < 0 )
{
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
incy0 = ( inc_t )(*incy);
}
else
{
y0 = ((scomplex*)y);
incy0 = ( inc_t )(*incy);
}
/* Call BLIS kernel. */
bli_cdotv_zen_int5
(
BLIS_CONJUGATE,
BLIS_NO_CONJUGATE,
n0,
x0, incx0,
y0, incy0,
&rho,
NULL
);
/* Finalize BLIS. */
// bli_finalize_auto();
return rho;
}
#ifdef AOCL_F2C
INSERT_GENTFUNCDOT_BLAS_CZ_F2C( dot, dotv)
#else
INSERT_GENTFUNCDOT_BLAS_CZ( dot, dotv )
#endif
#else
#ifdef AOCL_F2C
INSERT_GENTFUNCDOT_BLAS_SDC( dot, dotv )
@@ -318,16 +630,16 @@ float PASTEF77(sd,sdot)
const float* y, const f77_int* incy
)
{
return ( float )
(
( double )(*sb) +
PASTEF77(d,sdot)
(
n,
x, incx,
y, incy
)
);
return ( float )
(
( double )(*sb) +
PASTEF77(d,sdot)
(
n,
x, incx,
y, incy
)
);
}
// Input vectors stored in single precision, computed in double precision,
@@ -339,39 +651,39 @@ double PASTEF77(d,sdot)
const float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
double rho;
dim_t i;
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
double rho;
dim_t i;
/* Initialization of BLIS is not required. */
/* Initialization of BLIS is not required. */
/* Convert/typecast negative values of n to zero. */
bli_convert_blas_dim1( *n, n0 );
/* Convert/typecast negative values of n to zero. */
bli_convert_blas_dim1( *n, n0 );
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
rho = 0.0;
rho = 0.0;
for ( i = 0; i < n0; i++ )
{
float* chi1 = x0 + (i )*incx0;
float* psi1 = y0 + (i )*incy0;
for ( i = 0; i < n0; i++ )
{
float* chi1 = x0 + (i )*incx0;
float* psi1 = y0 + (i )*incy0;
bli_ddots( (( double )(*chi1)),
(( double )(*psi1)), rho );
}
bli_ddots( (( double )(*chi1)),
(( double )(*psi1)), rho );
}
/* Finalization of BLIS is not required, because initialization was
not required. */
/* Finalization of BLIS is not required, because initialization was
not required. */
return rho;
return rho;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -58,9 +58,11 @@ AXPYV_KER_PROT( double, d, axpyv_zen_int )
DOTV_KER_PROT( float, s, dotv_zen_int )
DOTV_KER_PROT( double, d, dotv_zen_int )
// dotv (intrinsics, unrolled x10)
DOTV_KER_PROT( float, s, dotv_zen_int10 )
DOTV_KER_PROT( double, d, dotv_zen_int10 )
// dotv (intrinsics, unrolled x10 for s/d; unrolled x5 for c/z)
DOTV_KER_PROT( float, s, dotv_zen_int10 )
DOTV_KER_PROT( double, d, dotv_zen_int10 )
DOTV_KER_PROT( scomplex, c, dotv_zen_int5 )
DOTV_KER_PROT( dcomplex, z, dotv_zen_int5 )
// dotxv (intrinsics)
DOTXV_KER_PROT( float, s, dotxv_zen_int )

4
kernels/zen2/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore