mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Optimized the dotv kernel using SIMD and by removing framework overhead
Details:
- The kernel is called directly from the BLAS API call to avoid framework overhead for the complex float and complex double precisions.
- Added SIMD code for complex float and complex double, and unrolled the for loop 5 times to improve performance.
AMD-Internal: [CPUPL-1057]
Change-Id: I3b9d202398cacc0168882c9d6da2b450c27466a0
This commit is contained in:
committed by
Dipal M Zambare
parent
1c6cf5c891
commit
5716dd8cf9
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -82,7 +82,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
16,
|
||||
18,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
@@ -99,6 +99,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -88,7 +88,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
16,
|
||||
18,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
@@ -103,6 +103,8 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -88,7 +88,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
16,
|
||||
18,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
@@ -103,6 +103,8 @@ void bli_cntx_init_zen3( cntx_t* cntx )
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
|
||||
BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -49,41 +49,41 @@ ftype PASTEF772(ch,blasname,chc) \
|
||||
const ftype* y, const f77_int* incy \
|
||||
) \
|
||||
{ \
|
||||
dim_t n0; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
ftype rho; \
|
||||
dim_t n0; \
|
||||
ftype* x0; \
|
||||
ftype* y0; \
|
||||
inc_t incx0; \
|
||||
inc_t incy0; \
|
||||
ftype rho; \
|
||||
\
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
/* Initialize BLIS. */ \
|
||||
bli_init_auto(); \
|
||||
\
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
/* Convert/typecast negative values of n to zero. */ \
|
||||
bli_convert_blas_dim1( *n, n0 ); \
|
||||
\
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */ \
|
||||
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
|
||||
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
|
||||
\
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
blis_conjx, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
&rho, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
/* Call BLIS interface. */ \
|
||||
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
blis_conjx, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n0, \
|
||||
x0, incx0, \
|
||||
y0, incy0, \
|
||||
&rho, \
|
||||
NULL, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
\
|
||||
return rho; \
|
||||
return rho; \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
@@ -96,41 +96,120 @@ dcomplex zdotc_
|
||||
const dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
bli_convert_blas_incv( n0, (dcomplex*)x, *incx, x0, incx0 );
|
||||
bli_convert_blas_incv( n0, (dcomplex*)y, *incy, y0, incy0 );
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
bli_convert_blas_incv( n0, (dcomplex*)x, *incx, x0, incx0 );
|
||||
bli_convert_blas_incv( n0, (dcomplex*)y, *incy, y0, incy0 );
|
||||
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(z,dotv,_ex)
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
/* Call BLIS interface. */
|
||||
PASTEMAC2(z,dotv,_ex)
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
*ret_val = rho;
|
||||
return rho;
|
||||
return rho;
|
||||
}
|
||||
#else
|
||||
dcomplex zdotc_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
const dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Call BLIS kernel. */
|
||||
bli_zdotv_zen_int5
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return rho;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -157,8 +236,8 @@ float sdot_
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
@@ -197,22 +276,22 @@ float sdot_
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Call BLIS kernel. */
|
||||
bli_sdotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
/* Call BLIS kernel. */
|
||||
bli_sdotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return rho;
|
||||
return rho;
|
||||
}
|
||||
|
||||
double ddot_
|
||||
@@ -222,22 +301,22 @@ double ddot_
|
||||
const double* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
double rho;
|
||||
dim_t n0;
|
||||
double* x0;
|
||||
double* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
double rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
@@ -276,28 +355,261 @@ double ddot_
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Call BLIS kernel. */
|
||||
bli_ddotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
/* Call BLIS kernel. */
|
||||
bli_ddotv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return rho;
|
||||
return rho;
|
||||
}
|
||||
|
||||
scomplex cdotu_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
const scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
scomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
dcomplex zdotu_
|
||||
(
|
||||
const f77_int* n,
|
||||
const dcomplex* x, const f77_int* incx,
|
||||
const dcomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
dcomplex* x0;
|
||||
dcomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
dcomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((dcomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((dcomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((dcomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((dcomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Call BLIS kernel. */
|
||||
bli_zdotv_zen_int5
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return rho;
|
||||
}
|
||||
|
||||
|
||||
scomplex cdotc_
|
||||
(
|
||||
const f77_int* n,
|
||||
const scomplex* x, const f77_int* incx,
|
||||
const scomplex* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
scomplex* x0;
|
||||
scomplex* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
scomplex rho;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
// bli_init_auto();
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
if ( *n < 0 ) n0 = ( dim_t )0;
|
||||
else n0 = ( dim_t )(*n);
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
|
||||
if ( *incx < 0 )
|
||||
{
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The differences
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */
|
||||
|
||||
x0 = ((scomplex*)x) + (n0-1)*(-*incx);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
x0 = ((scomplex*)x);
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
if ( *incy < 0 )
|
||||
{
|
||||
y0 = ((scomplex*)y) + (n0-1)*(-*incy);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((scomplex*)y);
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
/* Call BLIS kernel. */
|
||||
bli_cdotv_zen_int5
|
||||
(
|
||||
BLIS_CONJUGATE,
|
||||
BLIS_NO_CONJUGATE,
|
||||
n0,
|
||||
x0, incx0,
|
||||
y0, incy0,
|
||||
&rho,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// bli_finalize_auto();
|
||||
|
||||
return rho;
|
||||
}
|
||||
#ifdef AOCL_F2C
|
||||
INSERT_GENTFUNCDOT_BLAS_CZ_F2C( dot, dotv)
|
||||
#else
|
||||
INSERT_GENTFUNCDOT_BLAS_CZ( dot, dotv )
|
||||
#endif
|
||||
#else
|
||||
#ifdef AOCL_F2C
|
||||
INSERT_GENTFUNCDOT_BLAS_SDC( dot, dotv )
|
||||
@@ -318,16 +630,16 @@ float PASTEF77(sd,sdot)
|
||||
const float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
return ( float )
|
||||
(
|
||||
( double )(*sb) +
|
||||
PASTEF77(d,sdot)
|
||||
(
|
||||
n,
|
||||
x, incx,
|
||||
y, incy
|
||||
)
|
||||
);
|
||||
return ( float )
|
||||
(
|
||||
( double )(*sb) +
|
||||
PASTEF77(d,sdot)
|
||||
(
|
||||
n,
|
||||
x, incx,
|
||||
y, incy
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Input vectors stored in single precision, computed in double precision,
|
||||
@@ -339,39 +651,39 @@ double PASTEF77(d,sdot)
|
||||
const float* y, const f77_int* incy
|
||||
)
|
||||
{
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
double rho;
|
||||
dim_t i;
|
||||
dim_t n0;
|
||||
float* x0;
|
||||
float* y0;
|
||||
inc_t incx0;
|
||||
inc_t incy0;
|
||||
double rho;
|
||||
dim_t i;
|
||||
|
||||
/* Initialization of BLIS is not required. */
|
||||
/* Initialization of BLIS is not required. */
|
||||
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
/* Convert/typecast negative values of n to zero. */
|
||||
bli_convert_blas_dim1( *n, n0 );
|
||||
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
|
||||
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
|
||||
/* If the input increments are negative, adjust the pointers so we can
|
||||
use positive increments instead. */
|
||||
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
|
||||
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
|
||||
|
||||
rho = 0.0;
|
||||
rho = 0.0;
|
||||
|
||||
for ( i = 0; i < n0; i++ )
|
||||
{
|
||||
float* chi1 = x0 + (i )*incx0;
|
||||
float* psi1 = y0 + (i )*incy0;
|
||||
for ( i = 0; i < n0; i++ )
|
||||
{
|
||||
float* chi1 = x0 + (i )*incx0;
|
||||
float* psi1 = y0 + (i )*incy0;
|
||||
|
||||
bli_ddots( (( double )(*chi1)),
|
||||
(( double )(*psi1)), rho );
|
||||
}
|
||||
bli_ddots( (( double )(*chi1)),
|
||||
(( double )(*psi1)), rho );
|
||||
}
|
||||
|
||||
/* Finalization of BLIS is not required, because initialization was
|
||||
not required. */
|
||||
/* Finalization of BLIS is not required, because initialization was
|
||||
not required. */
|
||||
|
||||
return rho;
|
||||
return rho;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -58,9 +58,11 @@ AXPYV_KER_PROT( double, d, axpyv_zen_int )
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int )
|
||||
|
||||
// dotv (intrinsics, unrolled x10)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int10 )
|
||||
// dotv (intrinsics, unrolled x10)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( scomplex, c, dotv_zen_int5 )
|
||||
DOTV_KER_PROT( dcomplex, z, dotv_zen_int5 )
|
||||
|
||||
// dotxv (intrinsics)
|
||||
DOTXV_KER_PROT( float, s, dotxv_zen_int )
|
||||
|
||||
4
kernels/zen2/.gitignore
vendored
Normal file
4
kernels/zen2/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
# Ignore everything in this directory
|
||||
*
|
||||
# Except this file
|
||||
!.gitignore
|
||||
Reference in New Issue
Block a user