From 28bb28b79f3c7c836e08308c3565f3fb0cd348ea Mon Sep 17 00:00:00 2001 From: Meghana Date: Thu, 30 Apr 2020 17:09:39 +0530 Subject: [PATCH] Modified Function definition for BLAS and CBLAS interfaces of DOTV and SWAPV APIs Details: -Kernel is called directly from API call to avoid framework overhead in case of single and double precisions. -Currently these changes are applicable only for zen2 configuration. They will be enabled for zen family processors in future. -These changes improve performance of BLAS and CBLAS interfaces of API. They do not affect BLIS-specific APIs. Change-Id: I1eb7ca470ced82c3cfa8b22f2b53000d42fef96c Signed-off-by: Meghana Vankadari AMD-Internal: [CPUPL-847,CPUPL-816] --- frame/compat/bla_dot.c | 165 ++++++++++++++++++- frame/compat/bla_swap.c | 202 +++++++++++++++++++++--- frame/compat/cblas/src/cblas_ddot.c | 75 +++++++++ frame/compat/cblas/src/cblas_dswap.c | 70 ++++++++ frame/compat/cblas/src/cblas_sdot.c | 74 +++++++++ frame/compat/cblas/src/cblas_sswap.c | 74 ++++++++- frame/include/bli_gentfunc_macro_defs.h | 9 ++ 7 files changed, 642 insertions(+), 27 deletions(-) diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c index dbab039d1..ed97d89b1 100644 --- a/frame/compat/bla_dot.c +++ b/frame/compat/bla_dot.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -86,8 +87,170 @@ ftype PASTEF772(ch,blasname,chc) \ } #ifdef BLIS_ENABLE_BLAS -INSERT_GENTFUNCDOT_BLAS( dot, dotv ) +#ifdef BLIS_CONFIG_ZEN2 +float sdot_ + ( + const f77_int* n, + const float* x, const f77_int* incx, + const float* y, const f77_int* incy + ) +{ + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + float rho; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. 
*/ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((float*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((float*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((float*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((float*)y); + incy0 = ( inc_t )(*incy); + } + + /* Call BLIS kernel. */ + bli_sdotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); + + return rho; +} + +double ddot_ + ( + const f77_int* n, + const double* x, const f77_int* incx, + const double* y, const f77_int* incy + ) +{ + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + double rho; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. 
*/ + + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((double*)x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = ((double*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((double*)y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = ((double*)y); + incy0 = ( inc_t )(*incy); + } + + /* Call BLIS kernel. */ + bli_ddotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &rho, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); + + return rho; +} + +INSERT_GENTFUNCDOT_BLAS_ZEN2( dot, dotv ) +#else +INSERT_GENTFUNCDOT_BLAS( dot, dotv ) +#endif // -- "Black sheep" dot product function definitions -- diff --git a/frame/compat/bla_swap.c b/frame/compat/bla_swap.c index 72bc9d6d5..3b589b212 100644 --- a/frame/compat/bla_swap.c +++ b/frame/compat/bla_swap.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -48,38 +49,189 @@ void PASTEF77(ch,blasname) \ ftype* y, const f77_int* incy \ ) \ { \ - dim_t n0; \ - ftype* x0; \ - ftype* y0; \ - inc_t incx0; \ - inc_t incy0; \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ \ - /* Initialize BLIS. */ \ - bli_init_auto(); \ + /* Initialize BLIS. */ \ + bli_init_auto(); \ \ - /* Convert/typecast negative values of n to zero. */ \ - bli_convert_blas_dim1( *n, n0 ); \ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ \ - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ \ - bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ - bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ - /* Call BLIS interface. */ \ - PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ - ( \ - n0, \ - x0, incx0, \ - y0, incy0, \ - NULL, \ - NULL \ - ); \ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + n0, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ \ - /* Finalize BLIS. */ \ - bli_finalize_auto(); \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS +#ifdef BLIS_CONFIG_ZEN2 + +void sswap_ + ( + const f77_int* n, + float* x, const f77_int* incx, + float* y, const f77_int* incy + ) +{ + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. 
*/ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = (x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = (y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = (y); + incy0 = ( inc_t )(*incy); + } + + + /* Call BLIS kernel */ + bli_sswapv_zen_int8 + ( + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); +} + +void dswap_ + ( + const f77_int* n, + double* x, const f77_int* incx, + double* y, const f77_int* incy + ) +{ + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. 
(Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (x) + (n0-1)*(-*incx); + incx0 = ( inc_t )(*incx); + + } + else + { + x0 = (x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = (y) + (n0-1)*(-*incy); + incy0 = ( inc_t )(*incy); + + } + else + { + y0 = (y); + incy0 = ( inc_t )(*incy); + } + + + /* Call BLIS kernel */ + bli_dswapv_zen_int8 + ( + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); +} + +INSERT_GENTFUNC_BLAS_ZEN2( swap, swapv ) + +#else INSERT_GENTFUNC_BLAS( swap, swapv ) #endif - +#endif diff --git a/frame/compat/cblas/src/cblas_ddot.c b/frame/compat/cblas/src/cblas_ddot.c index b1675d888..b7bde2156 100644 --- a/frame/compat/cblas/src/cblas_ddot.c +++ b/frame/compat/cblas/src/cblas_ddot.c @@ -7,6 +7,8 @@ * It calls the fortran wrapper before calling ddot. * * Written by Keita Teranishi. 2/11/1998 + * + * Copyright (C) 2020, Advanced Micro Devices, Inc. * */ #include "cblas.h" @@ -22,7 +24,80 @@ double cblas_ddot( f77_int N, const double *X, #define F77_incX incX #define F77_incY incY #endif +#ifdef BLIS_CONFIG_ZEN2 + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. 
*/ + if ( F77_N < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(F77_N); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( F77_incX < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = ((double*)X) + (n0-1)*(-F77_incX); + incx0 = ( inc_t )(F77_incX); + + } + else + { + x0 = ((double*)X); + incx0 = ( inc_t )(F77_incX); + } + + if ( F77_incY < 0 ) + { + y0 = ((double*)Y) + (n0-1)*(-F77_incY); + incy0 = ( inc_t )(F77_incY); + + } + else + { + y0 = ((double*)Y); + incy0 = ( inc_t )(F77_incY); + } + + /* Call BLIS kernel. */ + bli_ddotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &dot, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); + + return dot; + +#else F77_ddot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); return dot; +#endif } #endif diff --git a/frame/compat/cblas/src/cblas_dswap.c b/frame/compat/cblas/src/cblas_dswap.c index e204baea0..9024e308a 100644 --- a/frame/compat/cblas/src/cblas_dswap.c +++ b/frame/compat/cblas/src/cblas_dswap.c @@ -7,6 +7,8 @@ * * Written by Keita Teranishi. 2/11/1998 * + * Copyright (C) 2020, Advanced Micro Devices, Inc. 
+ * */ #include "cblas.h" #include "cblas_f77.h" @@ -20,6 +22,74 @@ void cblas_dswap( f77_int N, double *X, f77_int incX, double *Y, #define F77_incX incX #define F77_incY incY #endif + +#ifdef BLIS_CONFIG_ZEN2 + dim_t n0; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( F77_N < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(F77_N); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( F77_incX < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (X) + (n0-1)*(-F77_incX); + incx0 = ( inc_t )(F77_incX); + + } + else + { + x0 = (X); + incx0 = ( inc_t )(F77_incX); + } + + if ( F77_incY < 0 ) + { + y0 = (Y) + (n0-1)*(-F77_incY); + incy0 = ( inc_t )(F77_incY); + + } + else + { + y0 = (Y); + incy0 = ( inc_t )(F77_incY); + } + + + /* Call BLIS kernel */ + bli_dswapv_zen_int8 + ( + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. 
*/ +// bli_finalize_auto(); +#else F77_dswap( &F77_N, X, &F77_incX, Y, &F77_incY); +#endif } #endif diff --git a/frame/compat/cblas/src/cblas_sdot.c b/frame/compat/cblas/src/cblas_sdot.c index bbf355887..602e7c957 100644 --- a/frame/compat/cblas/src/cblas_sdot.c +++ b/frame/compat/cblas/src/cblas_sdot.c @@ -8,6 +8,8 @@ * * Written by Keita Teranishi. 2/11/1998 * + * Copyright (C) 2020, Advanced Micro Devices, Inc. + * */ #include "cblas.h" #include "cblas_f77.h" @@ -22,7 +24,79 @@ float cblas_sdot( f77_int N, const float *X, #define F77_incX incX #define F77_incY incY #endif +#ifdef BLIS_CONFIG_ZEN2 + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( F77_N < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(F77_N); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + + if ( F77_incX < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. 
*/ + + x0 = ((float*)X) + (n0-1)*(-F77_incX); + incx0 = ( inc_t )(F77_incX); + + } + else + { + x0 = ((float*)X); + incx0 = ( inc_t )(F77_incX); + } + + if ( F77_incY < 0 ) + { + y0 = ((float*)Y) + (n0-1)*(-F77_incY); + incy0 = ( inc_t )(F77_incY); + + } + else + { + y0 = ((float*)Y); + incy0 = ( inc_t )(F77_incY); + } + + /* Call BLIS kernel. */ + bli_sdotv_zen_int10 + ( + BLIS_NO_CONJUGATE, + BLIS_NO_CONJUGATE, + n0, + x0, incx0, + y0, incy0, + &dot, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); + + return dot; +#else F77_sdot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); return dot; +#endif } #endif diff --git a/frame/compat/cblas/src/cblas_sswap.c b/frame/compat/cblas/src/cblas_sswap.c index 2c24ad0fa..ea7aa7207 100644 --- a/frame/compat/cblas/src/cblas_sswap.c +++ b/frame/compat/cblas/src/cblas_sswap.c @@ -7,7 +7,9 @@ * * Written by Keita Teranishi. 2/11/1998 * - */ + * Copyright (C) 2020, Advanced Micro Devices, Inc. + * +*/ #include "cblas.h" #include "cblas_f77.h" void cblas_sswap( f77_int N, float *X, f77_int incX, float *Y, @@ -20,6 +22,76 @@ void cblas_sswap( f77_int N, float *X, f77_int incX, float *Y, #define F77_incX incX #define F77_incY incY #endif + +#ifdef BLIS_CONFIG_ZEN2 + + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + + /* Initialize BLIS. */ +// bli_init_auto(); + + /* Convert/typecast negative values of n to zero. */ + if ( F77_N < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(F77_N); + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( F77_incX < 0 ) + { + /* The semantics of negative stride in BLAS are that the vector + operand be traversed in reverse order. (Another way to think + of this is that negative strides effectively reverse the order + of the vector, but without any explicit data movements.) This + is also how BLIS interprets negative strides. 
The difference + is that with BLAS, the caller *always* passes in the 0th (i.e., + top-most or left-most) element of the vector, even when the + stride is negative. By contrast, in BLIS, negative strides are + used *relative* to the vector address as it is given. Thus, in + BLIS, if this backwards traversal is desired, the caller *must* + pass in the address to the (n-1)th (i.e., the bottom-most or + right-most) element along with a negative stride. */ + + x0 = (X) + (n0-1)*(-F77_incX); + incx0 = ( inc_t )(F77_incX); + + } + else + { + x0 = (X); + incx0 = ( inc_t )(F77_incX); + } + + if ( F77_incY < 0 ) + { + y0 = (Y) + (n0-1)*(-F77_incY); + incy0 = ( inc_t )(F77_incY); + + } + else + { + y0 = (Y); + incy0 = ( inc_t )(F77_incY); + } + + + /* Call BLIS kernel */ + bli_sswapv_zen_int8 + ( + n0, + x0, incx0, + y0, incy0, + NULL + ); + + /* Finalize BLIS. */ +// bli_finalize_auto(); + +#else F77_sswap( &F77_N, X, &F77_incX, Y, &F77_incY); +#endif } #endif diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 6e7969a5c..39920466a 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -56,10 +56,12 @@ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) + #define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \ \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) + // -- Basic one-operand macro with real domain only -- @@ -80,6 +82,13 @@ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- +#define INSERT_GENTFUNCDOT_BLAS_ZEN2( blasname, blisname ) \ +\ +GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ +GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ +GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ +GENTFUNCDOT( 
dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) + #define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ \