From 4fcc4e499d807259f510af1c1f750402107ecdaa Mon Sep 17 00:00:00 2001 From: Meghana Vankadari Date: Thu, 14 May 2020 11:04:05 +0530 Subject: [PATCH] Optimized DGEMV kernel and changed BLAS interface call Details: - Optimized daxpyf kernel with fuse_factor=5 and iter_unroll=2. - Modified framework files of dgemv to remove dependency on cntx variable. - Updated cntx_init file of zen2 to choose optimized kernels. - Modified BLAS interface call for DGEMV to reduce framework overhread. - Currently these changes are applicable for zen2 configuration. They will be enabled for zen family processors in future. - Changed naming convention for new BLAS macros to indicate their use. - Added new optimized kernel for axpyf under zen2 folder. - Implemented basic GEMV kernel without using axpyv or axpyf. This kernel is chosen for small sizes. Change-Id: I4278d37e494854879c71499b8b9da8c5dbe3bf5b Signed-off-by: Meghana Vankadari AMD-Internal: [CPUPL-885] --- common.mk | 2 + config/zen2/bli_cntx_init_zen2.c | 4 +- frame/2/bli_l2_ker.h | 50 +++ frame/2/bli_l2_ker_prot.h | 54 +++ frame/2/gemv/bli_gemv_unf_var1.c | 74 +++- frame/2/gemv/bli_gemv_unf_var2.c | 81 ++++- frame/3/trsm/bli_trsm_front.c | 9 +- frame/compat/bla_amax.c | 2 +- frame/compat/bla_axpy.c | 2 +- frame/compat/bla_copy.c | 2 +- frame/compat/bla_dot.c | 2 +- frame/compat/bla_gemv.c | 449 ++++++++++++++++++++---- frame/compat/bla_scal.c | 2 +- frame/compat/bla_swap.c | 2 +- frame/include/bli_arch_config.h | 8 +- frame/include/bli_gentfunc_macro_defs.h | 14 +- frame/include/blis.h | 3 +- kernels/zen2/.gitignore | 4 - kernels/zen2/1f/bli_axpyf_zen_int_5.c | 326 +++++++++++++++++ kernels/zen2/2/bli_gemv_zen_ref.c | 103 ++++++ kernels/zen2/bli_kernels_zen2.h | 45 +++ 21 files changed, 1138 insertions(+), 100 deletions(-) create mode 100644 frame/2/bli_l2_ker.h create mode 100644 frame/2/bli_l2_ker_prot.h delete mode 100644 kernels/zen2/.gitignore create mode 100644 kernels/zen2/1f/bli_axpyf_zen_int_5.c create mode 100644 kernels/zen2/2/bli_gemv_zen_ref.c create mode 100644 kernels/zen2/bli_kernels_zen2.h diff --git a/common.mk b/common.mk index e1ec24a29..cc53eaad2 100644 --- a/common.mk +++ b/common.mk @@ -5,6 +5,7 @@ # libraries. # # Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2020, Advanced Micro Devices, Inc. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -153,6 +154,7 @@ get-aocldtl-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ get-kernel-cflags-for = $(strip $(call load-var-for,CKOPTFLAGS,$(1)) \ $(call load-var-for,CKVECFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + $(COMPSIMDFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 5dd8c041b..205ae774a 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -69,7 +69,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, @@ -128,7 +128,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); #endif - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache diff --git a/frame/2/bli_l2_ker.h b/frame/2/bli_l2_ker.h new file mode 100644 index 000000000..ca6e1d247 --- /dev/null +++ b/frame/2/bli_l2_ker.h @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Define template prototypes for level-2 kernels. +// + +// Note: Instead of defining function prototype macro templates and then +// instantiating those macros to define the individual function prototypes, +// we simply alias the official operations' prototypes as defined in +// bli_l2_ker_prot.h. + +#undef GENTPROT +#define GENTPROT GEMV_KER_PROT + +INSERT_GENTPROT_BASIC0( gemv_ker_name ) + diff --git a/frame/2/bli_l2_ker_prot.h b/frame/2/bli_l2_ker_prot.h new file mode 100644 index 000000000..15888a55f --- /dev/null +++ b/frame/2/bli_l2_ker_prot.h @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Define template prototypes for level-1f kernels. +// + +#define GEMV_KER_PROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t m, \ + dim_t n, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t lda, \ + ctype* restrict x, inc_t incx, \ + ctype* restrict beta, \ + ctype* restrict y, inc_t incy, \ + cntx_t* restrict cntx \ + ); + diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index e392e830e..cfbaa8fd4 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ + bli_init_once(); \ +\ + if( cntx == NULL) cntx = bli_gks_query_cntx(); \ +\ const num_t dt = PASTEMAC(ch,type); \ \ ctype* A1; \ @@ -100,5 +105,72 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( gemv_unf_var1 ) +#ifdef BLIS_CONFIG_ZEN2 +void bli_dgemv_unf_var1 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + double* beta, + double* y, inc_t incy, + cntx_t* cntx + ) +{ + double* A1; + double* x1; + double* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + + bli_init_once(); + + if( cntx == NULL ) cntx = bli_gks_query_cntx(); + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_iter, &n_elem, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + + /* Query the context for the kernel function pointer and fusing factor. */ + b_fuse = 8; + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (0 )*incy; + y1 = y + (i )*incy; + + /* y1 = beta * y1 + alpha * A1 * x; */ + bli_ddotxf_zen_int_8 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x1, incx, + beta, + y1, incy, + cntx + ); + + } +} + +INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var1 ) +#else +INSERT_GENTFUNC_BASIC0( gemv_unf_var1 ) +#endif diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index fe7702e4c..403bd4f78 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -51,6 +52,10 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ + bli_init_once(); \ +\ + if(cntx == NULL) cntx = bli_gks_query_cntx(); \ +\ const num_t dt = PASTEMAC(ch,type); \ \ ctype* zero = PASTEMAC(ch,0); \ @@ -127,5 +132,79 @@ void PASTEMAC(ch,varname) \ } \ } -INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) +#ifdef BLIS_CONFIG_ZEN2 +void bli_dgemv_unf_var2 + ( + trans_t transa, + conj_t conjx, + dim_t m, + dim_t n, + double* alpha, + double* a, inc_t rs_a, inc_t cs_a, + double* x, inc_t incx, + double* beta, + double* y, inc_t incy, + cntx_t* cntx + ) +{ + + double* A1; + double* x1; + double* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_elem, &n_iter, &rs_at, &cs_at ); + + conja = bli_extract_conj( transa ); + + /* If beta is zero, use setv. Otherwise, scale by beta. */ + /* y = beta * y; */ + /* beta=0 case is hadled by scalv internally */ + + bli_dscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + NULL + ); + + /* Query the context for the kernel function pointer and fusing factor. */ + b_fuse = 5; + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + + A1 = a + (0 )*rs_at + (i )*cs_at; + x1 = x + (i )*incx; + y1 = y + (0 )*incy; + + /* y = y + alpha * A1 * x1; */ + bli_daxpyf_zen_int_5 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, rs_at, cs_at, + x1, incx, + y1, incy, + NULL + ); + } +} + +INSERT_GENTFUNC_BASIC0_SCZ( gemv_unf_var2 ) +#else +INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) +#endif diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index eef982e0f..6685e756c 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -48,16 +48,11 @@ void bli_trsm_front ) { bli_init_once(); - int i, j; + obj_t a_local; obj_t b_local; obj_t c_local; -//int m = bli_obj_length(*b); -//int n = bli_obj_width(*b); -//float *L = a->buffer; - // float *B = b->buffer; - #ifdef PRINT_SMALL_TRSM_INFO printf("Side:: %c\n", side ? 'R' : 'L'); if (bli_obj_datatype(*a) == BLIS_FLOAT) diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c index 33fe83b90..cad33e1bf 100644 --- a/frame/compat/bla_amax.c +++ b/frame/compat/bla_amax.c @@ -238,7 +238,7 @@ f77_int idamax_ return f77_index; } -INSERT_GENTFUNC_BLAS_ZEN2( amax, amaxv ) +INSERT_GENTFUNC_BLAS_CZ( amax, amaxv ) #else INSERT_GENTFUNC_BLAS( amax, amaxv ) #endif diff --git a/frame/compat/bla_axpy.c b/frame/compat/bla_axpy.c index 0c0943419..ad1689898 100644 --- a/frame/compat/bla_axpy.c +++ b/frame/compat/bla_axpy.c @@ -224,7 +224,7 @@ void daxpy_ // bli_finalize_auto(); } -INSERT_GENTFUNC_BLAS_ZEN2( axpy, axpyv ) +INSERT_GENTFUNC_BLAS_CZ( axpy, axpyv ) #else INSERT_GENTFUNC_BLAS( axpy, axpyv ) diff --git a/frame/compat/bla_copy.c b/frame/compat/bla_copy.c index 369476b19..d73796664 100644 --- a/frame/compat/bla_copy.c +++ b/frame/compat/bla_copy.c @@ -236,7 +236,7 @@ void dcopy_ // bli_finalize_auto(); } -INSERT_GENTFUNC_BLAS_ZEN2(copy, copyv) +INSERT_GENTFUNC_BLAS_CZ(copy, copyv) #else INSERT_GENTFUNC_BLAS(copy, copyv) #endif diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c index ed97d89b1..eaf2022af 100644 --- a/frame/compat/bla_dot.c +++ b/frame/compat/bla_dot.c @@ -247,7 +247,7 @@ double ddot_ return rho; } -INSERT_GENTFUNCDOT_BLAS_ZEN2( dot, dotv ) +INSERT_GENTFUNCDOT_BLAS_CZ( dot, dotv ) #else INSERT_GENTFUNCDOT_BLAS( dot, dotv ) #endif diff --git a/frame/compat/bla_gemv.c b/frame/compat/bla_gemv.c index 85c65dde4..9e117f537 100644 --- a/frame/compat/bla_gemv.c +++ b/frame/compat/bla_gemv.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -53,90 +54,396 @@ void PASTEF77(ch,blasname) \ ftype* y, const f77_int* incy \ ) \ { \ - trans_t blis_transa; \ - dim_t m0, n0; \ - dim_t m_y, n_x; \ - ftype* x0; \ - ftype* y0; \ - inc_t incx0; \ - inc_t incy0; \ - inc_t rs_a, cs_a; \ + trans_t blis_transa; \ + dim_t m0, n0; \ + dim_t m_y, n_x; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ + inc_t rs_a, cs_a; \ \ - /* Initialize BLIS. */ \ - bli_init_auto(); \ + /* Initialize BLIS. */ \ + bli_init_auto(); \ \ - /* Perform BLAS parameter checking. */ \ - PASTEBLACHK(blasname) \ - ( \ - MKSTR(ch), \ - MKSTR(blasname), \ - transa, \ - m, \ - n, \ - lda, \ - incx, \ - incy \ - ); \ + /* Perform BLAS parameter checking. */ \ + PASTEBLACHK(blasname) \ + ( \ + MKSTR(ch), \ + MKSTR(blasname), \ + transa, \ + m, \ + n, \ + lda, \ + incx, \ + incy \ + ); \ \ - /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ - bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ + bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ - /* Convert/typecast negative values of m and n to zero. */ \ - bli_convert_blas_dim1( *m, m0 ); \ - bli_convert_blas_dim1( *n, n0 ); \ + /* Convert/typecast negative values of m and n to zero. */ \ + bli_convert_blas_dim1( *m, m0 ); \ + bli_convert_blas_dim1( *n, n0 ); \ \ - /* Determine the dimensions of x and y so we can adjust the increments, - if necessary.*/ \ - bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \ + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ \ + bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \ \ - /* BLAS handles cases where trans(A) has no columns, and x has no elements, - in a peculiar way. In these situations, BLAS returns without performing - any action, even though most sane interpretations of gemv would have the - the operation reduce to y := beta * y. Here, we catch those cases that - BLAS would normally mishandle and emulate the BLAS exactly so as to - provide "bug-for-bug" compatibility. Note that this extreme level of - compatibility would not be as much of an issue if it weren't for the - fact that some BLAS test suites actually test for these cases. Also, it - should be emphasized that BLIS, if called natively, does NOT exhibit - this quirky behavior; it will scale y by beta, as one would expect. */ \ - if ( m_y > 0 && n_x == 0 ) \ - { \ - /* Finalize BLIS. */ \ - bli_finalize_auto(); \ + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ \ + if ( m_y > 0 && n_x == 0 ) \ + { \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ \ - return; \ - } \ + return; \ + } \ \ - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ \ - bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \ - bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \ \ - /* Set the row and column strides of A. */ \ - rs_a = 1; \ - cs_a = *lda; \ + /* Set the row and column strides of A. */ \ + rs_a = 1; \ + cs_a = *lda; \ \ - /* Call BLIS interface. */ \ - PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ - ( \ - blis_transa, \ - BLIS_NO_CONJUGATE, \ - m0, \ - n0, \ - (ftype*)alpha, \ - (ftype*)a, rs_a, cs_a, \ - x0, incx0, \ - (ftype*)beta, \ - y0, incy0, \ - NULL, \ - NULL \ - ); \ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + blis_transa, \ + BLIS_NO_CONJUGATE, \ + m0, \ + n0, \ + (ftype*)alpha, \ + (ftype*)a, rs_a, cs_a, \ + x0, incx0, \ + (ftype*)beta, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ \ - /* Finalize BLIS. */ \ - bli_finalize_auto(); \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ } + #ifdef BLIS_ENABLE_BLAS +#ifdef BLIS_CONFIG_ZEN2 +void dgemv_ + ( + const f77_char* transa, + const f77_int* m, + const f77_int* n, + const double* alpha, + const double* a, const f77_int* lda, + const double* x, const f77_int* incx, + const double* beta, + double* y, const f77_int* incy + ) +{ + trans_t blis_transa; + dim_t m0, n0; + dim_t m_y, n_x; + double* x0; + double* y0; + inc_t incx0; + inc_t incy0; + inc_t rs_a, cs_a; + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemv) + ( + MKSTR(d), + MKSTR(gemv), + transa, + m, + n, + lda, + incx, + incy + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; + else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; + else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; + else + { + // See comment for bli_param_map_netlib_to_blis_side() above. + //bli_check_error_code( BLIS_INVALID_TRANS ); + blis_transa = BLIS_NO_TRANSPOSE; + } + + /* Convert/typecast negative values of m and n to zero. */ + if ( *m < 0 ) m0 = ( dim_t )0; + else m0 = ( dim_t )(*m); + + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ + if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; } + else { m_y = n0; n_x = m0; } + + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ + if ( m_y > 0 && n_x == 0 ) + { + /* Finalize BLIS. */ +// bli_finalize_auto(); + + return; + } + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + x0 = ((double*)x) + (n_x-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((double*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((double*)y) + (m_y-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((double*)y); + incy0 = ( inc_t )(*incy); + } + + /* Set the row and column strides of A. */ + rs_a = 1; + cs_a = *lda; + + /* Call variants based on transpose value. */ + if(bli_does_notrans(blis_transa)) + { + if((m0 <1200) && (n0 < 1200)) + { + bli_dgemv_zen_basic + ( + m0, + n0, + (double*) alpha, + (double*)a, cs_a, + x0, incx0, + (double*) beta, + y0, incy0, + NULL + ); + } + else + { + //variant_2 is chosen for column-storage + // and uses axpyf-based implementation + bli_dgemv_unf_var2 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (double*)alpha, + (double*)a, rs_a, cs_a, + x0, incx0, + (double*)beta, + y0, incy0, + NULL + ); + } + } + else + { + //var_1 is chosen for row-storage + //and uses dotxf-based implementation + bli_dgemv_unf_var1 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (double*)alpha, + (double*)a, rs_a, cs_a, + x0, incx0, + (double*)beta, + y0, incy0, + NULL + ); + + } + +} + +void sgemv_ + ( + const f77_char* transa, + const f77_int* m, + const f77_int* n, + const float* alpha, + const float* a, const f77_int* lda, + const float* x, const f77_int* incx, + const float* beta, + float* y, const f77_int* incy + ) +{ + trans_t blis_transa; + dim_t m0, n0; + dim_t m_y, n_x; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; + inc_t rs_a, cs_a; + + /* Perform BLAS parameter checking. */ + PASTEBLACHK(gemv) + ( + MKSTR(s), + MKSTR(gemv), + transa, + m, + n, + lda, + incx, + incy + ); + + /* Map BLAS chars to their corresponding BLIS enumerated type value. */ + if ( *transa == 'n' || *transa == 'N' ) blis_transa = BLIS_NO_TRANSPOSE; + else if ( *transa == 't' || *transa == 'T' ) blis_transa = BLIS_TRANSPOSE; + else if ( *transa == 'c' || *transa == 'C' ) blis_transa = BLIS_CONJ_TRANSPOSE; + else + { + // See comment for bli_param_map_netlib_to_blis_side() above. + //bli_check_error_code( BLIS_INVALID_TRANS ); + blis_transa = BLIS_NO_TRANSPOSE; + } + + /* Convert/typecast negative values of m and n to zero. */ + if ( *m < 0 ) m0 = ( dim_t )0; + else m0 = ( dim_t )(*m); + + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); + + /* Determine the dimensions of x and y so we can adjust the increments, + if necessary.*/ + if ( bli_does_notrans( blis_transa ) ) { m_y = m0; n_x = n0; } + else { m_y = n0; n_x = m0; } + + /* BLAS handles cases where trans(A) has no columns, and x has no elements, + in a peculiar way. In these situations, BLAS returns without performing + any action, even though most sane interpretations of gemv would have the + the operation reduce to y := beta * y. Here, we catch those cases that + BLAS would normally mishandle and emulate the BLAS exactly so as to + provide "bug-for-bug" compatibility. Note that this extreme level of + compatibility would not be as much of an issue if it weren't for the + fact that some BLAS test suites actually test for these cases. Also, it + should be emphasized that BLIS, if called natively, does NOT exhibit + this quirky behavior; it will scale y by beta, as one would expect. */ + if ( m_y > 0 && n_x == 0 ) + { + /* Finalize BLIS. */ +// bli_finalize_auto(); + + return; + } + + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ + if ( *incx < 0 ) + { + x0 = ((float*)x) + (n_x-1)*(-*incx); + incx0 = ( inc_t )(*incx); + } + else + { + x0 = ((float*)x); + incx0 = ( inc_t )(*incx); + } + + if ( *incy < 0 ) + { + y0 = ((float*)y) + (m_y-1)*(-*incy); + incy0 = ( inc_t )(*incy); + } + else + { + y0 = ((float*)y); + incy0 = ( inc_t )(*incy); + } + + /* Set the row and column strides of A. */ + rs_a = 1; + cs_a = *lda; + + /* Call variants based on transpose value. */ + if(bli_does_notrans(blis_transa)) + { + bli_sgemv_unf_var2 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (float*)alpha, + (float*)a, rs_a, cs_a, + x0, incx0, + (float*)beta, + y0, incy0, + NULL + ); + + } + else + { + bli_sgemv_unf_var1 + ( + blis_transa, + BLIS_NO_CONJUGATE, + m0, + n0, + (float*)alpha, + (float*)a, rs_a, cs_a, + x0, incx0, + (float*)beta, + y0, incy0, + NULL + ); + + } + +} + +INSERT_GENTFUNC_BLAS_CZ( gemv, gemv ) +#else INSERT_GENTFUNC_BLAS( gemv, gemv ) #endif - +#endif diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c index 3b8dda884..848d42094 100644 --- a/frame/compat/bla_scal.c +++ b/frame/compat/bla_scal.c @@ -203,7 +203,7 @@ void dscal_ // bli_finalize_auto(); } -INSERT_GENTFUNCSCAL_BLAS_ZEN2( scal, scalv ) +INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv ) #else INSERT_GENTFUNCSCAL_BLAS( scal, scalv ) #endif diff --git a/frame/compat/bla_swap.c b/frame/compat/bla_swap.c index 3b589b212..8d8d40628 100644 --- a/frame/compat/bla_swap.c +++ b/frame/compat/bla_swap.c @@ -229,7 +229,7 @@ void dswap_ // bli_finalize_auto(); } -INSERT_GENTFUNC_BLAS_ZEN2( swap, swapv ) +INSERT_GENTFUNC_BLAS_CZ( swap, swapv ) #else INSERT_GENTFUNC_BLAS( swap, swapv ) diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 3aca1cba5..1dd722ed3 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -6,7 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP - Copyright (C) 2019, Advanced Micro Devices, Inc. + Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -244,9 +244,9 @@ CNTX_INIT_PROTS( generic ) // -- AMD64 architectures -- -//#ifdef BLIS_KERNELS_ZEN2 -//#include "bli_kernels_zen2.h" -//#endif +#ifdef BLIS_KERNELS_ZEN2 +#include "bli_kernels_zen2.h" +#endif #ifdef BLIS_KERNELS_ZEN #include "bli_kernels_zen.h" #endif diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 6ea378ee5..e2a06b332 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -57,7 +57,7 @@ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) -#define INSERT_GENTFUNC_BLAS_ZEN2( blasname, blisname ) \ +#define INSERT_GENTFUNC_BLAS_CZ( blasname, blisname ) \ \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) @@ -82,7 +82,7 @@ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- -#define INSERT_GENTFUNCDOT_BLAS_ZEN2( blasname, blisname ) \ +#define INSERT_GENTFUNCDOT_BLAS_CZ( blasname, blisname ) \ \ GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ @@ -125,7 +125,7 @@ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname ) // -- Extended two-operand macro (used only for scal) -- -#define INSERT_GENTFUNCSCAL_BLAS_ZEN2( blasname, blisname ) \ +#define INSERT_GENTFUNCSCAL_BLAS_CZ( blasname, blisname ) \ \ GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \ GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \ @@ -157,6 +157,14 @@ GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) + +#define INSERT_GENTFUNC_BASIC0_SCZ( tfuncname ) \ +\ +GENTFUNC( float, s, tfuncname ) \ +GENTFUNC( scomplex, c, tfuncname ) \ +GENTFUNC( dcomplex, z, tfuncname ) + + // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \ diff --git a/frame/include/blis.h b/frame/include/blis.h index 4725ee83f..3b4d85f9e 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -6,7 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -87,6 +87,7 @@ extern "C" { #include "bli_l1v_ker_prot.h" #include "bli_l1f_ker_prot.h" #include "bli_l1m_ker_prot.h" +#include "bli_l2_ker_prot.h" #include "bli_l3_ukr_prot.h" #include "bli_l3_sup_ker_prot.h" diff --git a/kernels/zen2/.gitignore b/kernels/zen2/.gitignore deleted file mode 100644 index 5e7d2734c..000000000 --- a/kernels/zen2/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/bli_axpyf_zen_int_5.c new file mode 100644 index 000000000..36654f9c2 --- /dev/null +++ b/kernels/zen2/1f/bli_axpyf_zen_int_5.c @@ -0,0 +1,326 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "blis.h" + +/* Union data structure to access AVX registers + One 256-bit AVX register holds 8 SP elements. */ +typedef union +{ + __m256 v; + float f[8] __attribute__((aligned(64))); +} v8sf_t; + +/* Union data structure to access AVX registers +* One 256-bit AVX register holds 4 DP elements. */ +typedef union +{ + __m256d v; + double d[4] __attribute__((aligned(64))); +} v4df_t; + +// ----------------------------------------------------------------------------- + +void bli_daxpyf_zen_int_5 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 5; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 2; + + dim_t i; + + double* restrict a0; + double* restrict a1; + double* restrict a2; + double* restrict a3; + double* restrict a4; + + double* restrict y0; + + v4df_t chi0v, chi1v, chi2v, chi3v; + v4df_t chi4v; + + v4df_t a00v, a01v, a02v, a03v; + v4df_t a04v; + + v4df_t a10v, a11v, a12v, a13v; + v4df_t a14v; + + v4df_t y0v, y1v; + + double chi0, chi1, chi2, chi3; + double chi4; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. + if ( b_n != fuse_fac ) + { +#ifdef BLIS_CONFIG_ZEN2 + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 ); + PASTEMAC(d,scals)( *alpha, alpha_chi1 ); + + bli_daxpyv_zen_int10 + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + +#else + daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 ); + PASTEMAC(d,scals)( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + +#endif + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + a4 = a + 4*lda; + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + chi4 = *( x + 4*incx ); + + + // Scale each chi scalar by alpha. + PASTEMAC(d,scals)( *alpha, chi0 ); + PASTEMAC(d,scals)( *alpha, chi1 ); + PASTEMAC(d,scals)( *alpha, chi2 ); + PASTEMAC(d,scals)( *alpha, chi3 ); + PASTEMAC(d,scals)( *alpha, chi4 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + chi2v.v = _mm256_broadcast_sd( &chi2 ); + chi3v.v = _mm256_broadcast_sd( &chi3 ); + chi4v.v = _mm256_broadcast_sd( &chi4 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v ); + + + // Store the output. + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + a4 += n_iter_unroll * n_elem_per_reg; + } + + for( ; (i + 3) < m; i += 4 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + a4 += n_elem_per_reg; + } + + // If there are leftover iterations, perform them with scalar code. + for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + a4 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + a4 += inca; + y0 += incy; + } + + } +} + diff --git a/kernels/zen2/2/bli_gemv_zen_ref.c b/kernels/zen2/2/bli_gemv_zen_ref.c new file mode 100644 index 000000000..31138f2a8 --- /dev/null +++ b/kernels/zen2/2/bli_gemv_zen_ref.c @@ -0,0 +1,103 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "blis.h" + +/* This routine assumes that the matrix is stored in column-major order + * and does not have transpose. + */ +void bli_dgemv_zen_ref_c + ( + dim_t m, + dim_t n, + double* restrict alpha, + double* restrict a, inc_t lda, + double* restrict x, inc_t incx, + double* restrict beta, + double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + + double* restrict x0 = x; + double* restrict y0 = y; + double* restrict a0 = a; + + dim_t i,j; + + if((incy == 1) && (incx == 1)) + { + + /* n==0 case will be handled in the framework, + where it returns immediately. + * For the first column of A, execute Y = alpha*A*X + beta*Y + */ + double x0_val = (x0[0]); + PRAGMA_SIMD + for(i = 0; i < m; i++) + { + (y0[i]) = (a0[i]) * (x0_val) * (*alpha) + y0[i] * (*beta); + } + a0 += lda; + + /* For remaining columns of A, execute, Y = alpha*A*X + Y; */ + for(j = 1; j < n; j++) + { + const double xp = (x0[j]); + PRAGMA_SIMD + for(i = 0; i < m; i++) + { + (y0[i]) += (a0[i]) * xp * (*alpha); + } + a0 += lda; + } + + } + else + { + for(j = 0; j < m; j++) + PASTEMAC(d,scals)(*beta, *(y0+j*incy)) + + for(j = 0; j < n; j++) + { + const double xp = *(x0+j*incx); + for(i = 0; i < m; i++) + { + *(y0 + i*incy) += (a0[j*lda+i]) * xp * (*alpha); + } + } + } + return; +} diff --git a/kernels/zen2/bli_kernels_zen2.h b/kernels/zen2/bli_kernels_zen2.h new file mode 100644 index 000000000..837eac344 --- /dev/null +++ b/kernels/zen2/bli_kernels_zen2.h @@ -0,0 +1,45 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// -- level-1f -- +AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) + +// -- level-2 -- + +//gemv(scalar code) +GEMV_KER_PROT( double, d, gemv_zen_basic ) + +