From e9899be09044829e23386bd73e394f1dd7778210 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 16 Sep 2014 18:19:32 -0500 Subject: [PATCH] Added high-level implementations of 4m, 3m. Details: - Added "4mh" and "3mh" APIs, which implement the 4m and 3m methods at high levels, respectively. APIs for trmm and trsm were NOT added due to the fact that these approaches are inherently incompatible with implementing 4m or 3m at high levels (because the input right-hand side matrix is overwritten). - Added 4mh, 3mh virtual micro-kernels, and updated the existing 4m and 3m so that all are stylistically consistent. - Added new "rih" packing kernels (both low-level and structure-aware) to support both 4mh and 3mh. - Defined new pack_t schemas to support real-only, imaginary-only, and real+imaginary packing formats. - Added various level0 scalar macros to support the rih packm kernels. - Minor tweaks to trmm macro-kernels to facilitate 4mh and 3mh. - Added the ability to enable/disable 4mh, 3m, and 3mh, and adjusted level-3 front-ends to check enabledness of 3mh, 3m, 4mh, and 4m (in that order) and execute the first one that is enabled, or the native implementation if none are enabled. - Added implementation query functions for each level-3 operation so that the user can query a string that describes the implementation that is currently enabled. - Updated test suite to output implementation types for each level-3 operation, as well as micro-kernel types for each of the five micro-kernels. - Renamed BLIS_ENABLE_?COMPLEX_VIA_4M macros to _ENABLE_VIRTUAL_?COMPLEX. - Fixed an obscure bug when packing Hermitian matrices (regular packing type) whereby the diagonal elements of the packed micro-panels could get tainted if the source matrix's imaginary diagonal part contained garbage. 
--- frame/1m/packm/bli_packm.h | 2 + frame/1m/packm/bli_packm_blk_var2.c | 4 + frame/1m/packm/bli_packm_cntl.c | 9 + frame/1m/packm/bli_packm_cxk_rih.c | 290 +++ frame/1m/packm/bli_packm_cxk_rih.h | 52 + frame/1m/packm/bli_packm_init.c | 8 + frame/1m/packm/bli_packm_struc_cxk.c | 51 +- frame/1m/packm/bli_packm_struc_cxk_rih.c | 532 +++++ frame/1m/packm/bli_packm_struc_cxk_rih.h | 110 + .../1m/packm/ukernels/bli_packm_ref_cxk_rih.c | 2082 +++++++++++++++++ .../1m/packm/ukernels/bli_packm_ref_cxk_rih.h | 55 + frame/3/gemm/3m/bli_gemm3m_cntl.c | 24 +- frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c | 278 ++- frame/3/gemm/3mh/bli_gemm3mh.c | 101 + frame/3/gemm/3mh/bli_gemm3mh.h | 71 + frame/3/gemm/3mh/bli_gemm3mh_cntl.c | 402 ++++ frame/3/gemm/3mh/bli_gemm3mh_cntl.h | 37 + frame/3/gemm/3mh/bli_gemm3mh_entry.c | 51 + frame/3/gemm/3mh/bli_gemm3mh_entry.h | 40 + .../3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c | 278 +++ .../3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h | 50 + frame/3/gemm/4m/bli_gemm4m_cntl.c | 24 +- frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c | 173 +- frame/3/gemm/4mh/bli_gemm4mh.c | 101 + frame/3/gemm/4mh/bli_gemm4mh.h | 71 + frame/3/gemm/4mh/bli_gemm4mh_cntl.c | 431 ++++ frame/3/gemm/4mh/bli_gemm4mh_cntl.h | 37 + frame/3/gemm/4mh/bli_gemm4mh_entry.c | 53 + frame/3/gemm/4mh/bli_gemm4mh_entry.h | 40 + .../3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c | 271 +++ .../3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h | 50 + frame/3/gemm/bli_gemm.c | 12 +- frame/3/gemm/bli_gemm.h | 3 + frame/3/gemm/bli_gemm_query.c | 86 + frame/3/gemm/bli_gemm_query.h | 38 + frame/3/hemm/3mh/bli_hemm3mh.c | 107 + frame/3/hemm/3mh/bli_hemm3mh.h | 69 + frame/3/hemm/3mh/bli_hemm3mh_entry.c | 52 + frame/3/hemm/3mh/bli_hemm3mh_entry.h | 41 + frame/3/hemm/4mh/bli_hemm4mh.c | 107 + frame/3/hemm/4mh/bli_hemm4mh.h | 69 + frame/3/hemm/4mh/bli_hemm4mh_entry.c | 54 + frame/3/hemm/4mh/bli_hemm4mh_entry.h | 41 + frame/3/hemm/bli_hemm.c | 12 +- frame/3/hemm/bli_hemm.h | 3 + frame/3/her2k/3mh/bli_her2k3mh.c | 
105 + frame/3/her2k/3mh/bli_her2k3mh.h | 68 + frame/3/her2k/3mh/bli_her2k3mh_entry.c | 51 + frame/3/her2k/3mh/bli_her2k3mh_entry.h | 40 + frame/3/her2k/4mh/bli_her2k4mh.c | 105 + frame/3/her2k/4mh/bli_her2k4mh.h | 68 + frame/3/her2k/4mh/bli_her2k4mh_entry.c | 53 + frame/3/her2k/4mh/bli_her2k4mh_entry.h | 40 + frame/3/her2k/bli_her2k.c | 12 +- frame/3/her2k/bli_her2k.h | 2 + frame/3/herk/3mh/bli_herk3mh.c | 97 + frame/3/herk/3mh/bli_herk3mh.h | 65 + frame/3/herk/3mh/bli_herk3mh_entry.c | 50 + frame/3/herk/3mh/bli_herk3mh_entry.h | 39 + frame/3/herk/4mh/bli_herk4mh.c | 97 + frame/3/herk/4mh/bli_herk4mh.h | 65 + frame/3/herk/4mh/bli_herk4mh_entry.c | 52 + frame/3/herk/4mh/bli_herk4mh_entry.h | 39 + frame/3/herk/bli_herk.c | 12 +- frame/3/herk/bli_herk.h | 2 + frame/3/symm/3mh/bli_symm3mh.c | 107 + frame/3/symm/3mh/bli_symm3mh.h | 69 + frame/3/symm/3mh/bli_symm3mh_entry.c | 52 + frame/3/symm/3mh/bli_symm3mh_entry.h | 41 + frame/3/symm/4mh/bli_symm4mh.c | 107 + frame/3/symm/4mh/bli_symm4mh.h | 69 + frame/3/symm/4mh/bli_symm4mh_entry.c | 54 + frame/3/symm/4mh/bli_symm4mh_entry.h | 41 + frame/3/symm/bli_symm.c | 12 +- frame/3/symm/bli_symm.h | 3 + frame/3/syr2k/3mh/bli_syr2k3mh.c | 104 + frame/3/syr2k/3mh/bli_syr2k3mh.h | 68 + frame/3/syr2k/3mh/bli_syr2k3mh_entry.c | 51 + frame/3/syr2k/3mh/bli_syr2k3mh_entry.h | 40 + frame/3/syr2k/4mh/bli_syr2k4mh.c | 104 + frame/3/syr2k/4mh/bli_syr2k4mh.h | 68 + frame/3/syr2k/4mh/bli_syr2k4mh_entry.c | 53 + frame/3/syr2k/4mh/bli_syr2k4mh_entry.h | 40 + frame/3/syr2k/bli_syr2k.c | 12 +- frame/3/syr2k/bli_syr2k.h | 2 + frame/3/syrk/3mh/bli_syrk3mh.c | 96 + frame/3/syrk/3mh/bli_syrk3mh.h | 65 + frame/3/syrk/3mh/bli_syrk3mh_entry.c | 50 + frame/3/syrk/3mh/bli_syrk3mh_entry.h | 39 + frame/3/syrk/4mh/bli_syrk4mh.c | 96 + frame/3/syrk/4mh/bli_syrk4mh.h | 65 + frame/3/syrk/4mh/bli_syrk4mh_entry.c | 52 + frame/3/syrk/4mh/bli_syrk4mh_entry.h | 39 + frame/3/syrk/bli_syrk.c | 12 +- frame/3/syrk/bli_syrk.h | 2 + frame/3/trmm/bli_trmm.c | 10 +- 
frame/3/trmm/bli_trmm.h | 1 + frame/3/trmm/bli_trmm_ll_ker_var2.c | 3 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 3 +- frame/3/trmm/bli_trmm_query.c | 55 + frame/3/trmm/bli_trmm_query.h | 36 + frame/3/trmm/bli_trmm_rl_ker_var2.c | 3 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 3 +- frame/3/trmm3/3mh/bli_trmm33mh.c | 109 + frame/3/trmm3/3mh/bli_trmm33mh.h | 71 + frame/3/trmm3/3mh/bli_trmm33mh_entry.c | 52 + frame/3/trmm3/3mh/bli_trmm33mh_entry.h | 41 + frame/3/trmm3/4mh/bli_trmm34mh.c | 109 + frame/3/trmm3/4mh/bli_trmm34mh.h | 71 + frame/3/trmm3/4mh/bli_trmm34mh_entry.c | 54 + frame/3/trmm3/4mh/bli_trmm34mh_entry.h | 41 + frame/3/trmm3/bli_trmm3.c | 12 +- frame/3/trmm3/bli_trmm3.h | 2 + frame/3/trsm/3m/bli_trsm3m_cntl.c | 22 + frame/3/trsm/4m/bli_trsm4m_cntl.c | 21 + frame/3/trsm/bli_trsm.c | 10 +- frame/3/trsm/bli_trsm.h | 1 + frame/3/trsm/bli_trsm_cntl.c | 22 + frame/3/trsm/bli_trsm_query.c | 171 ++ frame/3/trsm/bli_trsm_query.h | 42 + frame/base/bli_3m.c | 75 + frame/base/bli_3m.h | 49 + frame/base/bli_3mh.c | 75 + frame/base/bli_3mh.h | 49 + frame/base/bli_4m.c | 23 +- frame/base/bli_4m.h | 13 +- frame/base/bli_4mh.c | 75 + frame/base/bli_4mh.h | 49 + frame/base/bli_info.c | 50 +- frame/base/bli_info.h | 23 + frame/cntl/bli_cntl_init.c | 12 + frame/include/bli_kernel_macro_defs.h | 5 +- frame/include/bli_kernel_pre_macro_defs.h | 58 + frame/include/bli_kernel_rih_macro_defs.h | 168 ++ frame/include/bli_obj_macro_defs.h | 18 + frame/include/bli_param_macro_defs.h | 21 +- frame/include/bli_scalar_macro_defs.h | 19 + frame/include/bli_type_defs.h | 32 + frame/include/blis.h | 4 + frame/include/level0/io/bli_scal2ios.h | 61 + frame/include/level0/io/bli_scal2jios.h | 52 + .../level0/rih/bli_scal2rihs_mxn_diag.h | 110 + .../level0/rih/bli_scal2rihs_mxn_uplo.h | 348 +++ .../include/level0/rih/bli_setrihs_mxn_diag.h | 110 + frame/include/level0/ro/bli_scal2jros.h | 51 + frame/include/level0/ro/bli_scal2ros.h | 62 + frame/include/level0/rpi/bli_scal2jrpis.h | 53 + 
frame/include/level0/rpi/bli_scal2rpis.h | 66 + testsuite/input.general | 5 + testsuite/src/test_libblis.c | 199 +- testsuite/src/test_libblis.h | 4 + 151 files changed, 11766 insertions(+), 285 deletions(-) create mode 100644 frame/1m/packm/bli_packm_cxk_rih.c create mode 100644 frame/1m/packm/bli_packm_cxk_rih.h create mode 100644 frame/1m/packm/bli_packm_struc_cxk_rih.c create mode 100644 frame/1m/packm/bli_packm_struc_cxk_rih.h create mode 100644 frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.c create mode 100644 frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.h create mode 100644 frame/3/gemm/3mh/bli_gemm3mh.c create mode 100644 frame/3/gemm/3mh/bli_gemm3mh.h create mode 100644 frame/3/gemm/3mh/bli_gemm3mh_cntl.c create mode 100644 frame/3/gemm/3mh/bli_gemm3mh_cntl.h create mode 100644 frame/3/gemm/3mh/bli_gemm3mh_entry.c create mode 100644 frame/3/gemm/3mh/bli_gemm3mh_entry.h create mode 100644 frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c create mode 100644 frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h create mode 100644 frame/3/gemm/4mh/bli_gemm4mh.c create mode 100644 frame/3/gemm/4mh/bli_gemm4mh.h create mode 100644 frame/3/gemm/4mh/bli_gemm4mh_cntl.c create mode 100644 frame/3/gemm/4mh/bli_gemm4mh_cntl.h create mode 100644 frame/3/gemm/4mh/bli_gemm4mh_entry.c create mode 100644 frame/3/gemm/4mh/bli_gemm4mh_entry.h create mode 100644 frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c create mode 100644 frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h create mode 100644 frame/3/gemm/bli_gemm_query.c create mode 100644 frame/3/gemm/bli_gemm_query.h create mode 100644 frame/3/hemm/3mh/bli_hemm3mh.c create mode 100644 frame/3/hemm/3mh/bli_hemm3mh.h create mode 100644 frame/3/hemm/3mh/bli_hemm3mh_entry.c create mode 100644 frame/3/hemm/3mh/bli_hemm3mh_entry.h create mode 100644 frame/3/hemm/4mh/bli_hemm4mh.c create mode 100644 frame/3/hemm/4mh/bli_hemm4mh.h create mode 100644 frame/3/hemm/4mh/bli_hemm4mh_entry.c create mode 100644 
frame/3/hemm/4mh/bli_hemm4mh_entry.h create mode 100644 frame/3/her2k/3mh/bli_her2k3mh.c create mode 100644 frame/3/her2k/3mh/bli_her2k3mh.h create mode 100644 frame/3/her2k/3mh/bli_her2k3mh_entry.c create mode 100644 frame/3/her2k/3mh/bli_her2k3mh_entry.h create mode 100644 frame/3/her2k/4mh/bli_her2k4mh.c create mode 100644 frame/3/her2k/4mh/bli_her2k4mh.h create mode 100644 frame/3/her2k/4mh/bli_her2k4mh_entry.c create mode 100644 frame/3/her2k/4mh/bli_her2k4mh_entry.h create mode 100644 frame/3/herk/3mh/bli_herk3mh.c create mode 100644 frame/3/herk/3mh/bli_herk3mh.h create mode 100644 frame/3/herk/3mh/bli_herk3mh_entry.c create mode 100644 frame/3/herk/3mh/bli_herk3mh_entry.h create mode 100644 frame/3/herk/4mh/bli_herk4mh.c create mode 100644 frame/3/herk/4mh/bli_herk4mh.h create mode 100644 frame/3/herk/4mh/bli_herk4mh_entry.c create mode 100644 frame/3/herk/4mh/bli_herk4mh_entry.h create mode 100644 frame/3/symm/3mh/bli_symm3mh.c create mode 100644 frame/3/symm/3mh/bli_symm3mh.h create mode 100644 frame/3/symm/3mh/bli_symm3mh_entry.c create mode 100644 frame/3/symm/3mh/bli_symm3mh_entry.h create mode 100644 frame/3/symm/4mh/bli_symm4mh.c create mode 100644 frame/3/symm/4mh/bli_symm4mh.h create mode 100644 frame/3/symm/4mh/bli_symm4mh_entry.c create mode 100644 frame/3/symm/4mh/bli_symm4mh_entry.h create mode 100644 frame/3/syr2k/3mh/bli_syr2k3mh.c create mode 100644 frame/3/syr2k/3mh/bli_syr2k3mh.h create mode 100644 frame/3/syr2k/3mh/bli_syr2k3mh_entry.c create mode 100644 frame/3/syr2k/3mh/bli_syr2k3mh_entry.h create mode 100644 frame/3/syr2k/4mh/bli_syr2k4mh.c create mode 100644 frame/3/syr2k/4mh/bli_syr2k4mh.h create mode 100644 frame/3/syr2k/4mh/bli_syr2k4mh_entry.c create mode 100644 frame/3/syr2k/4mh/bli_syr2k4mh_entry.h create mode 100644 frame/3/syrk/3mh/bli_syrk3mh.c create mode 100644 frame/3/syrk/3mh/bli_syrk3mh.h create mode 100644 frame/3/syrk/3mh/bli_syrk3mh_entry.c create mode 100644 frame/3/syrk/3mh/bli_syrk3mh_entry.h create mode 100644 
frame/3/syrk/4mh/bli_syrk4mh.c create mode 100644 frame/3/syrk/4mh/bli_syrk4mh.h create mode 100644 frame/3/syrk/4mh/bli_syrk4mh_entry.c create mode 100644 frame/3/syrk/4mh/bli_syrk4mh_entry.h create mode 100644 frame/3/trmm/bli_trmm_query.c create mode 100644 frame/3/trmm/bli_trmm_query.h create mode 100644 frame/3/trmm3/3mh/bli_trmm33mh.c create mode 100644 frame/3/trmm3/3mh/bli_trmm33mh.h create mode 100644 frame/3/trmm3/3mh/bli_trmm33mh_entry.c create mode 100644 frame/3/trmm3/3mh/bli_trmm33mh_entry.h create mode 100644 frame/3/trmm3/4mh/bli_trmm34mh.c create mode 100644 frame/3/trmm3/4mh/bli_trmm34mh.h create mode 100644 frame/3/trmm3/4mh/bli_trmm34mh_entry.c create mode 100644 frame/3/trmm3/4mh/bli_trmm34mh_entry.h create mode 100644 frame/3/trsm/bli_trsm_query.c create mode 100644 frame/3/trsm/bli_trsm_query.h create mode 100644 frame/base/bli_3m.c create mode 100644 frame/base/bli_3m.h create mode 100644 frame/base/bli_3mh.c create mode 100644 frame/base/bli_3mh.h create mode 100644 frame/base/bli_4mh.c create mode 100644 frame/base/bli_4mh.h create mode 100644 frame/include/bli_kernel_rih_macro_defs.h create mode 100644 frame/include/level0/io/bli_scal2ios.h create mode 100644 frame/include/level0/io/bli_scal2jios.h create mode 100644 frame/include/level0/rih/bli_scal2rihs_mxn_diag.h create mode 100644 frame/include/level0/rih/bli_scal2rihs_mxn_uplo.h create mode 100644 frame/include/level0/rih/bli_setrihs_mxn_diag.h create mode 100644 frame/include/level0/ro/bli_scal2jros.h create mode 100644 frame/include/level0/ro/bli_scal2ros.h create mode 100644 frame/include/level0/rpi/bli_scal2jrpis.h create mode 100644 frame/include/level0/rpi/bli_scal2rpis.h diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index fbab40f5b..6fecc035c 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -47,8 +47,10 @@ #include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_4m.h" #include "bli_packm_struc_cxk_3m.h" +#include 
"bli_packm_struc_cxk_rih.h" #include "bli_packm_cxk.h" #include "bli_packm_cxk_4m.h" #include "bli_packm_cxk_3m.h" +#include "bli_packm_cxk_rih.h" diff --git a/frame/1m/packm/bli_packm_blk_var2.c b/frame/1m/packm/bli_packm_blk_var2.c index 39c2e1179..5240c60c0 100644 --- a/frame/1m/packm/bli_packm_blk_var2.c +++ b/frame/1m/packm/bli_packm_blk_var2.c @@ -63,6 +63,7 @@ typedef void (*FUNCPTR_T)( extern func_t* packm_struc_cxk_kers; extern func_t* packm_struc_cxk_4m_kers; extern func_t* packm_struc_cxk_3m_kers; +extern func_t* packm_struc_cxk_rih_kers; void bli_packm_blk_var2( obj_t* c, @@ -153,6 +154,9 @@ void bli_packm_blk_var2( obj_t* c, // Choose the correct func_t object based on the pack_t schema. if ( bli_is_4m_packed( schema ) ) packm_kers = packm_struc_cxk_4m_kers; else if ( bli_is_3m_packed( schema ) ) packm_kers = packm_struc_cxk_3m_kers; + else if ( bli_is_ro_packed( schema ) || + bli_is_io_packed( schema ) || + bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; else packm_kers = packm_struc_cxk_kers; // Query the datatype-specific function pointer from the func_t object. diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 37e29708d..0147cc2f1 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -40,6 +40,7 @@ blksz_t* packm_mult_nvec; func_t* packm_struc_cxk_kers; func_t* packm_struc_cxk_4m_kers; func_t* packm_struc_cxk_3m_kers; +func_t* packm_struc_cxk_rih_kers; packm_t* packm_cntl_row; packm_t* packm_cntl_col; @@ -74,6 +75,13 @@ void bli_packm_cntl_init() bli_cpackm_struc_cxk_3m, FALSE, bli_zpackm_struc_cxk_3m, FALSE ); + packm_struc_cxk_rih_kers + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + bli_cpackm_struc_cxk_rih, FALSE, + bli_zpackm_struc_cxk_rih, FALSE ); + // Create blocksize objects for m and n register blocking. 
We will attach // these to the packm control node so they can be used to (a) allocate a @@ -146,6 +154,7 @@ void bli_packm_cntl_finalize() bli_func_obj_free( packm_struc_cxk_kers ); bli_func_obj_free( packm_struc_cxk_4m_kers ); bli_func_obj_free( packm_struc_cxk_3m_kers ); + bli_func_obj_free( packm_struc_cxk_rih_kers ); bli_cntl_obj_free( packm_cntl_row ); bli_cntl_obj_free( packm_cntl_col ); diff --git a/frame/1m/packm/bli_packm_cxk_rih.c b/frame/1m/packm/bli_packm_cxk_rih.c new file mode 100644 index 000000000..21b711e64 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_rih.c @@ -0,0 +1,290 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_cxk_fp + +typedef void (*FUNCPTR_T)( + conj_t conja, + pack_t schema, + dim_t panel_len, + void* kappa, + void* a, inc_t inca, inc_t lda, + void* p, inc_t ldp + ); + +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 18 + +static FUNCPTR_T ftypes_rih[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_2XK_RIH_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_4XK_RIH_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_6XK_RIH_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_8XK_RIH_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_10XK_RIH_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_RIH_KERNEL, + NULL, 
BLIS_ZPACKM_12XK_RIH_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_14XK_RIH_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_RIH_KERNEL, + NULL, BLIS_ZPACKM_16XK_RIH_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, +}; + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + num_t dt; \ + FUNCPTR_T f; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the micro-panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes_rih[panel_dim][dt]; \ + else f = NULL; \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. */ \ + if ( f != NULL ) \ + { \ + f( conja, \ + schema, \ + panel_len, \ + kappa, \ + a, inca, lda, \ + p, ldp ); \ + } \ + else \ + { \ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict a_r = ( ctype* )a; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + const dim_t inca1 = inca; \ + const dim_t lda1 = lda; \ + const dim_t ldp1 = ldp; \ + dim_t i, j; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). 
*/ \ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2jros)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2ros)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2jios)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2ios)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + 
{ \ + ctype* restrict alpha11 = a_r + (i )*inca1 + (j )*lda1; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp1; \ +\ + PASTEMAC(ch,scal2rpis)( *kappa_cast, \ + *alpha11, \ + *pi11_r ); \ + } \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_cxk_rih ) + diff --git a/frame/1m/packm/bli_packm_cxk_rih.h b/frame/1m/packm/bli_packm_cxk_rih.h new file mode 100644 index 000000000..5106b7b03 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_rih.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_packm_ref_cxk_rih.h" + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_cxk_rih ) + diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 1565b8921..876920691 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -332,6 +332,10 @@ void bli_packm_init_pack( invdiag_t invert_diag, if ( bli_is_3m_packed( pack_schema ) ) ps_p = ( ps_p * 3 ) / 2; + else if ( bli_is_ro_packed( pack_schema ) || + bli_is_io_packed( pack_schema ) || + bli_is_rpi_packed( pack_schema ) ) + ps_p = ps_p / 2; // Store the strides and panel dimension in p. bli_obj_set_incs( rs_p, cs_p, *p ); @@ -373,6 +377,10 @@ void bli_packm_init_pack( invdiag_t invert_diag, if ( bli_is_3m_packed( pack_schema ) ) ps_p = ( ps_p * 3 ) / 2; + else if ( bli_is_ro_packed( pack_schema ) || + bli_is_io_packed( pack_schema ) || + bli_is_rpi_packed( pack_schema ) ) + ps_p = ps_p / 2; // Store the strides and panel dimension in p. 
bli_obj_set_incs( rs_p, cs_p, *p ); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index fcdfd943f..37aef3d6b 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -389,41 +389,46 @@ void PASTEMAC(ch,varname)( \ \ /* Pack the stored triangle of c11 to p11. */ \ { \ - ctype* restrict c11; \ - ctype* restrict p11; \ - dim_t p11_m; \ - dim_t p11_n; \ + dim_t p11_m = panel_dim; \ + dim_t p11_n = panel_dim; \ + dim_t j = diagoffc_abs; \ + ctype* restrict c11 = c + (j )*ldc; \ + ctype* restrict p11 = p + (j )*ldp; \ \ - p11_m = panel_dim; \ - p11_n = panel_dim; \ - j = diagoffc_abs; \ - p11 = p + (j )*ldp; \ - c11 = c + (j )*ldc; \ -\ - PASTEMAC(ch,scal2m)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - conjc, \ - p11_m, \ - p11_n, \ - kappa, \ - c11, rs_c, cs_c, \ - p11, rs_p, cs_p ); \ + PASTEMAC(ch,copym)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + conjc, \ + p11_m, \ + p11_n, \ + c11, rs_c, cs_c, \ + p11, rs_p, cs_p ); \ \ /* If source matrix c is Hermitian, we have to zero out the imaginary components of the diagonal of p11 in case the corresponding elements in c11 were not already zero. */ \ if ( bli_is_hermitian( strucc ) ) \ { \ - /* NOTE: We can directly increment p11 since we are done - using p11 for the remainder of the function. */ \ + ctype* restrict pi11 = p11; \ +\ for ( i = 0; i < p11_m; ++i ) \ { \ - PASTEMAC(ch,seti0s)( *p11 ); \ + PASTEMAC(ch,seti0s)( *pi11 ); \ \ - p11 += rs_p + cs_p; \ + pi11 += rs_p + cs_p; \ } \ } \ +\ + /* Now that the diagonal has been made explicitly Hermitian + (if applicable), we can now safely scale the stored + triangle specified by uploc. 
*/ \ + PASTEMAC(ch,scalm)( BLIS_NO_CONJUGATE, \ + 0, \ + uploc, \ + p11_m, \ + p11_n, \ + kappa, \ + p11, rs_p, cs_p ); \ } \ } \ } diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.c b/frame/1m/packm/bli_packm_struc_cxk_rih.c new file mode 100644 index 000000000..2a15e7cc1 --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.c @@ -0,0 +1,532 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +/* Structure-aware packing front-end: packs one micro-panel of c into p, dispatching on strucc (general, Hermitian/symmetric, triangular), then zero-fills the edge padding up to m_panel_max x n_panel_max. */ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* Determine the dimensions and relative strides of the micro-panel + based on its pack schema. */ \ + if ( bli_is_col_packed( schema ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_row_packed( schema ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. */ \ + PASTEMAC(ch,kername)( conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices. 
*/ \ + PASTEMAC(ch,packm_herm_cxk_rih)( strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + schema, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk_rih)( strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set it to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + } \ +\ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting multiplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. (In this rih variant the branch below is left empty.) */ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + /* We don't need this case if we aren't supporting trsm. + Why? Because trmm's packm control tree node should be + using k dimension multiples of 1 (kr == 1), which means + there will never be zero padding at the far end of a + micro-panel. 
*/ \ + } \ + } \ +\ +\ +/* + { \ + if ( bli_is_col_packed( schema ) ) \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: bp copied", m_panel_max, n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + else if ( bli_is_row_packed( schema ) ) \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_rih: ap copied", m_panel_max, n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ + \ +\ +} + +INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_rih, packm_cxk_rih ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +/* Helper for Hermitian/symmetric micro-panels: panels off the diagonal are packed whole (from the reflected location if unstored); panels that intersect the diagonal are packed as p10, p12 and the stored triangle p11. */ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ) \ +{ \ + bool_t row_stored; \ + bool_t col_stored; \ + doff_t diagoffc_abs; \ + dim_t j; \ +\ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + row_stored = bli_is_col_packed( schema ); \ + col_stored = bli_is_row_packed( schema ); \ +\ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect. 
*/ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) 
*/ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype_r* restrict p_r = ( ctype_r* )p; \ +\ + ctype* restrict c10; \ + ctype_r* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype_r* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then the constraint that + cache blocksizes be a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p_r; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p_r; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername)( conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp ); \ +\ + /* Pack the stored triangle of c11 to p11. 
*/ \ + { \ + dim_t j = diagoffc_abs; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + ctype* restrict c11 = c + (j )*ldc; \ + ctype_r* restrict p11_r = p_r + (j )*ldp; \ +\ + PASTEMAC(ch,scal2rihs_mxn_uplo)( schema, \ + uploc, \ + conjc, \ + panel_dim, \ + kappa, \ + c11, rs_c, cs_c, \ + p11_r, rs_p, cs_p ); \ +\ + /* If we are packing a micro-panel with Hermitian structure, + we must take special care of the diagonal. Now, if kappa + were guaranteed to be unit, all we would need to do is + explicitly zero out the imaginary part of the diagonal of + p11, in case the diagonal of the source matrix contained + garbage (non-zero) imaginary values. HOWEVER, since kappa + can be non-unit, things become a little more complicated. + In general, we must re-apply the kappa scalar to ONLY the + real part of the diagonal of the source matrix and save + the result to the diagonal of p11. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + PASTEMAC3(ch,chr,ch,scal2rihs_mxn_diag)( schema, \ + panel_dim, \ + panel_dim, \ + kappa, \ + c11, rs_c, cs_c, \ + p11_r, rs_p, cs_p ); \ + } \ +\ +/* + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ + p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ + p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_rih, packm_cxk_rih ) + + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +/* Helper for triangular micro-panels: packs the whole panel, writes kappa to the diagonal when diagc is unit, and zeroes the region on the opposite side of the diagonal from uploc. 
*/ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + 
) \ +{ \ + /* Pack the panel. */ \ + PASTEMAC(ch,kername)( conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp ); \ +\ +\ + /* Tweak the panel according to its triangular structure. */ \ + { \ + ctype_r* p_r = ( ctype_r* )p; \ +\ + dim_t j = bli_abs( diagoffp ); \ + ctype_r* p11_r = p_r + (j )*ldp; \ +\ + /* If the diagonal of c is implicitly unit, explicitly set + the diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + PASTEMAC(ch,setrihs_mxn_diag)( schema, \ + panel_dim, \ + panel_dim, \ + kappa, \ + p11_r, rs_p, cs_p ); \ + } \ +\ +\ + /* If requested, invert the diagonal of the packed panel (currently a no-op: the branch is empty). */ \ + if ( invdiag == TRUE ) \ + { \ + /* We don't need this case if we aren't supporting trsm. */ \ + } \ +\ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). */ \ + { \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + uplo_t uplop = uploc; \ +\ + bli_toggle_uplo( uplop ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \ +\ + PASTEMAC(chr,setm)( diagoffp, \ + BLIS_NONUNIT_DIAG, \ + uplop, \ + m_panel, \ + n_panel, \ + zero_r, \ + p_r, rs_p, cs_p ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_rih, packm_cxk_rih ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_rih.h b/frame/1m/packm/bli_packm_struc_cxk_rih.h new file mode 100644 index 000000000..87e5dcead --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_rih.h @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +/* Prototype for the structure-aware rih packing front-end; parameter names mirror the definition (diagoffc). */ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROTCO_BASIC( packm_struc_cxk_rih ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +/* Prototype for the Hermitian/symmetric micro-panel helper. */ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_rih ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +/* Prototype for the triangular micro-panel helper; parameter names mirror the definition (diagoffp). 
*/ \ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_rih ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.c b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.c new file mode 100644 index 000000000..5b52d8adb --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.c @@ -0,0 +1,2082 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Reference rih packing kernel, panel dimension 2: on each of n iterations writes two real values to p (advancing p by ldp and a by lda), chosen by schema (ro, io, or rpi) and scaled by kappa. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* kappa is one, so just copy. conja can be ignored here: conjugation + only negates the imaginary part and we read only the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + 
PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( 
packm_ref_2xk_rih ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Reference rih packing kernel, panel dimension 4: on each of n iterations writes four real values to p (advancing p by ldp and a by lda), chosen by schema (ro, io, or rpi) and scaled by kappa. */ \ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* kappa is one, so just copy. conja can be ignored here: conjugation + only negates the imaginary part and we read only the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + 
if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 
1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_rih ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* 
kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* kappa is one, so just copy. conja can be ignored here: conjugation + only negates the imaginary part and we read only the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += 
ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r 
+ 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + 
PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_rih ) + + +
+/*
+   packm_ref_8xk_rih
+
+   Reference packing kernel that stores one real value per complex input
+   element. The n loop runs n times; each pass reads 8 elements of a that
+   are inca complex elements apart, writes 8 consecutive values of the
+   real type to p, and then advances a by lda complex elements and p by
+   ldp real elements.
+
+   The value written depends on schema:
+     - real-only (ro):       the real part of the element;
+     - imaginary-only (io):  the imaginary part, negated when conja
+                             requests conjugation;
+     - otherwise (rpi):      real part plus imaginary part, the latter
+                             negated when conja requests conjugation.
+
+   The description above is what the code does when kappa equals one.
+   For any other kappa the work is delegated to the scal2{j}ros,
+   scal2{j}ios and scal2{j}rpis scalar macros, which are presumably the
+   kappa-scaled counterparts of those branches -- confirm against their
+   definitions.
+*/
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           conj_t  conja, \
+                           pack_t  schema, \
+                           dim_t   n, \
+                           void*   kappa, \
+                           void*   a, inc_t inca, inc_t lda, \
+                           void*   p,             inc_t ldp  \
+                         ) \
+{ \
+	/* Number of elements packed on each pass of the n loop. */ \
+	const dim_t       mr      = 8; \
+\
+	/* Strides of a when it is addressed as an array of ctype_r. */ \
+	const inc_t       inca_r  = 2 * inca; \
+	const inc_t       lda_r   = 2 * lda; \
+\
+	ctype*            kappa_c = kappa; \
+	ctype*   restrict a_c     = a; \
+	ctype_r* restrict a_re    = ( ctype_r* )a; \
+	ctype_r* restrict a_im    = ( ctype_r* )a + 1; \
+	ctype_r* restrict p_re    = ( ctype_r* )p; \
+\
+	dim_t             i; \
+\
+	if ( bli_is_ro_packed( schema ) ) \
+	{ \
+		if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+		{ \
+			/* Only real parts are copied, so conja is not consulted. */ \
+			for ( ; n != 0; --n, a_re += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,copys)( *(a_re + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2jros)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2ros)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+\
+		return; \
+	} \
+\
+	if ( bli_is_io_packed( schema ) ) \
+	{ \
+		if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+		{ \
+			if ( bli_is_conj( conja ) ) \
+			{ \
+				for ( ; n != 0; --n, a_im += lda_r, p_re += ldp ) \
+				{ \
+					for ( i = 0; i < mr; ++i ) \
+					{ \
+						PASTEMAC(chr,copys)( -*(a_im + i*inca_r), *(p_re + i) ); \
+					} \
+				} \
+			} \
+			else \
+			{ \
+				for ( ; n != 0; --n, a_im += lda_r, p_re += ldp ) \
+				{ \
+					for ( i = 0; i < mr; ++i ) \
+					{ \
+						PASTEMAC(chr,copys)( *(a_im + i*inca_r), *(p_re + i) ); \
+					} \
+				} \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2jios)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2ios)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+\
+		return; \
+	} \
+\
+	/* Every other schema is handled as real+imaginary (rpi). */ \
+	if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+	{ \
+		if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_re += lda_r, a_im += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,add3s)( *(a_re + i*inca_r), -*(a_im + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_re += lda_r, a_im += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,add3s)( *(a_re + i*inca_r), *(a_im + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+	} \
+	else if ( bli_is_conj( conja ) ) \
+	{ \
+		for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+		{ \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,scal2jrpis)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+		{ \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,scal2rpis)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_rih )
+
+
+
+/*
+   packm_ref_10xk_rih
+
+   Reference packing kernel that stores one real value per complex input
+   element. The n loop runs n times; each pass reads 10 elements of a that
+   are inca complex elements apart, writes 10 consecutive values of the
+   real type to p, and then advances a by lda complex elements and p by
+   ldp real elements.
+
+   The value written depends on schema:
+     - real-only (ro):       the real part of the element;
+     - imaginary-only (io):  the imaginary part, negated when conja
+                             requests conjugation;
+     - otherwise (rpi):      real part plus imaginary part, the latter
+                             negated when conja requests conjugation.
+
+   The description above is what the code does when kappa equals one.
+   For any other kappa the work is delegated to the scal2{j}ros,
+   scal2{j}ios and scal2{j}rpis scalar macros, which are presumably the
+   kappa-scaled counterparts of those branches -- confirm against their
+   definitions.
+*/
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           conj_t  conja, \
+                           pack_t  schema, \
+                           dim_t   n, \
+                           void*   kappa, \
+                           void*   a, inc_t inca, inc_t lda, \
+                           void*   p,             inc_t ldp  \
+                         ) \
+{ \
+	/* Number of elements packed on each pass of the n loop. */ \
+	const dim_t       mr      = 10; \
+\
+	/* Strides of a when it is addressed as an array of ctype_r. */ \
+	const inc_t       inca_r  = 2 * inca; \
+	const inc_t       lda_r   = 2 * lda; \
+\
+	ctype*            kappa_c = kappa; \
+	ctype*   restrict a_c     = a; \
+	ctype_r* restrict a_re    = ( ctype_r* )a; \
+	ctype_r* restrict a_im    = ( ctype_r* )a + 1; \
+	ctype_r* restrict p_re    = ( ctype_r* )p; \
+\
+	dim_t             i; \
+\
+	if ( bli_is_ro_packed( schema ) ) \
+	{ \
+		if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+		{ \
+			/* Only real parts are copied, so conja is not consulted. */ \
+			for ( ; n != 0; --n, a_re += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,copys)( *(a_re + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2jros)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2ros)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+\
+		return; \
+	} \
+\
+	if ( bli_is_io_packed( schema ) ) \
+	{ \
+		if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+		{ \
+			if ( bli_is_conj( conja ) ) \
+			{ \
+				for ( ; n != 0; --n, a_im += lda_r, p_re += ldp ) \
+				{ \
+					for ( i = 0; i < mr; ++i ) \
+					{ \
+						PASTEMAC(chr,copys)( -*(a_im + i*inca_r), *(p_re + i) ); \
+					} \
+				} \
+			} \
+			else \
+			{ \
+				for ( ; n != 0; --n, a_im += lda_r, p_re += ldp ) \
+				{ \
+					for ( i = 0; i < mr; ++i ) \
+					{ \
+						PASTEMAC(chr,copys)( *(a_im + i*inca_r), *(p_re + i) ); \
+					} \
+				} \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2jios)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2ios)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+\
+		return; \
+	} \
+\
+	/* Every other schema is handled as real+imaginary (rpi). */ \
+	if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+	{ \
+		if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_re += lda_r, a_im += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,add3s)( *(a_re + i*inca_r), -*(a_im + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_re += lda_r, a_im += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,add3s)( *(a_re + i*inca_r), *(a_im + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+	} \
+	else if ( bli_is_conj( conja ) ) \
+	{ \
+		for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+		{ \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,scal2jrpis)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+		{ \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,scal2rpis)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_rih )
+
+
+
+/*
+   packm_ref_12xk_rih
+
+   Reference packing kernel that stores one real value per complex input
+   element. The n loop runs n times; each pass reads 12 elements of a that
+   are inca complex elements apart, writes 12 consecutive values of the
+   real type to p, and then advances a by lda complex elements and p by
+   ldp real elements.
+
+   The value written depends on schema:
+     - real-only (ro):       the real part of the element;
+     - imaginary-only (io):  the imaginary part, negated when conja
+                             requests conjugation;
+     - otherwise (rpi):      real part plus imaginary part, the latter
+                             negated when conja requests conjugation.
+
+   The description above is what the code does when kappa equals one.
+   For any other kappa the work is delegated to the scal2{j}ros,
+   scal2{j}ios and scal2{j}rpis scalar macros, which are presumably the
+   kappa-scaled counterparts of those branches -- confirm against their
+   definitions.
+*/
+#undef  GENTFUNCCO
+#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
+\
+void PASTEMAC(ch,varname)( \
+                           conj_t  conja, \
+                           pack_t  schema, \
+                           dim_t   n, \
+                           void*   kappa, \
+                           void*   a, inc_t inca, inc_t lda, \
+                           void*   p,             inc_t ldp  \
+                         ) \
+{ \
+	/* Number of elements packed on each pass of the n loop. */ \
+	const dim_t       mr      = 12; \
+\
+	/* Strides of a when it is addressed as an array of ctype_r. */ \
+	const inc_t       inca_r  = 2 * inca; \
+	const inc_t       lda_r   = 2 * lda; \
+\
+	ctype*            kappa_c = kappa; \
+	ctype*   restrict a_c     = a; \
+	ctype_r* restrict a_re    = ( ctype_r* )a; \
+	ctype_r* restrict a_im    = ( ctype_r* )a + 1; \
+	ctype_r* restrict p_re    = ( ctype_r* )p; \
+\
+	dim_t             i; \
+\
+	if ( bli_is_ro_packed( schema ) ) \
+	{ \
+		if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+		{ \
+			/* Only real parts are copied, so conja is not consulted. */ \
+			for ( ; n != 0; --n, a_re += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,copys)( *(a_re + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2jros)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2ros)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+\
+		return; \
+	} \
+\
+	if ( bli_is_io_packed( schema ) ) \
+	{ \
+		if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+		{ \
+			if ( bli_is_conj( conja ) ) \
+			{ \
+				for ( ; n != 0; --n, a_im += lda_r, p_re += ldp ) \
+				{ \
+					for ( i = 0; i < mr; ++i ) \
+					{ \
+						PASTEMAC(chr,copys)( -*(a_im + i*inca_r), *(p_re + i) ); \
+					} \
+				} \
+			} \
+			else \
+			{ \
+				for ( ; n != 0; --n, a_im += lda_r, p_re += ldp ) \
+				{ \
+					for ( i = 0; i < mr; ++i ) \
+					{ \
+						PASTEMAC(chr,copys)( *(a_im + i*inca_r), *(p_re + i) ); \
+					} \
+				} \
+			} \
+		} \
+		else if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2jios)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(ch,scal2ios)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+				} \
+			} \
+		} \
+\
+		return; \
+	} \
+\
+	/* Every other schema is handled as real+imaginary (rpi). */ \
+	if ( PASTEMAC(ch,eq1)( *kappa_c ) ) \
+	{ \
+		if ( bli_is_conj( conja ) ) \
+		{ \
+			for ( ; n != 0; --n, a_re += lda_r, a_im += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,add3s)( *(a_re + i*inca_r), -*(a_im + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			for ( ; n != 0; --n, a_re += lda_r, a_im += lda_r, p_re += ldp ) \
+			{ \
+				for ( i = 0; i < mr; ++i ) \
+				{ \
+					PASTEMAC(chr,add3s)( *(a_re + i*inca_r), *(a_im + i*inca_r), *(p_re + i) ); \
+				} \
+			} \
+		} \
+	} \
+	else if ( bli_is_conj( conja ) ) \
+	{ \
+		for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+		{ \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,scal2jrpis)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+			} \
+		} \
+	} \
+	else \
+	{ \
+		for ( ; n != 0; --n, a_c += lda, p_re += ldp ) \
+		{ \
+			for ( i = 0; i < mr; ++i ) \
+			{ \
+				PASTEMAC(ch,scal2rpis)( *kappa_c, *(a_c + i*inca), *(p_re + i) ); \
+			} \
+		} \
+	} \
+}
+
+INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_rih )
+ + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ +
ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +13*inca2), *(pi1_r +13) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 
7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), 
*(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +13*inca2), *(pi1_r +13) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +13*inca2), *(pi1_r +13) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 
0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2ios)( 
*kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +10*inca2), -*(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +11*inca2), -*(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +12*inca2), -*(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +13*inca2), -*(alpha1_i +13*inca2), *(pi1_r +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( 
*(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jrpis)( 
*kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_rih ) + +/* packm_ref_16xk_rih: for each of n iterations, reads 16 elements of a (element stride inca; a advances by lda) and writes 16 ctype_r values to p (p advances by ldp). */ +/* schema selects what is stored: the real part (ro), the imaginary part (io), or otherwise real+imaginary (rpi); conja requests conjugation, and when kappa is not one the scal2-family macros (defined elsewhere; presumably the kappa-scaled forms) are used instead of plain copies. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* kappa_cast = kappa; \ + ctype* restrict alpha1 = a; \ + ctype_r* restrict alpha1_r = ( 
ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ +\ +\ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + /* This works regardless of conja since we are only copying + the real part. */ \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_r + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_r + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,copys)( *(alpha1_r +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_r += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jros)( 
*kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2jros)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + 
PASTEMAC(ch,scal2ros)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,copys)( -*(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,copys)( *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 
8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,copys)( *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,copys)( *(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2jios)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 
+ 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2ios)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), -*(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), -*(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), -*(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), -*(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), -*(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), -*(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), -*(alpha1_i + 
6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), -*(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), -*(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), -*(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +10*inca2), -*(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +11*inca2), -*(alpha1_i +11*inca2), *(pi1_r +11) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +12*inca2), -*(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +13*inca2), -*(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +14*inca2), -*(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +15*inca2), -*(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(chr,add3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11) ); \ + 
PASTEMAC(chr,add3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14) ); \ + PASTEMAC(chr,add3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 1*inca), *(pi1_r + 1) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2jrpis)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 0*inca), *(pi1_r + 0) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 1*inca), 
*(pi1_r + 1) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 2*inca), *(pi1_r + 2) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 3*inca), *(pi1_r + 3) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 4*inca), *(pi1_r + 4) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 5*inca), *(pi1_r + 5) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 6*inca), *(pi1_r + 6) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 7*inca), *(pi1_r + 7) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 8*inca), *(pi1_r + 8) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 + 9*inca), *(pi1_r + 9) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +10*inca), *(pi1_r +10) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +11*inca), *(pi1_r +11) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +12*inca), *(pi1_r +12) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +13*inca), *(pi1_r +13) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +14*inca), *(pi1_r +14) ); \ + PASTEMAC(ch,scal2rpis)( *kappa_cast, *(alpha1 +15*inca), *(pi1_r +15) ); \ +\ + alpha1 += lda; \ + pi1_r += ldp; \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_rih ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.h b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.h new file mode 100644 index 000000000..d537aa8d5 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_ref_cxk_rih.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +/* Prototype template for the reference rih packm kernels, instantiated below for the 2xk through 16xk variants. NOTE(review): the 12xk/14xk/16xk definitions are instantiated with INSERT_GENTFUNCCO_BASIC0 while these prototypes use INSERT_GENTPROT_BASIC; confirm both expand over the same set of types. */ +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + pack_t schema, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp \ + ); + +INSERT_GENTPROT_BASIC( packm_ref_2xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_4xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_6xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_8xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_10xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_12xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_14xk_rih ) +INSERT_GENTPROT_BASIC( packm_ref_16xk_rih ) + diff --git a/frame/3/gemm/3m/bli_gemm3m_cntl.c b/frame/3/gemm/3m/bli_gemm3m_cntl.c index 081e54e63..0988c87f8 100644 --- a/frame/3/gemm/3m/bli_gemm3m_cntl.c +++ b/frame/3/gemm/3m/bli_gemm3m_cntl.c @@ -70,38 +70,38 @@ void bli_gemm3m_cntl_init() // blocksizes. gemm3m_mc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm3m_nc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm3m_kc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm3m_mr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm3m_nr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm3m_kr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); diff --git a/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c index 6336a32d8..2e5f8c91c 100644 --- 
a/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c +++ b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c @@ -47,25 +47,17 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ) \ { \ - ctype_r ct_r[ PASTEMAC(chr,mr) * \ - PASTEMAC(chr,nr) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ct_i[ PASTEMAC(chr,mr) * \ - PASTEMAC(chr,nr) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = 1; \ - const inc_t cs_ct = PASTEMAC(chr,mr); \ -\ -\ ctype_r ab_r[ PASTEMAC(chr,mr) * \ PASTEMAC(chr,nr) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ ctype_r ab_i[ PASTEMAC(chr,mr) * \ PASTEMAC(chr,nr) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = PASTEMAC(chr,mr); \ -\ + ctype_r ab_rpi[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ab; \ + inc_t cs_ab; \ \ const dim_t m = PASTEMAC(chr,mr); \ const dim_t n = PASTEMAC(chr,nr); \ @@ -75,29 +67,28 @@ void PASTEMAC(ch,varname)( \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ps_a; \ - ctype_r* restrict a_ri = ( ctype_r* )a + 2*ps_a; \ + ctype_r* restrict a_rpi = ( ctype_r* )a + 2*ps_a; \ \ ctype_r* restrict b_r = ( ctype_r* )b; \ ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*ps_b; \ + ctype_r* restrict b_rpi = ( ctype_r* )b + 2*ps_b; \ \ - ctype_r* restrict c_r = ( ctype_r* )c; \ - ctype_r* restrict c_i = ( ctype_r* )c + 1; \ -\ - const inc_t rs_c2 = 2 * rs_c; \ - const inc_t cs_c2 = 2 * cs_c; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ - ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - ctype_r 
beta_i = PASTEMAC(ch,imag)( *beta ); \ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ \ void* a_next = bli_auxinfo_next_a( data ); \ void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incab, ldab; \ \ dim_t i, j; \ \ @@ -106,64 +97,54 @@ void PASTEMAC(ch,varname)( \ allow an alpha with non-zero imaginary component to be passed in, because it can't be applied properly using the 3m method. If alpha is not real, then something is very wrong. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ \ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ct the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ { \ - /* We can handle a non-zero imaginary component on beta, but to do - so we have to manually scale c and then use beta == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scal2ris)( beta_r, \ - beta_i, \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2), \ - *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct) ); \ -\ - /* Use beta.r == 1.0. */ \ - beta_r = *one_r; \ + rs_ab = n; n_iter = m; incc = cs_c; \ + cs_ab = 1; n_elem = n; ldc = rs_c; \ } \ - else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + else /* column-stored or general stride */ \ { \ - /* Copy c to ct without scaling. 
*/ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2), \ - *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct) ); \ - } \ - else \ - { \ - /* Since beta is zero, ct can remain uninitialized since it - will be overwritten by the micro-kernel. */ \ + rs_ab = 1; n_iter = n; incc = rs_c; \ + cs_ab = m; n_elem = m; ldc = cs_c; \ } \ + incab = 1; \ + ldab = n_elem; \ \ \ - /* c.r = beta.r * c.r + a.r * b.r - a.i * b.i; - c.i = beta.r * c.i + (a.r + a.i)(b.r + b.i) - a.r * b.r - a.i * b.i; */ \ + /* The following gemm micro-kernel calls implement all "phases" of the + 3m method: + + c = beta * c; + c_r += + a_r * b_r - a_i * b_i; + c_i += (a_r + a_i)(b_r + b_i) - a_r * b_r - a_i * b_i; + + NOTE: Scaling by alpha_r is not shown for space reasons. */ \ +\ \ bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ \ - /* ab.r = alpha.r * a.r * b.r; */ \ + /* ab_r = alpha_r * a_r * b_r; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_r, \ b_r, \ zero_r, \ ab_r, rs_ab, cs_ab, \ data ); \ \ - bli_auxinfo_set_next_ab( a_ri, b_ri, *data ); \ + bli_auxinfo_set_next_ab( a_rpi, b_rpi, *data ); \ \ - /* ab.i = alpha.r * a.i * b.i; */ \ + /* ab_i = alpha_r * a_i * b_i; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_i, \ b_i, \ zero_r, \ @@ -172,47 +153,158 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ \ - /* ct.i = alpha.r * a.ri * b.ri; */ \ + /* ab_rpi = alpha_r * a_rpi * b_rpi; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ - a_ri, \ - b_ri, \ - &beta_r, \ - ct_i, rs_ct, cs_ct, \ + alpha_r, \ + a_rpi, \ + b_rpi, \ + zero_r, \ + ab_rpi, rs_ab, cs_ab, \ data ); \ \ \ - /* ct.r = beta.r * ct.r + ab.r; - ct.r = ct.r - ab.i; - ct.i = ct.i - ab.r; - ct.i = ct.i - ab.i; */ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ + /* How we accumulate the intermediate matrix products stored in ab_r, + ab_i, and ab_rpi depends on
the value of beta. */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r gammat_r = *(ct_r + i*rs_ct + j*cs_ct); \ - ctype_r gammat_i = *(ct_i + i*rs_ct + j*cs_ct); \ -\ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ + /* c = beta * c; + c_r = c_r + ab_r - ab_i; + c_i = c_i + ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ { \ - PASTEMAC(chr,copys)( *zero_r, gammat_r ); \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; \ +\ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,xpbyris)( gamma11t_r, \ + gamma11t_i, \ + beta_r, \ + beta_i, \ + *gamma11_r, \ + *gamma11_i ); \ } \ - else \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ab_r - ab_i; + c_i = c_i + ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ { \ - PASTEMAC(chr,scals)( beta_r, gammat_r ); \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; 
\ +\ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,addris)( gamma11t_r, \ + gamma11t_i, \ + *gamma11_r, \ + *gamma11_i ); \ } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ab_r - ab_i; + c_i = beta_r * c_i + ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; \ \ - PASTEMAC(chr,adds)( alphabeta_r, gammat_r ); \ - PASTEMAC(chr,subs)( alphabeta_i, gammat_r ); \ - PASTEMAC(chr,subs)( alphabeta_r, gammat_i ); \ - PASTEMAC(chr,subs)( alphabeta_i, gammat_i ); \ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ \ - /* Store the local values (from ct) back to c. 
*/ \ - PASTEMAC(ch,copyris)( gammat_r, \ - gammat_i, \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t_r, beta_r, *gamma11_r ); \ + PASTEMAC(chr,xpbys)( gamma11t_i, beta_r, *gamma11_i ); \ + } \ + } \ + else /* if ( PASTEMAC(chr,eq0)( beta_r ) ) */ \ + { \ + /* c_r = ab_r - ab_i; + c_i = ab_rpi - ab_r - ab_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \ + const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \ + const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ + ctype_r gamma11t_r; \ + ctype_r gamma11t_i; \ +\ + PASTEMAC(ch,copyris)( alphabeta11_r, \ + -alphabeta11_r, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,subris)( alphabeta11_i, \ + alphabeta11_i, \ + gamma11t_r, \ + gamma11t_i ); \ +\ + PASTEMAC(chr,adds)( alphabeta11_rpi, \ + gamma11t_i ); \ +\ + PASTEMAC(ch,copyris)( gamma11t_r, \ + gamma11t_i, \ + *gamma11_r, \ + *gamma11_i ); \ + } \ } \ } diff --git a/frame/3/gemm/3mh/bli_gemm3mh.c b/frame/3/gemm/3mh/bli_gemm3mh.c new file mode 100644 index 000000000..b7450bd1a --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh.c @@ -0,0 +1,101 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_gemm3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. 
+ if ( bli_obj_is_complex( *c ) ) + bli_gemm3mh_entry( alpha, a, b, beta, c ); + else + bli_gemm_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( gemm3mh, gemm3mh ) + diff --git a/frame/3/gemm/3mh/bli_gemm3mh.h b/frame/3/gemm/3mh/bli_gemm3mh.h new file mode 100644 index 000000000..0d236a39e --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_gemm3mh_cntl.h" +#include "bli_gemm3mh_entry.h" + +#include "bli_gemm3mh_ukr_ref.h" + + +// +// Prototype object-based interface. +// +void bli_gemm3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( gemm3mh ) + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_cntl.c b/frame/3/gemm/3mh/bli_gemm3mh_cntl.c new file mode 100644 index 000000000..ec81d5b2d --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_cntl.c @@ -0,0 +1,402 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +blksz_t* gemm3mh_mc; +blksz_t* gemm3mh_nc; +blksz_t* gemm3mh_kc; +blksz_t* gemm3mh_mr; +blksz_t* gemm3mh_nr; +blksz_t* gemm3mh_kr; + +func_t* gemm3mh_ukrs; + +packm_t* gemm3mh_packa_cntl_ro; +packm_t* gemm3mh_packb_cntl_ro; +packm_t* gemm3mh_packa_cntl_io; +packm_t* gemm3mh_packb_cntl_io; +packm_t* gemm3mh_packa_cntl_rpi; +packm_t* gemm3mh_packb_cntl_rpi; + +gemm_t* gemm3mh_cntl_bp_ke; +gemm_t* gemm3mh_cntl_op_bp_ro; +gemm_t* gemm3mh_cntl_mm_op_ro; +gemm_t* gemm3mh_cntl_vl_mm_ro; +gemm_t* gemm3mh_cntl_op_bp_io; +gemm_t* gemm3mh_cntl_mm_op_io; +gemm_t* gemm3mh_cntl_vl_mm_io; +gemm_t* gemm3mh_cntl_op_bp_rpi; +gemm_t* gemm3mh_cntl_mm_op_rpi; +gemm_t* gemm3mh_cntl_vl_mm_rpi; + +gemm_t* gemm3mh_cntl_ro; +gemm_t* gemm3mh_cntl_io; +gemm_t* gemm3mh_cntl_rpi; + + +void bli_gemm3mh_cntl_init() +{ + // Create blocksize objects for each dimension. + // NOTE: the complex blocksizes for 3mh are equal to their + // corresponding real domain counterparts. 
+ gemm3mh_mc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, + BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); + gemm3mh_nc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, + BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); + gemm3mh_kc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, + BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); + gemm3mh_mr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, + BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); + gemm3mh_nr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, + BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); + gemm3mh_kr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, + BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); + + + // Attach the register blksz_t objects as sub-blocksizes to the cache + // blksz_t objects. + bli_blksz_obj_attach_to( gemm3mh_mr, gemm3mh_mc ); + bli_blksz_obj_attach_to( gemm3mh_nr, gemm3mh_nc ); + bli_blksz_obj_attach_to( gemm3mh_kr, gemm3mh_kc ); + + + // Create function pointer object for each datatype-specific gemm + // micro-kernel. + gemm3mh_ukrs + = + bli_func_obj_create( + NULL, FALSE, + NULL, FALSE, + BLIS_CGEMM3MH_UKERNEL, BLIS_CGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_ZGEMM3MH_UKERNEL, BLIS_ZGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS ); + + + // Create control tree objects for packm operations (real only). + gemm3mh_packa_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_mr, + gemm3mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_RO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm3mh_packb_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_kr, + gemm3mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ BLIS_PACKED_COL_PANELS_RO, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (imag only). + gemm3mh_packa_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_mr, + gemm3mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_IO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm3mh_packb_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_kr, + gemm3mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_IO, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (real+imag). + gemm3mh_packa_cntl_rpi + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_mr, + gemm3mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_RPI, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm3mh_packb_cntl_rpi + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_kr, + gemm3mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_RPI, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + gemm3mh_cntl_bp_ke + = + bli_gemm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm3mh_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // + // Create control tree for A.real * B.real. + // + + // Create control tree object for outer panel (to block-panel) + // problem. 
(real x real) + gemm3mh_cntl_op_bp_ro + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3mh_mc, + gemm3mh_ukrs, + NULL, + gemm3mh_packa_cntl_ro, + gemm3mh_packb_cntl_ro, + NULL, + gemm3mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real x real) + gemm3mh_cntl_mm_op_ro + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3mh_kc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_op_bp_ro, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (real x real) + gemm3mh_cntl_vl_mm_ro + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_nc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_mm_op_ro, + NULL ); + + // + // Create control tree for A.imag * B.imag. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (imag x imag) + gemm3mh_cntl_op_bp_io + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3mh_mc, + gemm3mh_ukrs, + NULL, + gemm3mh_packa_cntl_io, + gemm3mh_packb_cntl_io, + NULL, + gemm3mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (imag x imag) + gemm3mh_cntl_mm_op_io + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3mh_kc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_op_bp_io, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (imag x imag) + gemm3mh_cntl_vl_mm_io + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_nc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_mm_op_io, + NULL ); + + // + // Create control tree for (A.real + A.imag) * (B.real + B.imag). + // + + // Create control tree object for outer panel (to block-panel) + // problem. 
(real+imag x real+imag) + gemm3mh_cntl_op_bp_rpi + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3mh_mc, + gemm3mh_ukrs, + NULL, + gemm3mh_packa_cntl_rpi, + gemm3mh_packb_cntl_rpi, + NULL, + gemm3mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real+imag x real+imag) + gemm3mh_cntl_mm_op_rpi + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3mh_kc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_op_bp_rpi, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (real+imag x real+imag) + gemm3mh_cntl_vl_mm_rpi + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3mh_nc, + gemm3mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm3mh_cntl_mm_op_rpi, + NULL ); + + // Alias the "master" gemm control tree to a shorter name. + gemm3mh_cntl_ro = gemm3mh_cntl_vl_mm_ro; + gemm3mh_cntl_io = gemm3mh_cntl_vl_mm_io; + gemm3mh_cntl_rpi = gemm3mh_cntl_vl_mm_rpi; + +} + +void bli_gemm3mh_cntl_finalize() +{ + bli_blksz_obj_free( gemm3mh_mc ); + bli_blksz_obj_free( gemm3mh_nc ); + bli_blksz_obj_free( gemm3mh_kc ); + bli_blksz_obj_free( gemm3mh_mr ); + bli_blksz_obj_free( gemm3mh_nr ); + bli_blksz_obj_free( gemm3mh_kr ); + + bli_func_obj_free( gemm3mh_ukrs ); + + bli_cntl_obj_free( gemm3mh_packa_cntl_ro ); + bli_cntl_obj_free( gemm3mh_packb_cntl_ro ); + bli_cntl_obj_free( gemm3mh_packa_cntl_io ); + bli_cntl_obj_free( gemm3mh_packb_cntl_io ); + bli_cntl_obj_free( gemm3mh_packa_cntl_rpi ); + bli_cntl_obj_free( gemm3mh_packb_cntl_rpi ); + + bli_cntl_obj_free( gemm3mh_cntl_bp_ke ); + bli_cntl_obj_free( gemm3mh_cntl_op_bp_ro ); + bli_cntl_obj_free( gemm3mh_cntl_mm_op_ro ); + bli_cntl_obj_free( gemm3mh_cntl_vl_mm_ro ); + bli_cntl_obj_free( gemm3mh_cntl_op_bp_io ); + bli_cntl_obj_free( gemm3mh_cntl_mm_op_io ); + bli_cntl_obj_free( gemm3mh_cntl_vl_mm_io ); + bli_cntl_obj_free( gemm3mh_cntl_op_bp_rpi ); + 
bli_cntl_obj_free( gemm3mh_cntl_mm_op_rpi ); + bli_cntl_obj_free( gemm3mh_cntl_vl_mm_rpi ); + +} + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_cntl.h b/frame/3/gemm/3mh/bli_gemm3mh_cntl.h new file mode 100644 index 000000000..0d3fc6d49 --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void bli_gemm3mh_cntl_init( void ); +void bli_gemm3mh_cntl_finalize( void ); + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_entry.c b/frame/3/gemm/3mh/bli_gemm3mh_entry.c new file mode 100644 index 000000000..3ae00de3d --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_entry.c @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_gemm3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_gemm_front( alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/gemm/3mh/bli_gemm3mh_entry.h b/frame/3/gemm/3mh/bli_gemm3mh_entry.h new file mode 100644 index 000000000..9c200db67 --- /dev/null +++ b/frame/3/gemm/3mh/bli_gemm3mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c new file mode 100644 index 000000000..3d74f2234 --- /dev/null +++ b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.c @@ -0,0 +1,278 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ct[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + ctype_r* restrict a_cast = ( ctype_r* )a; \ +\ + ctype_r* restrict b_cast = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + const pack_t schema = bli_auxinfo_schema_a( data ); \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incct, ldct; \ +\ + dim_t i, j; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 3mh method. + If alpha is not real, then something is very wrong.
*/ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ct the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ + { \ + rs_ct = n; n_iter = m; incc = cs_c; \ + cs_ct = 1; n_elem = n; ldc = rs_c; \ + } \ + else /* column-stored or general stride */ \ + { \ + rs_ct = 1; n_iter = n; incc = rs_c; \ + cs_ct = m; n_elem = m; ldc = cs_c; \ + } \ + incct = 1; \ + ldct = n_elem; \ +\ +\ + /* The following gemm micro-kernel call implements one "phase" of the + 3m method: + + c = beta * c; + c_r += + a_r * b_r - a_i * b_i; + c_i += (a_r + a_i)(b_r + b_i) - a_r * b_r - a_i * b_i; + + NOTE: Scaling by alpha_r is not shown for space reasons. */ \ +\ +\ + /* ct = alpha_r * a * b; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_cast, \ + b_cast, \ + zero_r, \ + ct, rs_ct, cs_ct, \ + data ); \ +\ +\ + /* How we accumulate the intermediate matrix product stored in ct + depends on (a) the schemas of A and B (they are always the same), + and (b) the value of beta. 
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ + { \ + /* c = beta * c; + c_r = c_r + ct; + c_i = c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(ch,xpbyris)( gamma11t, \ + -gamma11t, \ + beta_r, \ + beta_i, \ + *gamma11_r, \ + *gamma11_i ); \ + } \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ct; + c_i = c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,subs)( gamma11t, *gamma11_i ); \ + } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ct; + c_i = beta_r * c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t, beta_r, *gamma11_r ); \ + PASTEMAC(chr,xpbys)( -gamma11t, beta_r, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = ct; + c_i = -ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); 
\ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_i ); \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r - ct; + c_i = c_i - ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,subs)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,subs)( gamma11t, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = -ct; + c_i = -ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_r ); \ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + 0; + c_i = c_i + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = 0; + c_i = ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + 
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,set0s)( *gamma11_r ); \ + PASTEMAC(chr,copys)( gamma11t, *gamma11_i ); \ + } \ + } \ + } \ +\ +\ +/*PASTEMAC(chr,fprintm)( stdout, "gemm3mh_ukr: b1", k, n, b_cast, n, 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemm3mh_ukr: a1", m, k, a_cast, 1, m, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNCCO_BASIC( gemm3mh_ukr_ref, GEMM_UKERNEL ) + diff --git a/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h new file mode 100644 index 000000000..5b34fff0d --- /dev/null +++ b/frame/3/gemm/3mh/ukernels/bli_gemm3mh_ukr_ref.h @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( gemm3mh_ukr_ref ) + diff --git a/frame/3/gemm/4m/bli_gemm4m_cntl.c b/frame/3/gemm/4m/bli_gemm4m_cntl.c index 630a55753..ab0943221 100644 --- a/frame/3/gemm/4m/bli_gemm4m_cntl.c +++ b/frame/3/gemm/4m/bli_gemm4m_cntl.c @@ -67,38 +67,38 @@ void bli_gemm4m_cntl_init() // parts), we reduce KC by a factor of 2 to compensate.
gemm4m_mc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); gemm4m_nc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); gemm4m_kc = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KC_S/2, BLIS_MAXIMUM_KC_S/2, BLIS_DEFAULT_KC_D/2, BLIS_MAXIMUM_KC_D/2 ); gemm4m_mr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); gemm4m_nr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); gemm4m_kr = - bli_blksz_obj_create( 0, 0, - 0, 0, + bli_blksz_obj_create( 0, 0, + 0, 0, BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); diff --git a/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c index e4c171c2f..078cc2a77 100644 --- a/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c +++ b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c @@ -53,9 +53,8 @@ void PASTEMAC(ch,varname)( \ ctype_r ct_i[ PASTEMAC(chr,mr) * \ PASTEMAC(chr,nr) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = 1; \ - const inc_t cs_ct = PASTEMAC(chr,mr); \ -\ + inc_t rs_ct; \ + inc_t cs_ct; \ \ const dim_t m = PASTEMAC(chr,mr); \ const dim_t n = PASTEMAC(chr,nr); \ @@ -68,23 +67,26 @@ void PASTEMAC(ch,varname)( \ \ ctype_r* restrict b_r = ( ctype_r* )b; \ ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \ -\ - ctype_r* restrict c_r = ( ctype_r* )c; \ - ctype_r* restrict c_i = ( ctype_r* )c + 1; \ -\ - const inc_t rs_c2 = 2 * rs_c; \ - const inc_t cs_c2 = 2 * cs_c; \ \ ctype_r* restrict one_r = PASTEMAC(chr,1); \ ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ - ctype_r 
alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ - ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \ + ctype_r m_alpha_r = -(*alpha_r); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ \ void* a_next = bli_auxinfo_next_a( data ); \ void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incct, ldct; \ \ dim_t i, j; \ \ @@ -93,20 +95,43 @@ void PASTEMAC(ch,varname)( \ allow an alpha with non-zero imaginary component to be passed in, because it can't be applied properly using the 4m method. If alpha is not real, then something is very wrong. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ \ - /* c.r = beta.r * c.r + alpha.r * a.r * b.r - - alpha.r * a.i * b.i; - c.i = beta.r * c.i + alpha.r * a.r * b.i - + alpha.r * a.i * b.r; */ \ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ct the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ + { \ + rs_ct = n; n_iter = m; incc = cs_c; \ + cs_ct = 1; n_elem = n; ldc = rs_c; \ + } \ + else /* column-stored or general stride */ \ + { \ + rs_ct = 1; n_iter = n; incc = rs_c; \ + cs_ct = m; n_elem = m; ldc = cs_c; \ + } \ + incct = 1; \ + ldct = n_elem; \ +\ +\ + /* The following gemm micro-kernel calls implement all "phases" of + the 4m method: + + c = beta * c; + c_r += a_r * b_r - a_i * b_i; + c_i += a_r * b_i + a_i * b_r; + + NOTE: Scaling by alpha_r is not shown for space reasons. 
*/ \ +\ \ bli_auxinfo_set_next_ab( a_r, b_i, *data ); \ \ - /* ct.r = alpha.r * a.r * b.r; */ \ + /* ct_r = alpha_r * a_r * b_r; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_r, \ b_r, \ zero_r, \ @@ -115,9 +140,9 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_i, b_r, *data ); \ \ - /* ct.i = alpha.r * a.r * b.i; */ \ + /* ct_i = alpha_r * a_r * b_i; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_r, \ b_i, \ zero_r, \ @@ -126,9 +151,9 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ \ - /* ct.i += alpha.r * a.i * b.r; */ \ + /* ct_i += alpha_r * a_i * b_r; */ \ PASTEMAC(chr,gemmukr)( k, \ - &alpha_r, \ + alpha_r, \ a_i, \ b_r, \ one_r, \ @@ -137,7 +162,7 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ \ - /* ct.r += -alpha.r * a.i * b.i; */ \ + /* ct_r += -alpha_r * a_i * b_i; */ \ PASTEMAC(chr,gemmukr)( k, \ &m_alpha_r, \ a_i, \ @@ -147,38 +172,78 @@ void PASTEMAC(ch,varname)( \ data ); \ \ \ - /* Accumulate the final result in ct back to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ + /* How we accumulate the intermediate matrix product stored in ct_r + and ct_i depends on the value of beta. 
*/ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,addris)( *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct), \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct), \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ - } \ - else \ - { \ - ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ + /* c = beta * c + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,xpbyris)( *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct), \ - beta_r, \ - beta_i, \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ + PASTEMAC(ch,xpbyris)( gamma11t_r, \ + gamma11t_i, \ + beta_r, \ + beta_i, \ + *gamma11_r, \ + *gamma11_i ); \ + } \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ct_r; */ \ + /* c_i = c_i + ct_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t_r, *gamma11_r ); \ + PASTEMAC(chr,adds)( gamma11t_i, *gamma11_i 
); \ + } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ct_r; */ \ + /* c_i = beta_r * c_i + ct_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t_r, beta_r, *gamma11_r ); \ + PASTEMAC(chr,xpbys)( gamma11t_i, beta_r, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = ct_r; */ \ + /* c_i = ct_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \ + const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( gamma11t_r, *gamma11_r ); \ + PASTEMAC(chr,copys)( gamma11t_i, *gamma11_i ); \ + } \ } \ } diff --git a/frame/3/gemm/4mh/bli_gemm4mh.c b/frame/3/gemm/4mh/bli_gemm4mh.c new file mode 100644 index 000000000..c5d2b3fcb --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh.c @@ -0,0 +1,101 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_gemm4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_gemm4mh_entry( alpha, a, b, beta, c ); + else + bli_gemm_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( gemm4mh, gemm4mh ) + diff --git a/frame/3/gemm/4mh/bli_gemm4mh.h b/frame/3/gemm/4mh/bli_gemm4mh.h new file mode 100644 index 000000000..54fb0ece0 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_gemm4mh_cntl.h" +#include "bli_gemm4mh_entry.h" + +#include "bli_gemm4mh_ukr_ref.h" + + +// +// Prototype object-based interface. +// +void bli_gemm4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( gemm4mh ) + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_cntl.c b/frame/3/gemm/4mh/bli_gemm4mh_cntl.c new file mode 100644 index 000000000..a29a0292c --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_cntl.c @@ -0,0 +1,431 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +blksz_t* gemm4mh_mc; +blksz_t* gemm4mh_nc; +blksz_t* gemm4mh_kc; +blksz_t* gemm4mh_mr; +blksz_t* gemm4mh_nr; +blksz_t* gemm4mh_kr; + +func_t* gemm4mh_ukrs; + +packm_t* gemm4mh_packa_cntl_ro; +packm_t* gemm4mh_packb_cntl_ro; +packm_t* gemm4mh_packa_cntl_io; +packm_t* gemm4mh_packb_cntl_io; + +gemm_t* gemm4mh_cntl_bp_ke; +gemm_t* gemm4mh_cntl_op_bp_rr; +gemm_t* gemm4mh_cntl_mm_op_rr; +gemm_t* gemm4mh_cntl_vl_mm_rr; +gemm_t* gemm4mh_cntl_op_bp_ri; +gemm_t* gemm4mh_cntl_mm_op_ri; +gemm_t* gemm4mh_cntl_vl_mm_ri; +gemm_t* gemm4mh_cntl_op_bp_ir; +gemm_t* gemm4mh_cntl_mm_op_ir; +gemm_t* gemm4mh_cntl_vl_mm_ir; +gemm_t* gemm4mh_cntl_op_bp_ii; +gemm_t* gemm4mh_cntl_mm_op_ii; +gemm_t* gemm4mh_cntl_vl_mm_ii; + +gemm_t* gemm4mh_cntl_rr; +gemm_t* gemm4mh_cntl_ri; +gemm_t* gemm4mh_cntl_ir; +gemm_t* gemm4mh_cntl_ii; + + +void bli_gemm4mh_cntl_init() +{ + // Create blocksize objects for each dimension. + // NOTE: the complex blocksizes for 4mh are equal to their + // corresponding real domain counterparts. + gemm4mh_mc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MC_S, BLIS_MAXIMUM_MC_S, + BLIS_DEFAULT_MC_D, BLIS_MAXIMUM_MC_D ); + gemm4mh_nc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NC_S, BLIS_MAXIMUM_NC_S, + BLIS_DEFAULT_NC_D, BLIS_MAXIMUM_NC_D ); + gemm4mh_kc + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KC_S, BLIS_MAXIMUM_KC_S, + BLIS_DEFAULT_KC_D, BLIS_MAXIMUM_KC_D ); + gemm4mh_mr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_MR_S, BLIS_PACKDIM_MR_S, + BLIS_DEFAULT_MR_D, BLIS_PACKDIM_MR_D ); + gemm4mh_nr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_NR_S, BLIS_PACKDIM_NR_S, + BLIS_DEFAULT_NR_D, BLIS_PACKDIM_NR_D ); + gemm4mh_kr + = + bli_blksz_obj_create( 0, 0, + 0, 0, + BLIS_DEFAULT_KR_S, BLIS_PACKDIM_KR_S, + BLIS_DEFAULT_KR_D, BLIS_PACKDIM_KR_D ); + + + // Attach the register blksz_t objects as sub-blocksizes to the cache + // blksz_t objects. 
+ bli_blksz_obj_attach_to( gemm4mh_mr, gemm4mh_mc ); + bli_blksz_obj_attach_to( gemm4mh_nr, gemm4mh_nc ); + bli_blksz_obj_attach_to( gemm4mh_kr, gemm4mh_kc ); + + + // Create function pointer object for each datatype-specific gemm + // micro-kernel. + gemm4mh_ukrs + = + bli_func_obj_create( + NULL, FALSE, + NULL, FALSE, + BLIS_CGEMM4MH_UKERNEL, BLIS_CGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_ZGEMM4MH_UKERNEL, BLIS_ZGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS ); + + + // Create control tree objects for packm operations (real only). + gemm4mh_packa_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_mr, + gemm4mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_RO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm4mh_packb_cntl_ro + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_kr, + gemm4mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_RO, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (imag only). + gemm4mh_packa_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_mr, + gemm4mh_kr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_IO, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm4mh_packb_cntl_io + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_kr, + gemm4mh_nr, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_IO, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. 
+ gemm4mh_cntl_bp_ke + = + bli_gemm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm4mh_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // + // Create control tree for A.real * B.real. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (real x real) + gemm4mh_cntl_op_bp_rr + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_ro, + gemm4mh_packb_cntl_ro, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real x real) + gemm4mh_cntl_mm_op_rr + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_rr, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (real x real) + gemm4mh_cntl_vl_mm_rr + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_rr, + NULL ); + + // + // Create control tree for A.real * B.imag. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (real x imag) + gemm4mh_cntl_op_bp_ri + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_ro, + gemm4mh_packb_cntl_io, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (real x imag) + gemm4mh_cntl_mm_op_ri + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_ri, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. 
(real x imag) + gemm4mh_cntl_vl_mm_ri + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_ri, + NULL ); + + // + // Create control tree for A.imag * B.real. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (imag x real) + gemm4mh_cntl_op_bp_ir + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_io, + gemm4mh_packb_cntl_ro, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (imag x real) + gemm4mh_cntl_mm_op_ir + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_ir, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. (imag x real) + gemm4mh_cntl_vl_mm_ir + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_ir, + NULL ); + + // + // Create control tree for A.imag * B.imag. + // + + // Create control tree object for outer panel (to block-panel) + // problem. (imag x imag) + gemm4mh_cntl_op_bp_ii + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4mh_mc, + gemm4mh_ukrs, + NULL, + gemm4mh_packa_cntl_io, + gemm4mh_packb_cntl_io, + NULL, + gemm4mh_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. (imag x imag) + gemm4mh_cntl_mm_op_ii + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4mh_kc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_op_bp_ii, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. 
(imag x imag) + gemm4mh_cntl_vl_mm_ii + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4mh_nc, + gemm4mh_ukrs, + NULL, + NULL, + NULL, + NULL, + gemm4mh_cntl_mm_op_ii, + NULL ); + + + // Alias the "master" gemm control tree to a shorter name. + gemm4mh_cntl_rr = gemm4mh_cntl_vl_mm_rr; + gemm4mh_cntl_ri = gemm4mh_cntl_vl_mm_ri; + gemm4mh_cntl_ir = gemm4mh_cntl_vl_mm_ir; + gemm4mh_cntl_ii = gemm4mh_cntl_vl_mm_ii; + +} + +void bli_gemm4mh_cntl_finalize() +{ /* Frees six blocksize objects, the micro-kernel function object, four packm control nodes and thirteen gemm control-tree nodes. */ + bli_blksz_obj_free( gemm4mh_mc ); + bli_blksz_obj_free( gemm4mh_nc ); + bli_blksz_obj_free( gemm4mh_kc ); + bli_blksz_obj_free( gemm4mh_mr ); + bli_blksz_obj_free( gemm4mh_nr ); + bli_blksz_obj_free( gemm4mh_kr ); + + bli_func_obj_free( gemm4mh_ukrs ); + + bli_cntl_obj_free( gemm4mh_packa_cntl_ro ); + bli_cntl_obj_free( gemm4mh_packb_cntl_ro ); + bli_cntl_obj_free( gemm4mh_packa_cntl_io ); + bli_cntl_obj_free( gemm4mh_packb_cntl_io ); + + bli_cntl_obj_free( gemm4mh_cntl_bp_ke ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_rr ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_rr ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_rr ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_ri ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_ri ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_ri ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_ir ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_ir ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_ir ); + bli_cntl_obj_free( gemm4mh_cntl_op_bp_ii ); + bli_cntl_obj_free( gemm4mh_cntl_mm_op_ii ); + bli_cntl_obj_free( gemm4mh_cntl_vl_mm_ii ); /* gemm4mh_cntl_{rr,ri,ir,ii} only alias the vl_mm nodes freed here, so they get no free of their own */ + +} + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_cntl.h b/frame/3/gemm/4mh/bli_gemm4mh_cntl.h new file mode 100644 index 000000000..2ced05dd9 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm4mh_cntl_init( void ); +void bli_gemm4mh_cntl_finalize( void ); + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_entry.c b/frame/3/gemm/4mh/bli_gemm4mh_entry.c new file mode 100644 index 000000000..e3bf76e13 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_entry.c @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ii; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; + +void bli_gemm4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ /* Runs bli_gemm_front four times on the same operands, once per 4mh control tree (rr, ii, ri, ir). */ + bli_gemm_front( alpha, a, b, beta, c, gemm4mh_cntl_rr ); /* only this first pass receives the caller's beta */ + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); /* this and the next two passes use beta = BLIS_ONE, so they update the c left by the first pass */ + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_gemm_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/gemm/4mh/bli_gemm4mh_entry.h b/frame/3/gemm/4mh/bli_gemm4mh_entry.h new file mode 100644 index 000000000..904b7c7b7 --- /dev/null +++ b/frame/3/gemm/4mh/bli_gemm4mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c new file mode 100644 index 000000000..ee6b9066b --- /dev/null +++ b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.c @@ -0,0 +1,271 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ct[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + ctype_r* restrict a_cast = ( ctype_r* )a; \ +\ + ctype_r* restrict b_cast = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + const pack_t schema_a = bli_auxinfo_schema_a( data ); \ + const pack_t schema_b = bli_auxinfo_schema_b( data ); \ +\ + dim_t n_iter; \ + dim_t n_elem; \ +\ + inc_t incc, ldc; \ + inc_t incct, ldct; \ +\ + dim_t i, j; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 4mh method. + If alpha is not real, then something is very wrong.
*/ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* An optimization: Set local strides and loop bounds based on the + strides of c, so that (a) the micro-kernel accesses ct the same + way it would if it were updating c directly, and (b) c is updated + contiguously. */ \ + if ( bli_is_row_stored( rs_c, cs_c ) ) \ + { \ + rs_ct = n; n_iter = m; incc = cs_c; \ + cs_ct = 1; n_elem = n; ldc = rs_c; \ + } \ + else /* column-stored or general stride */ \ + { \ + rs_ct = 1; n_iter = n; incc = rs_c; \ + cs_ct = m; n_elem = m; ldc = cs_c; \ + } \ + incct = 1; /* ct is read back with unit stride inside each of the n_iter runs of n_elem elements */ \ + ldct = n_elem; \ +\ +\ + /* The following gemm micro-kernel call implements one "phase" of the + 4m method: + + c = beta * c; + c_r += a_r * b_r - a_i * b_i; + c_i += a_r * b_i + a_i * b_r; + + NOTE: Scaling by alpha_r is not shown for space reasons. */ \ +\ +\ + /* ct = alpha_r * a * b; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_cast, \ + b_cast, \ + zero_r, \ + ct, rs_ct, cs_ct, \ + data ); \ +\ +\ + /* How we accumulate the intermediate matrix product stored in ct + depends on (a) the schemas of A and B, and (b) the value of + beta.
*/ \ + if ( bli_is_ro_packed( schema_a ) && \ + bli_is_ro_packed( schema_b ) ) \ + { /* real x real phase: ct goes to c_r, and this is the only branch that applies a general (possibly complex) beta to c */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ + { \ + /* c = beta * c; + c_r = c_r + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ +\ + PASTEMAC(ch,scals)( *beta, *gamma11 ); \ + PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \ + } \ + } \ + else if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + ct; + c_i = c_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \ + } \ + } \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ + { \ + /* c_r = beta_r * c_r + ct; + c_i = beta_r * c_i; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,xpbys)( gamma11t, beta_r, *gamma11_r ); \ + PASTEMAC(chr,scals)( beta_r, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = ct; + c_i = 0; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( gamma11t, *gamma11_r ); \ + PASTEMAC(chr,set0s)( *gamma11_i ); \ + } \ + } \ + } \ + else if ( (
bli_is_ro_packed( schema_a ) && \ + bli_is_io_packed( schema_b ) ) || \ + ( bli_is_io_packed( schema_a ) && \ + bli_is_ro_packed( schema_b ) ) \ + ) \ + { /* real x imag and imag x real phases: ct goes to c_i; beta_i is not consulted and any beta_r other than 1 takes the overwrite path */ \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r + 0; + c_i = c_i + ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,adds)( gamma11t, *gamma11_i ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = 0; + c_i = ct; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,set0s)( *gamma11_r ); \ + PASTEMAC(chr,copys)( gamma11t, *gamma11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_io_packed( schema_a ) && \ + bli_is_io_packed( schema_b ) ) */ \ + { /* imag x imag phase: ct is subtracted from c_r; beta_i is not consulted and any beta_r other than 1 takes the overwrite path */ \ + if ( PASTEMAC(chr,eq1)( beta_r ) ) \ + { \ + /* c_r = c_r - ct; + c_i = c_i + 0; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ +\ + PASTEMAC(chr,subs)( gamma11t, *gamma11_r ); \ + } \ + } \ + else /* if PASTEMAC(chr,eq0)( beta_r ) */ \ + { \ + /* c_r = -ct; + c_i = 0; */ \ + for ( j = 0; j < n_iter; ++j ) \ + for ( i = 0; i < n_elem; ++i ) \ + { \ + const ctype_r gamma11t = *(ct + i*incct + j*ldct); \ + ctype* restrict gamma11 = c + i*incc + j*ldc ; \ + ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \ + ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \ +\ + PASTEMAC(chr,copys)( -gamma11t, *gamma11_r
); \ + PASTEMAC(chr,set0s)( *gamma11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm4mh_ukr_ref, GEMM_UKERNEL ) + diff --git a/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h new file mode 100644 index 000000000..71000ef23 --- /dev/null +++ b/frame/3/gemm/4mh/ukernels/bli_gemm4mh_ukr_ref.h @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( gemm4mh_ukr_ref ) + diff --git a/frame/3/gemm/bli_gemm.c b/frame/3/gemm/bli_gemm.c index 6f6c8f7d1..3939f9ea2 100644 --- a/frame/3/gemm/bli_gemm.c +++ b/frame/3/gemm/bli_gemm.c @@ -43,12 +43,16 @@ void bli_gemm( obj_t* alpha, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_gemm4m_entry( alpha, a, b, beta, c ); - else - bli_gemm_entry( alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_gemm3mh_entry( alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_gemm3m_entry( alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_gemm4mh_entry( alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_gemm4m_entry( alpha, a, b, beta, c ); + else bli_gemm_entry( alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with homogeneous-typed operands.
// diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index a39e6dbab..d66488ce7 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -33,6 +33,7 @@ */ #include "bli_gemm_cntl.h" +#include "bli_gemm_query.h" #include "bli_gemm_check.h" #include "bli_gemm_entry.h" #include "bli_gemm_front.h" @@ -50,6 +51,8 @@ #include "bli_gemm4m.h" #include "bli_gemm3m.h" +#include "bli_gemm4mh.h" +#include "bli_gemm3mh.h" // // Prototype object-based interface. diff --git a/frame/3/gemm/bli_gemm_query.c b/frame/3/gemm/bli_gemm_query.c new file mode 100644 index 000000000..256751c06 --- /dev/null +++ b/frame/3/gemm/bli_gemm_query.c @@ -0,0 +1,86 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern func_t* gemm3mh_ukrs; +extern func_t* gemm3m_ukrs; +extern func_t* gemm4mh_ukrs; +extern func_t* gemm4m_ukrs; +extern func_t* gemm_ukrs; + +func_t* bli_gemm_query_ukrs( num_t dt ) +{ /* Returns the micro-kernel func_t of the first implementation enabled for dt, testing 3mh, 3m, 4mh, 4m in that order; gemm_ukrs if none is enabled. */ + if ( bli_3mh_is_enabled_dt( dt ) ) return gemm3mh_ukrs; + else if ( bli_3m_is_enabled_dt( dt ) ) return gemm3m_ukrs; + else if ( bli_4mh_is_enabled_dt( dt ) ) return gemm4mh_ukrs; + else if ( bli_4m_is_enabled_dt( dt ) ) return gemm4m_ukrs; + else return gemm_ukrs; +} + +char* bli_gemm_query_impl_string( num_t dt ) +{ /* Returns the descriptive string of the implementation chosen by the same 3mh, 3m, 4mh, 4m, native precedence. */ + if ( bli_3mh_is_enabled_dt( dt ) ) return bli_3mh_get_string(); + else if ( bli_3m_is_enabled_dt( dt ) ) return bli_3m_get_string(); + else if ( bli_4mh_is_enabled_dt( dt ) ) return bli_4mh_get_string(); + else if ( bli_4m_is_enabled_dt( dt ) ) return bli_4m_get_string(); + else return bli_native_get_string(); +} + +kimpl_t bli_gemm_ukernel_impl_type( num_t dt ) +{ /* Looks up the micro-kernel address registered for dt and classifies it by comparing it with the known reference and virtual kernels; any other address is reported as optimized. */ + func_t* ukrs = bli_gemm_query_ukrs( dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_SGEMM_UKERNEL_REF || + p == BLIS_DGEMM_UKERNEL_REF || + p == BLIS_CGEMM_UKERNEL_REF || + p == BLIS_ZGEMM_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CGEMM3MH_UKERNEL_REF || + p == BLIS_ZGEMM3MH_UKERNEL_REF || + p == BLIS_CGEMM3M_UKERNEL_REF || + p == BLIS_ZGEMM3M_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CGEMM4MH_UKERNEL_REF || + p == BLIS_ZGEMM4MH_UKERNEL_REF || + p ==
BLIS_CGEMM4M_UKERNEL_REF || + p == BLIS_ZGEMM4M_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} + diff --git a/frame/3/gemm/bli_gemm_query.h b/frame/3/gemm/bli_gemm_query.h new file mode 100644 index 000000000..5466c5938 --- /dev/null +++ b/frame/3/gemm/bli_gemm_query.h @@ -0,0 +1,38 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +func_t* bli_gemm_query_ukrs( num_t dt ); +char* bli_gemm_query_impl_string( num_t dt ); + +kimpl_t bli_gemm_ukernel_impl_type( num_t dt ); diff --git a/frame/3/hemm/3mh/bli_hemm3mh.c b/frame/3/hemm/3mh/bli_hemm3mh.c new file mode 100644 index 000000000..337ab28d9 --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_hemm3mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_hemm3mh_entry( side, alpha, a, b, beta, c ); + else + bli_hemm_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ /* Wraps the raw scalars and matrix buffers in obj_t objects and forwards them to the object-based opname routine. */ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); /* a is described as a square mn_a x mn_a matrix */ \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( hemm3mh, hemm3mh ) + diff --git a/frame/3/hemm/3mh/bli_hemm3mh.h b/frame/3/hemm/3mh/bli_hemm3mh.h new file mode 100644 index 000000000..60168cfb0 --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for
developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_hemm3mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_hemm3mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( hemm3mh ) + diff --git a/frame/3/hemm/3mh/bli_hemm3mh_entry.c b/frame/3/hemm/3mh/bli_hemm3mh_entry.c new file mode 100644 index 000000000..2444e6b3f --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh_entry.c @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_hemm3mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_hemm_front( side, alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/hemm/3mh/bli_hemm3mh_entry.h b/frame/3/hemm/3mh/bli_hemm3mh_entry.h new file mode 100644 index 000000000..08cb026a1 --- /dev/null +++ b/frame/3/hemm/3mh/bli_hemm3mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_hemm3mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/hemm/4mh/bli_hemm4mh.c b/frame/3/hemm/4mh/bli_hemm4mh.c new file mode 100644 index 000000000..4aec7b157 --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_hemm4mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_hemm4mh_entry( side, alpha, a, b, beta, c ); + else + bli_hemm_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( hemm4mh, hemm4mh ) + diff --git a/frame/3/hemm/4mh/bli_hemm4mh.h b/frame/3/hemm/4mh/bli_hemm4mh.h new file mode 100644 index 000000000..e055ee347 --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_hemm4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_hemm4mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( hemm4mh ) + diff --git a/frame/3/hemm/4mh/bli_hemm4mh_entry.c b/frame/3/hemm/4mh/bli_hemm4mh_entry.c new file mode 100644 index 000000000..f9545ca85 --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh_entry.c @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_hemm4mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_hemm_front( side, alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_hemm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/hemm/4mh/bli_hemm4mh_entry.h b/frame/3/hemm/4mh/bli_hemm4mh_entry.h new file mode 100644 index 000000000..0f5ee375f --- /dev/null +++ b/frame/3/hemm/4mh/bli_hemm4mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_hemm4mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/hemm/bli_hemm.c b/frame/3/hemm/bli_hemm.c index 212912fad..c1c62a0b8 100644 --- a/frame/3/hemm/bli_hemm.c +++ b/frame/3/hemm/bli_hemm.c @@ -44,12 +44,16 @@ void bli_hemm( side_t side, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_hemm4m_entry( side, alpha, a, b, beta, c ); - else - bli_hemm_entry( side, alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_hemm3mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_hemm3m_entry( side, alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_hemm4mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_hemm4m_entry( side, alpha, a, b, beta, c ); + else bli_hemm_entry( side, alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with 
homogeneous-typed operands. // diff --git a/frame/3/hemm/bli_hemm.h b/frame/3/hemm/bli_hemm.h index 59f655684..3dba760a9 100644 --- a/frame/3/hemm/bli_hemm.h +++ b/frame/3/hemm/bli_hemm.h @@ -38,6 +38,8 @@ #include "bli_hemm4m.h" #include "bli_hemm3m.h" +#include "bli_hemm4mh.h" +#include "bli_hemm3mh.h" // @@ -50,6 +52,7 @@ void bli_hemm( side_t side, obj_t* beta, obj_t* c ); + // // Prototype BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/her2k/3mh/bli_her2k3mh.c b/frame/3/her2k/3mh/bli_her2k3mh.c new file mode 100644 index 000000000..c4a501c8c --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_her2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_her2k3mh_entry( alpha, a, b, beta, c ); + else + bli_her2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( her2k3mh, her2k3mh ) + diff --git a/frame/3/her2k/3mh/bli_her2k3mh.h b/frame/3/her2k/3mh/bli_her2k3mh.h new file mode 100644 index 000000000..123dc5847 --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_her2k3mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_her2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROTR +#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROTR_BASIC( her2k3mh ) + diff --git a/frame/3/her2k/3mh/bli_her2k3mh_entry.c b/frame/3/her2k/3mh/bli_her2k3mh_entry.c new file mode 100644 index 000000000..e95ad844d --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh_entry.c @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_her2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_her2k_front( alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/her2k/3mh/bli_her2k3mh_entry.h b/frame/3/her2k/3mh/bli_her2k3mh_entry.h new file mode 100644 index 000000000..699705637 --- /dev/null +++ b/frame/3/her2k/3mh/bli_her2k3mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_her2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/her2k/4mh/bli_her2k4mh.c b/frame/3/her2k/4mh/bli_her2k4mh.c new file mode 100644 index 000000000..18e4f33e3 --- /dev/null +++ b/frame/3/her2k/4mh/bli_her2k4mh.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_her2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_her2k4mh_entry( alpha, a, b, beta, c ); + else + bli_her2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( her2k4mh, her2k4mh ) + diff --git a/frame/3/her2k/4mh/bli_her2k4mh.h b/frame/3/her2k/4mh/bli_her2k4mh.h new file mode 100644 index 000000000..fa86a85bd --- /dev/null +++ b/frame/3/her2k/4mh/bli_her2k4mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_her2k4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_her2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+//
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+/* Declaration only; beta is ctype_r* while alpha and the matrices are ctype*. */ \
+void PASTEMAC(ch,opname)( \
+       uplo_t   uploc, \
+       trans_t  transa, \
+       trans_t  transb, \
+       dim_t    m, \
+       dim_t    k, \
+       ctype*   alpha, \
+       ctype*   a, inc_t rs_a, inc_t cs_a, \
+       ctype*   b, inc_t rs_b, inc_t cs_b, \
+       ctype_r* beta, \
+       ctype*   c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROTR_BASIC( her2k4mh )
+
diff --git a/frame/3/her2k/4mh/bli_her2k4mh_entry.c b/frame/3/her2k/4mh/bli_her2k4mh_entry.c
new file mode 100644
index 000000000..a122c4c6f
--- /dev/null
+++ b/frame/3/her2k/4mh/bli_her2k4mh_entry.c
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+extern gemm_t* gemm4mh_cntl_rr; // the four gemm4mh_cntl_* pointers are defined outside this file
+extern gemm_t* gemm4mh_cntl_ri;
+extern gemm_t* gemm4mh_cntl_ir;
+extern gemm_t* gemm4mh_cntl_ii;
+// Calls bli_her2k_front() once per gemm4mh_cntl_* pointer, always with the same alpha, a, b and c.
+void bli_her2k4mh_entry( obj_t*  alpha,
+                         obj_t*  a,
+                         obj_t*  b,
+                         obj_t*  beta,
+                         obj_t*  c )
+{
+	bli_her2k_front( alpha, a, b, beta, c, gemm4mh_cntl_rr ); // only this call receives the caller's beta
+	bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); // later calls pass &BLIS_ONE in beta's place
+	bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); // (presumably so they accumulate into c)
+	bli_her2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir );
+}
+
diff --git a/frame/3/her2k/4mh/bli_her2k4mh_entry.h b/frame/3/her2k/4mh/bli_her2k4mh_entry.h
new file mode 100644
index 000000000..706150811
--- /dev/null
+++ b/frame/3/her2k/4mh/bli_her2k4mh_entry.h
@@ -0,0 +1,40 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_her2k4mh_entry( obj_t*  alpha,
+                         obj_t*  a,
+                         obj_t*  b,
+                         obj_t*  beta,
+                         obj_t*  c );
+
diff --git a/frame/3/her2k/bli_her2k.c b/frame/3/her2k/bli_her2k.c
index 33f685495..74e1613df 100644
--- a/frame/3/her2k/bli_her2k.c
+++ b/frame/3/her2k/bli_her2k.c
@@ -43,12 +43,16 @@ void bli_her2k( obj_t* alpha,
                 obj_t* beta,
                 obj_t* c )
 {
-	if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) )
-		bli_her2k4m_entry( alpha, a, b, beta, c );
-	else
-		bli_her2k_entry( alpha, a, b, beta, c );
+	num_t dt = bli_obj_datatype( *c );
+	// Checked in order 3mh, 3m, 4mh, 4m; bli_her2k_entry() runs only if none is enabled for dt.
+	if ( bli_3mh_is_enabled_dt( dt ) ) bli_her2k3mh_entry( alpha, a, b, beta, c );
+	else if ( bli_3m_is_enabled_dt( dt ) ) bli_her2k3m_entry( alpha, a, b, beta, c );
+	else if ( bli_4mh_is_enabled_dt( dt ) ) bli_her2k4mh_entry( alpha, a, b, beta, c );
+	else if ( bli_4m_is_enabled_dt( dt ) ) bli_her2k4m_entry( alpha, a, b, beta, c );
+	else bli_her2k_entry( alpha, a, b, beta, c );
 }
+
 
 //
 // Define BLAS-like interfaces with homogeneous-typed operands.
// diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h index 2f41d7e48..a1ffb8e51 100644 --- a/frame/3/her2k/bli_her2k.h +++ b/frame/3/her2k/bli_her2k.h @@ -52,6 +52,8 @@ #include "bli_her2k4m.h" #include "bli_her2k3m.h" +#include "bli_her2k4mh.h" +#include "bli_her2k3mh.h" // diff --git a/frame/3/herk/3mh/bli_herk3mh.c b/frame/3/herk/3mh/bli_herk3mh.c new file mode 100644 index 000000000..b3b7ae2db --- /dev/null +++ b/frame/3/herk/3mh/bli_herk3mh.c @@ -0,0 +1,97 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Define object-based interface; whether c is complex alone picks the path.
+//
+void bli_herk3mh( obj_t*  alpha,
+                  obj_t*  a,
+                  obj_t*  beta,
+                  obj_t*  c )
+{
+	// Since 3mh only applies to the complex domain, we use the regular
+	// implementation for real domain cases.
+	if ( bli_obj_is_complex( *c ) )
+		bli_herk3mh_entry( alpha, a, beta, c );
+	else
+		bli_herk_entry( alpha, a, beta, c );
+}
+
+//
+// Define BLAS-like interfaces with homogeneous-typed operands.
+// Each instance wraps the caller's raw buffers in obj_t's and calls PASTEMAC0(opname).
+#undef GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
+/* Typed herk3mh wrapper; varname is accepted but never referenced in this body. */ \
+void PASTEMAC(ch,opname)( \
+       uplo_t   uploc, \
+       trans_t  transa, \
+       dim_t    m, \
+       dim_t    k, \
+       ctype_r* alpha, \
+       ctype*   a, inc_t rs_a, inc_t cs_a, \
+       ctype_r* beta, \
+       ctype*   c, inc_t rs_c, inc_t cs_c \
+     ) \
+{ \
+	const num_t dt_r = PASTEMAC(chr,type); \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	obj_t alphao, ao, betao, co; \
+\
+	dim_t m_a, n_a; \
+\
+	bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
+/* Both scalars are ctype_r* and are wrapped as 1x1 objects of type dt_r. */ \
+	bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \
+/* a is attached as m_a x n_a; c is attached as m x m. */ \
+	bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
+	bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
+\
+	bli_obj_set_uplo( uploc, co ); \
+	bli_obj_set_conjtrans( transa, ao ); \
+/* c is flagged BLIS_HERMITIAN before the object-based call below. */ \
+	bli_obj_set_struc( BLIS_HERMITIAN, co ); \
+\
+	PASTEMAC0(opname)( &alphao, \
+	                   &ao, \
+	                   &betao, \
+	                   &co ); \
+}
+
+INSERT_GENTFUNCR_BASIC( herk3mh, herk3mh ) + diff --git a/frame/3/herk/3mh/bli_herk3mh.h b/frame/3/herk/3mh/bli_herk3mh.h new file mode 100644 index 000000000..b748db2c0 --- /dev/null +++ b/frame/3/herk/3mh/bli_herk3mh.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_herk3mh_entry.h" + + +// +// Prototype object-based interface. 
+//
+void bli_herk3mh( obj_t*  alpha,
+                  obj_t*  a,
+                  obj_t*  beta,
+                  obj_t*  c );
+
+
+//
+// Prototype BLAS-like interfaces with homogeneous-typed operands.
+//
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+/* Declaration only; alpha and beta are both ctype_r*, a and c are ctype*. */ \
+void PASTEMAC(ch,opname)( \
+       uplo_t   uploc, \
+       trans_t  transa, \
+       dim_t    m, \
+       dim_t    k, \
+       ctype_r* alpha, \
+       ctype*   a, inc_t rs_a, inc_t cs_a, \
+       ctype_r* beta, \
+       ctype*   c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROTR_BASIC( herk3mh )
+
diff --git a/frame/3/herk/3mh/bli_herk3mh_entry.c b/frame/3/herk/3mh/bli_herk3mh_entry.c
new file mode 100644
index 000000000..5e8be3635
--- /dev/null
+++ b/frame/3/herk/3mh/bli_herk3mh_entry.c
@@ -0,0 +1,50 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+extern gemm_t* gemm3mh_cntl_ro; // the three gemm3mh_cntl_* pointers are defined outside this file
+extern gemm_t* gemm3mh_cntl_io;
+extern gemm_t* gemm3mh_cntl_rpi;
+// Calls bli_herk_front() once per gemm3mh_cntl_* pointer, always with the same alpha, a and c.
+void bli_herk3mh_entry( obj_t*  alpha,
+                        obj_t*  a,
+                        obj_t*  beta,
+                        obj_t*  c )
+{
+	bli_herk_front( alpha, a, beta, c, gemm3mh_cntl_ro ); // only this call receives the caller's beta
+	bli_herk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_io ); // later calls pass &BLIS_ONE in beta's place
+	bli_herk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_rpi ); // (presumably so they accumulate into c)
+}
+
diff --git a/frame/3/herk/3mh/bli_herk3mh_entry.h b/frame/3/herk/3mh/bli_herk3mh_entry.h
new file mode 100644
index 000000000..7ad1e67b4
--- /dev/null
+++ b/frame/3/herk/3mh/bli_herk3mh_entry.h
@@ -0,0 +1,39 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_herk3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/herk/4mh/bli_herk4mh.c b/frame/3/herk/4mh/bli_herk4mh.c new file mode 100644 index 000000000..7288881cd --- /dev/null +++ b/frame/3/herk/4mh/bli_herk4mh.c @@ -0,0 +1,97 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Define object-based interface; whether c is complex alone picks the path.
+//
+void bli_herk4mh( obj_t*  alpha,
+                  obj_t*  a,
+                  obj_t*  beta,
+                  obj_t*  c )
+{
+	// Since 4mh only applies to the complex domain, we use the regular
+	// implementation for real domain cases.
+	if ( bli_obj_is_complex( *c ) )
+		bli_herk4mh_entry( alpha, a, beta, c );
+	else
+		bli_herk_entry( alpha, a, beta, c );
+}
+
+//
+// Define BLAS-like interfaces with homogeneous-typed operands.
+// Each instance wraps the caller's raw buffers in obj_t's and calls PASTEMAC0(opname).
+#undef GENTFUNCR
+#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
+/* Typed herk4mh wrapper; varname is accepted but never referenced in this body. */ \
+void PASTEMAC(ch,opname)( \
+       uplo_t   uploc, \
+       trans_t  transa, \
+       dim_t    m, \
+       dim_t    k, \
+       ctype_r* alpha, \
+       ctype*   a, inc_t rs_a, inc_t cs_a, \
+       ctype_r* beta, \
+       ctype*   c, inc_t rs_c, inc_t cs_c \
+     ) \
+{ \
+	const num_t dt_r = PASTEMAC(chr,type); \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	obj_t alphao, ao, betao, co; \
+\
+	dim_t m_a, n_a; \
+\
+	bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
+/* Both scalars are ctype_r* and are wrapped as 1x1 objects of type dt_r. */ \
+	bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \
+/* a is attached as m_a x n_a; c is attached as m x m. */ \
+	bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
+	bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
+\
+	bli_obj_set_uplo( uploc, co ); \
+	bli_obj_set_conjtrans( transa, ao ); \
+/* c is flagged BLIS_HERMITIAN before the object-based call below. */ \
+	bli_obj_set_struc( BLIS_HERMITIAN, co ); \
+\
+	PASTEMAC0(opname)( &alphao, \
+	                   &ao, \
+	                   &betao, \
+	                   &co ); \
+}
+
+INSERT_GENTFUNCR_BASIC( herk4mh, herk4mh )
+
diff --git a/frame/3/herk/4mh/bli_herk4mh.h b/frame/3/herk/4mh/bli_herk4mh.h
new file mode 100644
index 000000000..aeff510dc
--- /dev/null
+++ b/frame/3/herk/4mh/bli_herk4mh.h
@@ -0,0 +1,65 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "bli_herk4mh_entry.h"
+
+
+//
+// Prototype object-based interface.
+//
+void bli_herk4mh( obj_t*  alpha,
+                  obj_t*  a,
+                  obj_t*  beta,
+                  obj_t*  c );
+
+
+//
+// Prototype BLAS-like interfaces with homogeneous-typed operands.
+//
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+/* Declaration only; alpha and beta are both ctype_r*, a and c are ctype*. */ \
+void PASTEMAC(ch,opname)( \
+       uplo_t   uploc, \
+       trans_t  transa, \
+       dim_t    m, \
+       dim_t    k, \
+       ctype_r* alpha, \
+       ctype*   a, inc_t rs_a, inc_t cs_a, \
+       ctype_r* beta, \
+       ctype*   c, inc_t rs_c, inc_t cs_c \
+     );
+
+INSERT_GENTPROTR_BASIC( herk4mh )
+
diff --git a/frame/3/herk/4mh/bli_herk4mh_entry.c b/frame/3/herk/4mh/bli_herk4mh_entry.c
new file mode 100644
index 000000000..19009f715
--- /dev/null
+++ b/frame/3/herk/4mh/bli_herk4mh_entry.c
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+#include "blis.h"
+
+extern gemm_t* gemm4mh_cntl_rr; // the four gemm4mh_cntl_* pointers are defined outside this file
+extern gemm_t* gemm4mh_cntl_ri;
+extern gemm_t* gemm4mh_cntl_ir;
+extern gemm_t* gemm4mh_cntl_ii;
+// Calls bli_herk_front() once per gemm4mh_cntl_* pointer, always with the same alpha, a and c.
+void bli_herk4mh_entry( obj_t*  alpha,
+                        obj_t*  a,
+                        obj_t*  beta,
+                        obj_t*  c )
+{
+	bli_herk_front( alpha, a, beta, c, gemm4mh_cntl_rr ); // only this call receives the caller's beta
+	bli_herk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ii ); // later calls pass &BLIS_ONE in beta's place
+	bli_herk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ri ); // (presumably so they accumulate into c)
+	bli_herk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ir );
+}
+
diff --git a/frame/3/herk/4mh/bli_herk4mh_entry.h b/frame/3/herk/4mh/bli_herk4mh_entry.h
new file mode 100644
index 000000000..a75a501eb
--- /dev/null
+++ b/frame/3/herk/4mh/bli_herk4mh_entry.h
@@ -0,0 +1,39 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+void bli_herk4mh_entry( obj_t*  alpha,
+                        obj_t*  a,
+                        obj_t*  beta,
+                        obj_t*  c );
+
diff --git a/frame/3/herk/bli_herk.c b/frame/3/herk/bli_herk.c
index 9cd5c93cb..a56ff3971 100644
--- a/frame/3/herk/bli_herk.c
+++ b/frame/3/herk/bli_herk.c
@@ -42,12 +42,16 @@ void bli_herk( obj_t* alpha,
                obj_t* beta,
                obj_t* c )
 {
-	if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) )
-		bli_herk4m_entry( alpha, a, beta, c );
-	else
-		bli_herk_entry( alpha, a, beta, c );
+	num_t dt = bli_obj_datatype( *c );
+	// Checked in order 3mh, 3m, 4mh, 4m; bli_herk_entry() runs only if none is enabled for dt.
+	if ( bli_3mh_is_enabled_dt( dt ) ) bli_herk3mh_entry( alpha, a, beta, c );
+	else if ( bli_3m_is_enabled_dt( dt ) ) bli_herk3m_entry( alpha, a, beta, c );
+	else if ( bli_4mh_is_enabled_dt( dt ) ) bli_herk4mh_entry( alpha, a, beta, c );
+	else if ( bli_4m_is_enabled_dt( dt ) ) bli_herk4m_entry( alpha, a, beta, c );
+	else bli_herk_entry( alpha, a, beta, c );
 }
+
 
 //
 // Define BLAS-like interfaces with homogeneous-typed operands.
// diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h index 4a81b2b9e..8c779fe91 100644 --- a/frame/3/herk/bli_herk.h +++ b/frame/3/herk/bli_herk.h @@ -48,6 +48,8 @@ #include "bli_herk4m.h" #include "bli_herk3m.h" +#include "bli_herk4mh.h" +#include "bli_herk3mh.h" // diff --git a/frame/3/symm/3mh/bli_symm3mh.c b/frame/3/symm/3mh/bli_symm3mh.c new file mode 100644 index 000000000..c79f367c0 --- /dev/null +++ b/frame/3/symm/3mh/bli_symm3mh.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+//
+// Define object-based interface; whether c is complex alone picks the path.
+//
+void bli_symm3mh( side_t  side,
+                  obj_t*  alpha,
+                  obj_t*  a,
+                  obj_t*  b,
+                  obj_t*  beta,
+                  obj_t*  c )
+{
+	// Since 3mh only applies to the complex domain, we use the regular
+	// implementation for real domain cases.
+	if ( bli_obj_is_complex( *c ) )
+		bli_symm3mh_entry( side, alpha, a, b, beta, c );
+	else
+		bli_symm_entry( side, alpha, a, b, beta, c );
+}
+
+//
+// Define BLAS-like interfaces with homogeneous-typed operands.
+// Each instance wraps the caller's raw buffers in obj_t's and calls PASTEMAC0(opname).
+#undef GENTFUNC
+#define GENTFUNC( ctype, ch, opname, varname ) \
+/* Typed symm3mh wrapper; varname is accepted but never referenced in this body. */ \
+void PASTEMAC(ch,opname)( \
+       side_t  side, \
+       uplo_t  uploa, \
+       conj_t  conja, \
+       trans_t transb, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  alpha, \
+       ctype*  a, inc_t rs_a, inc_t cs_a, \
+       ctype*  b, inc_t rs_b, inc_t cs_b, \
+       ctype*  beta, \
+       ctype*  c, inc_t rs_c, inc_t cs_c \
+     ) \
+{ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	obj_t alphao, ao, bo, betao, co; \
+\
+	dim_t mn_a; \
+	dim_t m_b, n_b; \
+/* mn_a comes from (side, m, n); m_b and n_b come from (transb, m, n). */ \
+	bli_set_dim_with_side( side, m, n, mn_a ); \
+	bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
+/* alpha and beta are both wrapped as 1x1 objects of type dt. */ \
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
+/* a is attached as mn_a x mn_a, b as m_b x n_b, and c as m x n. */ \
+	bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
+	bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
+	bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
+\
+	bli_obj_set_uplo( uploa, ao ); \
+	bli_obj_set_conj( conja, ao ); \
+	bli_obj_set_conjtrans( transb, bo ); \
+/* a (not c) is the operand flagged BLIS_SYMMETRIC. */ \
+	bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \
+\
+	PASTEMAC0(opname)( side, \
+	                   &alphao, \
+	                   &ao, \
+	                   &bo, \
+	                   &betao, \
+	                   &co ); \
+}
+
+INSERT_GENTFUNC_BASIC( symm3mh, symm3mh )
+
diff --git a/frame/3/symm/3mh/bli_symm3mh.h b/frame/3/symm/3mh/bli_symm3mh.h
new file mode 100644
index 000000000..d353a8651
--- /dev/null
+++ b/frame/3/symm/3mh/bli_symm3mh.h
@@ -0,0 +1,69 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_symm3mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_symm3mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( symm3mh ) + diff --git a/frame/3/symm/3mh/bli_symm3mh_entry.c b/frame/3/symm/3mh/bli_symm3mh_entry.c new file mode 100644 index 000000000..1277cbf8f --- /dev/null +++ b/frame/3/symm/3mh/bli_symm3mh_entry.c @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_symm3mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_symm_front( side, alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/symm/3mh/bli_symm3mh_entry.h b/frame/3/symm/3mh/bli_symm3mh_entry.h new file mode 100644 index 000000000..1030c3e9f --- /dev/null +++ b/frame/3/symm/3mh/bli_symm3mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_symm3mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/symm/4mh/bli_symm4mh.c b/frame/3/symm/4mh/bli_symm4mh.c new file mode 100644 index 000000000..ac62aeadc --- /dev/null +++ b/frame/3/symm/4mh/bli_symm4mh.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_symm4mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_symm4mh_entry( side, alpha, a, b, beta, c ); + else + bli_symm_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( symm4mh, symm4mh ) + diff --git a/frame/3/symm/4mh/bli_symm4mh.h b/frame/3/symm/4mh/bli_symm4mh.h new file mode 100644 index 000000000..e52ba4079 --- /dev/null +++ b/frame/3/symm/4mh/bli_symm4mh.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_symm4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_symm4mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( symm4mh ) + diff --git a/frame/3/symm/4mh/bli_symm4mh_entry.c b/frame/3/symm/4mh/bli_symm4mh_entry.c new file mode 100644 index 000000000..e2322e1d0 --- /dev/null +++ b/frame/3/symm/4mh/bli_symm4mh_entry.c @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_symm4mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_symm_front( side, alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_symm_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/symm/4mh/bli_symm4mh_entry.h b/frame/3/symm/4mh/bli_symm4mh_entry.h new file mode 100644 index 000000000..05a416627 --- /dev/null +++ b/frame/3/symm/4mh/bli_symm4mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_symm4mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/symm/bli_symm.c b/frame/3/symm/bli_symm.c index 459b69cc3..d01232f65 100644 --- a/frame/3/symm/bli_symm.c +++ b/frame/3/symm/bli_symm.c @@ -44,12 +44,16 @@ void bli_symm( side_t side, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_symm4m_entry( side, alpha, a, b, beta, c ); - else - bli_symm_entry( side, alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_symm3mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_symm3m_entry( side, alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_symm4mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_symm4m_entry( side, alpha, a, b, beta, c ); + else bli_symm_entry( side, alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with 
homogeneous-typed operands. // diff --git a/frame/3/symm/bli_symm.h b/frame/3/symm/bli_symm.h index 85b87b0b0..f9bddfa2f 100644 --- a/frame/3/symm/bli_symm.h +++ b/frame/3/symm/bli_symm.h @@ -38,6 +38,8 @@ #include "bli_symm4m.h" #include "bli_symm3m.h" +#include "bli_symm4mh.h" +#include "bli_symm3mh.h" // @@ -50,6 +52,7 @@ void bli_symm( side_t side, obj_t* beta, obj_t* c ); + // // Prototype BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh.c b/frame/3/syr2k/3mh/bli_syr2k3mh.c new file mode 100644 index 000000000..0fa00e953 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh.c @@ -0,0 +1,104 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syr2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syr2k3mh_entry( alpha, a, b, beta, c ); + else + bli_syr2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syr2k3mh, syr2k3mh ) + diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh.h b/frame/3/syr2k/3mh/bli_syr2k3mh.h new file mode 100644 index 000000000..e14739094 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syr2k3mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_syr2k3mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syr2k3mh ) + diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh_entry.c b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.c new file mode 100644 index 000000000..74d5e24a0 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.c @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_syr2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_syr2k_front( alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/syr2k/3mh/bli_syr2k3mh_entry.h b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.h new file mode 100644 index 000000000..55f828542 --- /dev/null +++ b/frame/3/syr2k/3mh/bli_syr2k3mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syr2k3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh.c b/frame/3/syr2k/4mh/bli_syr2k4mh.c new file mode 100644 index 000000000..7fc5410d6 --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh.c @@ -0,0 +1,104 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syr2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syr2k4mh_entry( alpha, a, b, beta, c ); + else + bli_syr2k_entry( alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syr2k4mh, syr2k4mh ) + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh.h b/frame/3/syr2k/4mh/bli_syr2k4mh.h new file mode 100644 index 000000000..2b0cfa4cb --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syr2k4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_syr2k4mh( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syr2k4mh ) + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh_entry.c b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.c new file mode 100644 index 000000000..58218174f --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.c @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_syr2k4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_syr2k_front( alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_syr2k_front( alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/syr2k/4mh/bli_syr2k4mh_entry.h b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.h new file mode 100644 index 000000000..6e0e4cc9b --- /dev/null +++ b/frame/3/syr2k/4mh/bli_syr2k4mh_entry.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syr2k4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syr2k/bli_syr2k.c b/frame/3/syr2k/bli_syr2k.c index d56a98a5c..9fbc9d7a7 100644 --- a/frame/3/syr2k/bli_syr2k.c +++ b/frame/3/syr2k/bli_syr2k.c @@ -43,12 +43,16 @@ void bli_syr2k( obj_t* alpha, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_syr2k4m_entry( alpha, a, b, beta, c ); - else - bli_syr2k_entry( alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_syr2k3mh_entry( alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_syr2k3m_entry( alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_syr2k4mh_entry( alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_syr2k4m_entry( alpha, a, b, beta, c ); + else bli_syr2k_entry( alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. 
// diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h index 313164041..99c16dce7 100644 --- a/frame/3/syr2k/bli_syr2k.h +++ b/frame/3/syr2k/bli_syr2k.h @@ -38,6 +38,8 @@ #include "bli_syr2k4m.h" #include "bli_syr2k3m.h" +#include "bli_syr2k4mh.h" +#include "bli_syr2k3mh.h" // diff --git a/frame/3/syrk/3mh/bli_syrk3mh.c b/frame/3/syrk/3mh/bli_syrk3mh.c new file mode 100644 index 000000000..2822a732a --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh.c @@ -0,0 +1,96 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syrk3mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syrk3mh_entry( alpha, a, beta, c ); + else + bli_syrk_entry( alpha, a, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syrk3mh, syrk3mh ) + diff --git 
a/frame/3/syrk/3mh/bli_syrk3mh.h b/frame/3/syrk/3mh/bli_syrk3mh.h new file mode 100644 index 000000000..c25d02926 --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syrk3mh_entry.h" + + +// +// Prototype object-based interface. 
+// +void bli_syrk3mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syrk3mh ) + diff --git a/frame/3/syrk/3mh/bli_syrk3mh_entry.c b/frame/3/syrk/3mh/bli_syrk3mh_entry.c new file mode 100644 index 000000000..8f1e46143 --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh_entry.c @@ -0,0 +1,50 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; + +void bli_syrk3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + bli_syrk_front( alpha, a, beta, c, gemm3mh_cntl_ro ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + diff --git a/frame/3/syrk/3mh/bli_syrk3mh_entry.h b/frame/3/syrk/3mh/bli_syrk3mh_entry.h new file mode 100644 index 000000000..f6b3c5e4d --- /dev/null +++ b/frame/3/syrk/3mh/bli_syrk3mh_entry.h @@ -0,0 +1,39 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syrk3mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syrk/4mh/bli_syrk4mh.c b/frame/3/syrk/4mh/bli_syrk4mh.c new file mode 100644 index 000000000..8ff0cbc39 --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh.c @@ -0,0 +1,96 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// +void bli_syrk4mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // implementation for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_syrk4mh_entry( alpha, a, beta, c ); + else + bli_syrk_entry( alpha, a, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syrk4mh, syrk4mh ) + diff --git a/frame/3/syrk/4mh/bli_syrk4mh.h b/frame/3/syrk/4mh/bli_syrk4mh.h new file mode 100644 index 000000000..9474d350b --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_syrk4mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_syrk4mh( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syrk4mh ) + diff --git a/frame/3/syrk/4mh/bli_syrk4mh_entry.c b/frame/3/syrk/4mh/bli_syrk4mh_entry.c new file mode 100644 index 000000000..d7942604f --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh_entry.c @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; + +void bli_syrk4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + bli_syrk_front( alpha, a, beta, c, gemm4mh_cntl_rr ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_syrk_front( alpha, a, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + diff --git a/frame/3/syrk/4mh/bli_syrk4mh_entry.h b/frame/3/syrk/4mh/bli_syrk4mh_entry.h new file mode 100644 index 000000000..1760db027 --- /dev/null +++ b/frame/3/syrk/4mh/bli_syrk4mh_entry.h @@ -0,0 +1,39 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_syrk4mh_entry( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/syrk/bli_syrk.c b/frame/3/syrk/bli_syrk.c index 2a09b430f..69dde1fcb 100644 --- a/frame/3/syrk/bli_syrk.c +++ b/frame/3/syrk/bli_syrk.c @@ -42,12 +42,16 @@ void bli_syrk( obj_t* alpha, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_syrk4m_entry( alpha, a, beta, c ); - else - bli_syrk_entry( alpha, a, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_syrk3mh_entry( alpha, a, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_syrk3m_entry( alpha, a, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_syrk4mh_entry( alpha, a, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_syrk4m_entry( alpha, a, beta, c ); + else bli_syrk_entry( alpha, a, beta, c ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. 
// diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h index cba072d98..268b21b30 100644 --- a/frame/3/syrk/bli_syrk.h +++ b/frame/3/syrk/bli_syrk.h @@ -38,6 +38,8 @@ #include "bli_syrk4m.h" #include "bli_syrk3m.h" +#include "bli_syrk4mh.h" +#include "bli_syrk3mh.h" // diff --git a/frame/3/trmm/bli_trmm.c b/frame/3/trmm/bli_trmm.c index e178de8e6..7037876bb 100644 --- a/frame/3/trmm/bli_trmm.c +++ b/frame/3/trmm/bli_trmm.c @@ -42,12 +42,14 @@ void bli_trmm( side_t side, obj_t* a, obj_t* b ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *b ) ) ) - bli_trmm4m_entry( side, alpha, a, b ); - else - bli_trmm_entry( side, alpha, a, b ); + num_t dt = bli_obj_datatype( *b ); + + if ( bli_3m_is_enabled_dt( dt ) ) bli_trmm3m_entry( side, alpha, a, b ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_trmm4m_entry( side, alpha, a, b ); + else bli_trmm_entry( side, alpha, a, b ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/trmm/bli_trmm.h b/frame/3/trmm/bli_trmm.h index 886824aa1..320b1d9eb 100644 --- a/frame/3/trmm/bli_trmm.h +++ b/frame/3/trmm/bli_trmm.h @@ -32,6 +32,7 @@ */ +#include "bli_trmm_query.h" #include "bli_trmm_check.h" #include "bli_trmm_entry.h" #include "bli_trmm_front.h" diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 1b6ce80cb..be85ea889 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_ll_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. 
if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 3ae763214..83bd70b6d 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_lu_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm/bli_trmm_query.c b/frame/3/trmm/bli_trmm_query.c new file mode 100644 index 000000000..74b163c64 --- /dev/null +++ b/frame/3/trmm/bli_trmm_query.c @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern func_t* gemm3m_ukrs; +extern func_t* gemm4m_ukrs; +extern func_t* gemm_ukrs; + +func_t* bli_trmm_query_ukrs( num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) return gemm3m_ukrs; + else if ( bli_4m_is_enabled_dt( dt ) ) return gemm4m_ukrs; + else return gemm_ukrs; +} + +char* bli_trmm_query_impl_string( num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) return bli_3m_get_string(); + else if ( bli_4m_is_enabled_dt( dt ) ) return bli_4m_get_string(); + else return bli_native_get_string(); +} + + diff --git a/frame/3/trmm/bli_trmm_query.h b/frame/3/trmm/bli_trmm_query.h new file mode 100644 index 000000000..27e468b31 --- /dev/null +++ b/frame/3/trmm/bli_trmm_query.h @@ -0,0 +1,36 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +func_t* bli_trmm_query_ukrs( num_t dt ); +char* bli_trmm_query_impl_string( num_t dt ); diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 8c1760649..23ebb15d9 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_rl_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index f7894e584..ee3d4344d 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -120,7 +120,8 @@ void bli_trmm_ru_ker_var2( obj_t* a, // given the datatypes actually stored (float or double), we must // halve the strides to compensate. if ( bli_obj_is_4m_packed( *a ) || - bli_obj_is_3m_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } + bli_obj_is_3m_packed( *a ) || + bli_obj_is_rih_packed( *a ) ) { cs_a /= 2; rs_b /= 2; } // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the diff --git a/frame/3/trmm3/3mh/bli_trmm33mh.c b/frame/3/trmm3/3mh/bli_trmm33mh.c new file mode 100644 index 000000000..8c764a857 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh.c @@ -0,0 +1,109 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// The domain of c alone picks the path: complex c goes to bli_trmm33mh_entry(), anything else to the native bli_trmm3_entry(). +void bli_trmm33mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 3mh only applies to the complex domain, we use the regular + // control tree for real domain cases.
+ if ( bli_obj_is_complex( *c ) ) + bli_trmm33mh_entry( side, alpha, a, b, beta, c ); + else + bli_trmm3_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// Each generated function attaches the caller's buffers to obj_t objects (a is created square, mn_a x mn_a), sets uplo, diag, conjtrans and BLIS_TRIANGULAR structure on a and conjtrans on b, then calls the object-based routine named by opname. +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( trmm33mh, trmm33mh ) + + diff --git a/frame/3/trmm3/3mh/bli_trmm33mh.h b/frame/3/trmm3/3mh/bli_trmm33mh.h new file mode 100644 index 000000000..ba2523d34 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trmm33mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_trmm33mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( trmm33mh ) + diff --git a/frame/3/trmm3/3mh/bli_trmm33mh_entry.c b/frame/3/trmm3/3mh/bli_trmm33mh_entry.c new file mode 100644 index 000000000..4dd92d9c2 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh_entry.c @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +/* 3mh control trees, declared extern here. NOTE(review): the ro/io/rpi suffixes presumably stand for real-only, imaginary-only, and real-plus-imaginary -- confirm where these are defined. */ +extern gemm_t* gemm3mh_cntl_ro; +extern gemm_t* gemm3mh_cntl_io; +extern gemm_t* gemm3mh_cntl_rpi; +/* Run trmm3 via three bli_trmm3_front() calls on the same operands, one per control tree above. Only the first call is given the caller's beta; the remaining calls are given BLIS_ONE in its place. */ +void bli_trmm33mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_trmm3_front( side, alpha, a, b, beta, c, gemm3mh_cntl_ro ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_io ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm3mh_cntl_rpi ); +} + + diff --git a/frame/3/trmm3/3mh/bli_trmm33mh_entry.h b/frame/3/trmm3/3mh/bli_trmm33mh_entry.h new file mode 100644 index 000000000..0e8934336 --- /dev/null +++ b/frame/3/trmm3/3mh/bli_trmm33mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trmm33mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh.c b/frame/3/trmm3/4mh/bli_trmm34mh.c new file mode 100644 index 000000000..bcf256429 --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh.c @@ -0,0 +1,109 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define object-based interface. +// The domain of c alone picks the path: complex c goes to bli_trmm34mh_entry(), anything else to the native bli_trmm3_entry(). +void bli_trmm34mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + // Since 4mh only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) + bli_trmm34mh_entry( side, alpha, a, b, beta, c ); + else + bli_trmm3_entry( side, alpha, a, b, beta, c ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands.
+// Each generated function attaches the caller's buffers to obj_t objects (a is created square, mn_a x mn_a), sets uplo, diag, conjtrans and BLIS_TRIANGULAR structure on a and conjtrans on b, then calls the object-based routine named by opname. +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( trmm34mh, trmm34mh ) + + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh.h b/frame/3/trmm3/4mh/bli_trmm34mh.h new file mode 100644 index 000000000..0e3e3f43b --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh.h @@ -0,0 +1,71 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trmm34mh_entry.h" + + +// +// Prototype object-based interface. +// +void bli_trmm34mh( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( trmm34mh ) + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh_entry.c b/frame/3/trmm3/4mh/bli_trmm34mh_entry.c new file mode 100644 index 000000000..63548c2ad --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh_entry.c @@ -0,0 +1,54 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +/* 4mh control trees, declared extern here. NOTE(review): the rr/ri/ir/ii suffixes presumably name the real/imaginary pairing of a and b -- confirm where these are defined. */ +extern gemm_t* gemm4mh_cntl_rr; +extern gemm_t* gemm4mh_cntl_ri; +extern gemm_t* gemm4mh_cntl_ir; +extern gemm_t* gemm4mh_cntl_ii; +/* Run trmm3 via four bli_trmm3_front() calls on the same operands, one per control tree above, in the order rr, ii, ri, ir (not the declaration order). Only the first call is given the caller's beta; the remaining calls are given BLIS_ONE in its place. */ +void bli_trmm34mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_trmm3_front( side, alpha, a, b, beta, c, gemm4mh_cntl_rr ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ii ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ri ); + bli_trmm3_front( side, alpha, a, b, &BLIS_ONE, c, gemm4mh_cntl_ir ); +} + + diff --git a/frame/3/trmm3/4mh/bli_trmm34mh_entry.h b/frame/3/trmm3/4mh/bli_trmm34mh_entry.h new file mode 100644 index 000000000..818e1fdf8 --- /dev/null +++ b/frame/3/trmm3/4mh/bli_trmm34mh_entry.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trmm34mh_entry( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + diff --git a/frame/3/trmm3/bli_trmm3.c b/frame/3/trmm3/bli_trmm3.c index 37b588662..510908ff5 100644 --- a/frame/3/trmm3/bli_trmm3.c +++ b/frame/3/trmm3/bli_trmm3.c @@ -44,12 +44,16 @@ void bli_trmm3( side_t side, obj_t* beta, obj_t* c ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *c ) ) ) - bli_trmm34m_entry( side, alpha, a, b, beta, c ); - else - bli_trmm3_entry( side, alpha, a, b, beta, c ); + num_t dt = bli_obj_datatype( *c ); + + if ( bli_3mh_is_enabled_dt( dt ) ) bli_trmm33mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_3m_is_enabled_dt( dt ) ) bli_trmm33m_entry( side, alpha, a, b, beta, c ); + else if ( bli_4mh_is_enabled_dt( dt ) ) bli_trmm34mh_entry( side, alpha, a, b, beta, c ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_trmm34m_entry( side, alpha, a, b, beta, c ); + else bli_trmm3_entry( side, alpha, a, b, beta, c ); } + // // Define BLAS-like interfaces 
with homogeneous-typed operands. // diff --git a/frame/3/trmm3/bli_trmm3.h b/frame/3/trmm3/bli_trmm3.h index e7e039d8a..ff53dece5 100644 --- a/frame/3/trmm3/bli_trmm3.h +++ b/frame/3/trmm3/bli_trmm3.h @@ -38,6 +38,8 @@ #include "bli_trmm34m.h" #include "bli_trmm33m.h" +#include "bli_trmm34mh.h" +#include "bli_trmm33mh.h" // diff --git a/frame/3/trsm/3m/bli_trsm3m_cntl.c b/frame/3/trsm/3m/bli_trsm3m_cntl.c index de00c536b..87acfb5b0 100644 --- a/frame/3/trsm/3m/bli_trsm3m_cntl.c +++ b/frame/3/trsm/3m/bli_trsm3m_cntl.c @@ -48,6 +48,9 @@ extern func_t* gemm3m_ukrs; func_t* gemmtrsm3m_l_ukrs; func_t* gemmtrsm3m_u_ukrs; +func_t* trsm3m_l_ukrs; +func_t* trsm3m_u_ukrs; + packm_t* trsm3m_l_packa_cntl; packm_t* trsm3m_l_packb_cntl; @@ -88,6 +91,23 @@ void bli_trsm3m_cntl_init() BLIS_ZGEMMTRSM3M_U_UKERNEL, FALSE ); + // Create function pointer objects for each datatype-specific + // trsm3m_l and trsm3m_u micro-kernel. + trsm3m_l_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM3M_L_UKERNEL, FALSE, + BLIS_ZTRSM3M_L_UKERNEL, FALSE ); + + trsm3m_u_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM3M_U_UKERNEL, FALSE, + BLIS_ZTRSM3M_U_UKERNEL, FALSE ); + + // Create control tree objects for packm operations (left side). 
trsm3m_l_packa_cntl = @@ -260,6 +280,8 @@ void bli_trsm3m_cntl_finalize() { bli_func_obj_free( gemmtrsm3m_l_ukrs ); bli_func_obj_free( gemmtrsm3m_u_ukrs ); + bli_func_obj_free( trsm3m_l_ukrs ); + bli_func_obj_free( trsm3m_u_ukrs ); bli_cntl_obj_free( trsm3m_l_packa_cntl ); bli_cntl_obj_free( trsm3m_l_packb_cntl ); diff --git a/frame/3/trsm/4m/bli_trsm4m_cntl.c b/frame/3/trsm/4m/bli_trsm4m_cntl.c index 519d2ad26..7353d8f53 100644 --- a/frame/3/trsm/4m/bli_trsm4m_cntl.c +++ b/frame/3/trsm/4m/bli_trsm4m_cntl.c @@ -48,6 +48,9 @@ extern func_t* gemm4m_ukrs; func_t* gemmtrsm4m_l_ukrs; func_t* gemmtrsm4m_u_ukrs; +func_t* trsm4m_l_ukrs; +func_t* trsm4m_u_ukrs; + packm_t* trsm4m_l_packa_cntl; packm_t* trsm4m_l_packb_cntl; @@ -88,6 +91,22 @@ void bli_trsm4m_cntl_init() BLIS_ZGEMMTRSM4M_U_UKERNEL, FALSE ); + // Create function pointer objects for each datatype-specific + // trsm4m_l and trsm4m_u micro-kernel. + trsm4m_l_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM4M_L_UKERNEL, FALSE, + BLIS_ZTRSM4M_L_UKERNEL, FALSE ); + + trsm4m_u_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CTRSM4M_U_UKERNEL, FALSE, + BLIS_ZTRSM4M_U_UKERNEL, FALSE ); + // Create control tree objects for packm operations (left side). 
trsm4m_l_packa_cntl @@ -261,6 +280,8 @@ void bli_trsm4m_cntl_finalize() { bli_func_obj_free( gemmtrsm4m_l_ukrs ); bli_func_obj_free( gemmtrsm4m_u_ukrs ); + bli_func_obj_free( trsm4m_l_ukrs ); + bli_func_obj_free( trsm4m_u_ukrs ); bli_cntl_obj_free( trsm4m_l_packa_cntl ); bli_cntl_obj_free( trsm4m_l_packb_cntl ); diff --git a/frame/3/trsm/bli_trsm.c b/frame/3/trsm/bli_trsm.c index 14c4983b1..4b74acf61 100644 --- a/frame/3/trsm/bli_trsm.c +++ b/frame/3/trsm/bli_trsm.c @@ -42,12 +42,14 @@ void bli_trsm( side_t side, obj_t* a, obj_t* b ) { - if ( bli_4m_is_enabled( bli_obj_datatype( *b ) ) ) - bli_trsm4m_entry( side, alpha, a, b ); - else - bli_trsm_entry( side, alpha, a, b ); + num_t dt = bli_obj_datatype( *b ); + + if ( bli_3m_is_enabled_dt( dt ) ) bli_trsm3m_entry( side, alpha, a, b ); + else if ( bli_4m_is_enabled_dt( dt ) ) bli_trsm4m_entry( side, alpha, a, b ); + else bli_trsm_entry( side, alpha, a, b ); } + // // Define BLAS-like interfaces with homogeneous-typed operands. // diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h index 9b434288d..c66c2ee74 100644 --- a/frame/3/trsm/bli_trsm.h +++ b/frame/3/trsm/bli_trsm.h @@ -33,6 +33,7 @@ */ #include "bli_trsm_cntl.h" +#include "bli_trsm_query.h" #include "bli_trsm_check.h" #include "bli_trsm_entry.h" #include "bli_trsm_front.h" diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index b78899bca..915150bbe 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -50,6 +50,9 @@ extern gemm_t* gemm_cntl_bp_ke; func_t* gemmtrsm_l_ukrs; func_t* gemmtrsm_u_ukrs; +func_t* trsm_l_ukrs; +func_t* trsm_u_ukrs; + packm_t* trsm_l_packa_cntl; packm_t* trsm_l_packb_cntl; @@ -90,6 +93,23 @@ void bli_trsm_cntl_init() BLIS_ZGEMMTRSM_U_UKERNEL, FALSE ); + // Create function pointer objects for each datatype-specific + // trsm_l and trsm_u micro-kernel. 
+ trsm_l_ukrs + = + bli_func_obj_create( BLIS_STRSM_L_UKERNEL, FALSE, + BLIS_DTRSM_L_UKERNEL, FALSE, + BLIS_CTRSM_L_UKERNEL, FALSE, + BLIS_ZTRSM_L_UKERNEL, FALSE ); + + trsm_u_ukrs + = + bli_func_obj_create( BLIS_STRSM_U_UKERNEL, FALSE, + BLIS_DTRSM_U_UKERNEL, FALSE, + BLIS_CTRSM_U_UKERNEL, FALSE, + BLIS_ZTRSM_U_UKERNEL, FALSE ); + + // Create control tree objects for packm operations (left side). trsm_l_packa_cntl = @@ -262,6 +282,8 @@ void bli_trsm_cntl_finalize() { bli_func_obj_free( gemmtrsm_l_ukrs ); bli_func_obj_free( gemmtrsm_u_ukrs ); + bli_func_obj_free( trsm_l_ukrs ); + bli_func_obj_free( trsm_u_ukrs ); bli_cntl_obj_free( trsm_l_packa_cntl ); bli_cntl_obj_free( trsm_l_packb_cntl ); diff --git a/frame/3/trsm/bli_trsm_query.c b/frame/3/trsm/bli_trsm_query.c new file mode 100644 index 000000000..0a7ba82ae --- /dev/null +++ b/frame/3/trsm/bli_trsm_query.c @@ -0,0 +1,171 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +/* gemmtrsm micro-kernel tables (3m, 4m, native; _l and _u variants), declared extern here. */ +extern func_t* gemmtrsm3m_l_ukrs; +extern func_t* gemmtrsm3m_u_ukrs; +extern func_t* gemmtrsm4m_l_ukrs; +extern func_t* gemmtrsm4m_u_ukrs; +extern func_t* gemmtrsm_l_ukrs; +extern func_t* gemmtrsm_u_ukrs; +/* trsm micro-kernel tables, in the same six variants. */ +extern func_t* trsm3m_l_ukrs; +extern func_t* trsm3m_u_ukrs; +extern func_t* trsm4m_l_ukrs; +extern func_t* trsm4m_u_ukrs; +extern func_t* trsm_l_ukrs; +extern func_t* trsm_u_ukrs; +/* Return the gemmtrsm micro-kernel table for datatype dt and triangle uplo. Precedence: 3m if enabled for dt, else 4m if enabled for dt, else native; within each, bli_is_lower( uplo ) picks the _l table and anything else picks the _u table. */ +func_t* bli_gemmtrsm_query_ukrs( uplo_t uplo, num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? gemmtrsm3m_l_ukrs + : gemmtrsm3m_u_ukrs ); + else if ( bli_4m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? gemmtrsm4m_l_ukrs + : gemmtrsm4m_u_ukrs ); + else + return ( bli_is_lower( uplo ) ? gemmtrsm_l_ukrs + : gemmtrsm_u_ukrs ); +} +/* Same selection as bli_gemmtrsm_query_ukrs(), applied to the trsm micro-kernel tables. */ +func_t* bli_trsm_query_ukrs( uplo_t uplo, num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? trsm3m_l_ukrs + : trsm3m_u_ukrs ); + else if ( bli_4m_is_enabled_dt( dt ) ) + return ( bli_is_lower( uplo ) ? trsm4m_l_ukrs + : trsm4m_u_ukrs ); + else + return ( bli_is_lower( uplo ) ?
trsm_l_ukrs + : trsm_u_ukrs ); +} +/* Return the implementation-description string for datatype dt, with the same 3m, then 4m, then native precedence. */ +char* bli_trsm_query_impl_string( num_t dt ) +{ + if ( bli_3m_is_enabled_dt( dt ) ) return bli_3m_get_string(); + else if ( bli_4m_is_enabled_dt( dt ) ) return bli_4m_get_string(); + else return bli_native_get_string(); +} +/* Classify the lower gemmtrsm micro-kernel registered for dt: the queried pointer is compared against the native, 3m, and 4m *_UKERNEL_REF symbols in turn; a pointer matching none of them is reported as BLIS_OPTIMIZED_UKERNEL. */ +kimpl_t bli_gemmtrsm_l_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_gemmtrsm_query_ukrs( BLIS_LOWER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_SGEMMTRSM_L_UKERNEL_REF || + p == BLIS_DGEMMTRSM_L_UKERNEL_REF || + p == BLIS_CGEMMTRSM_L_UKERNEL_REF || + p == BLIS_ZGEMMTRSM_L_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM3M_L_UKERNEL_REF || + p == BLIS_ZGEMMTRSM3M_L_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM4M_L_UKERNEL_REF || + p == BLIS_ZGEMMTRSM4M_L_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} +/* Upper counterpart of bli_gemmtrsm_l_ukernel_impl_type(): queries with BLIS_UPPER and compares against the _U_ reference symbols. */ +kimpl_t bli_gemmtrsm_u_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_gemmtrsm_query_ukrs( BLIS_UPPER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_SGEMMTRSM_U_UKERNEL_REF || + p == BLIS_DGEMMTRSM_U_UKERNEL_REF || + p == BLIS_CGEMMTRSM_U_UKERNEL_REF || + p == BLIS_ZGEMMTRSM_U_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM3M_U_UKERNEL_REF || + p == BLIS_ZGEMMTRSM3M_U_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CGEMMTRSM4M_U_UKERNEL_REF || + p == BLIS_ZGEMMTRSM4M_U_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} +/* Same classification for the lower trsm micro-kernel, using bli_trsm_query_ukrs() and the TRSM _L_ reference symbols. */ +kimpl_t bli_trsm_l_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_trsm_query_ukrs( BLIS_LOWER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_STRSM_L_UKERNEL_REF || + p == BLIS_DTRSM_L_UKERNEL_REF || + p == BLIS_CTRSM_L_UKERNEL_REF || + p == BLIS_ZTRSM_L_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CTRSM3M_L_UKERNEL_REF || + p == BLIS_ZTRSM3M_L_UKERNEL_REF + ) return
BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CTRSM4M_L_UKERNEL_REF || + p == BLIS_ZTRSM4M_L_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} +/* Same classification for the upper trsm micro-kernel, using BLIS_UPPER and the TRSM _U_ reference symbols. */ +kimpl_t bli_trsm_u_ukernel_impl_type( num_t dt ) +{ + func_t* ukrs = bli_trsm_query_ukrs( BLIS_UPPER, dt ); + void* p = bli_func_obj_query( dt, ukrs ); + + if ( p == BLIS_STRSM_U_UKERNEL_REF || + p == BLIS_DTRSM_U_UKERNEL_REF || + p == BLIS_CTRSM_U_UKERNEL_REF || + p == BLIS_ZTRSM_U_UKERNEL_REF + ) return BLIS_REFERENCE_UKERNEL; + else if ( + p == BLIS_CTRSM3M_U_UKERNEL_REF || + p == BLIS_ZTRSM3M_U_UKERNEL_REF + ) return BLIS_VIRTUAL3M_UKERNEL; + else if ( + p == BLIS_CTRSM4M_U_UKERNEL_REF || + p == BLIS_ZTRSM4M_U_UKERNEL_REF + ) return BLIS_VIRTUAL4M_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; +} + + diff --git a/frame/3/trsm/bli_trsm_query.h b/frame/3/trsm/bli_trsm_query.h new file mode 100644 index 000000000..31f2043b8 --- /dev/null +++ b/frame/3/trsm/bli_trsm_query.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Queries for the trsm / gemmtrsm micro-kernel table (by uplo and datatype) and for the name of the implementation in effect. */ +func_t* bli_gemmtrsm_query_ukrs( uplo_t uplo, num_t dt ); +func_t* bli_trsm_query_ukrs( uplo_t uplo, num_t dt ); +char* bli_trsm_query_impl_string( num_t dt ); +/* Each returns a kimpl_t describing the micro-kernel currently selected for dt (lower or upper variant). */ +kimpl_t bli_gemmtrsm_l_ukernel_impl_type( num_t dt ); +kimpl_t bli_gemmtrsm_u_ukernel_impl_type( num_t dt ); +kimpl_t bli_trsm_l_ukernel_impl_type( num_t dt ); +kimpl_t bli_trsm_u_ukernel_impl_type( num_t dt ); diff --git a/frame/base/bli_3m.c b/frame/base/bli_3m.c new file mode 100644 index 000000000..c8f8420e9 --- /dev/null +++ b/frame/base/bli_3m.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +/* Name string handed out by bli_3m_get_string(). */ +static char* bli_3m_str = "3m"; +/* File-local 3m switches, one for scomplex (_c) and one for dcomplex (_z); both start out FALSE. */ +static bool_t bli_will_use_3m_c = FALSE; +static bool_t bli_will_use_3m_z = FALSE; + +/* Returns the implementation name string for 3m. */ +char* bli_3m_get_string( void ) { return bli_3m_str; } +/* Reports the 3m switch for dt; any datatype other than scomplex or dcomplex yields FALSE. */ +bool_t bli_3m_is_enabled_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) return bli_3m_is_enabled_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_3m_is_enabled_z(); + else return FALSE; +} +bool_t bli_3m_is_enabled_c( void ) { return bli_will_use_3m_c; } +bool_t bli_3m_is_enabled_z( void ) { return bli_will_use_3m_z; } + +/* Turns 3m on. The _dt form does nothing unless dt is scomplex or dcomplex; bli_3m_enable() sets both switches. */ +void bli_3m_enable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3m_enable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_3m_enable_z(); +} +void bli_3m_enable_c( void ) { bli_will_use_3m_c = TRUE; } +void bli_3m_enable_z( void ) { bli_will_use_3m_z = TRUE; } +void bli_3m_enable( void ) { bli_will_use_3m_c = + bli_will_use_3m_z = TRUE; } + +/* Turns 3m off. The _dt form does nothing unless dt is scomplex or dcomplex; bli_3m_disable() clears both switches. */ +void bli_3m_disable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3m_disable_c(); + else if ( bli_is_dcomplex( 
dt ) ) bli_3m_disable_z(); +} + +void bli_3m_disable_c( void ) { bli_will_use_3m_c = FALSE; } +void bli_3m_disable_z( void ) { bli_will_use_3m_z = FALSE; } +void bli_3m_disable( void ) { bli_will_use_3m_c = + bli_will_use_3m_z = FALSE; } diff --git a/frame/base/bli_3m.h b/frame/base/bli_3m.h new file mode 100644 index 000000000..7a3557777 --- /dev/null +++ b/frame/base/bli_3m.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +/* Name string of the 3m implementation. */ +char* bli_3m_get_string( void ); +/* Query the 3m switch: by datatype (_dt), for scomplex (_c), or for dcomplex (_z). */ +bool_t bli_3m_is_enabled_dt( num_t dt ); +bool_t bli_3m_is_enabled_c( void ); +bool_t bli_3m_is_enabled_z( void ); +/* Enable 3m: by datatype, per precision, or (no suffix) for both complex datatypes. */ +void bli_3m_enable_dt( num_t dt ); +void bli_3m_enable_c( void ); +void bli_3m_enable_z( void ); +void bli_3m_enable( void ); +/* Disable 3m: by datatype, per precision, or (no suffix) for both complex datatypes. */ +void bli_3m_disable_dt( num_t dt ); +void bli_3m_disable_c( void ); +void bli_3m_disable_z( void ); +void bli_3m_disable( void ); diff --git a/frame/base/bli_3mh.c b/frame/base/bli_3mh.c new file mode 100644 index 000000000..6cabb69dc --- /dev/null +++ b/frame/base/bli_3mh.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +/* Name string handed out by bli_3mh_get_string(). */ +static char* bli_3mh_str = "3mh"; +/* File-local 3mh switches, one for scomplex (_c) and one for dcomplex (_z); both start out FALSE. */ +static bool_t bli_will_use_3mh_c = FALSE; +static bool_t bli_will_use_3mh_z = FALSE; + +/* Returns the implementation name string for 3mh. */ +char* bli_3mh_get_string( void ) { return bli_3mh_str; } +/* Reports the 3mh switch for dt; any datatype other than scomplex or dcomplex yields FALSE. */ +bool_t bli_3mh_is_enabled_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) return bli_3mh_is_enabled_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_3mh_is_enabled_z(); + else return FALSE; +} +bool_t bli_3mh_is_enabled_c( void ) { return bli_will_use_3mh_c; } +bool_t bli_3mh_is_enabled_z( void ) { return bli_will_use_3mh_z; } + +/* Turns 3mh on. The _dt form does nothing unless dt is scomplex or dcomplex; bli_3mh_enable() sets both switches. */ +void bli_3mh_enable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3mh_enable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_3mh_enable_z(); +} +void bli_3mh_enable_c( void ) { bli_will_use_3mh_c = TRUE; } +void bli_3mh_enable_z( void ) { bli_will_use_3mh_z = TRUE; } +void bli_3mh_enable( void ) { bli_will_use_3mh_c = + bli_will_use_3mh_z = TRUE; } + +/* Turns 3mh off. The _dt form does nothing unless dt is scomplex or dcomplex; bli_3mh_disable() clears both switches. */ +void bli_3mh_disable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_3mh_disable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_3mh_disable_z(); +} + +void bli_3mh_disable_c( void ) { bli_will_use_3mh_c = FALSE; } +void bli_3mh_disable_z( void ) { bli_will_use_3mh_z = FALSE; } +void bli_3mh_disable( 
void ) { bli_will_use_3mh_c = + bli_will_use_3mh_z = FALSE; } diff --git a/frame/base/bli_3mh.h b/frame/base/bli_3mh.h new file mode 100644 index 000000000..4b9798275 --- /dev/null +++ b/frame/base/bli_3mh.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +/* Name string of the 3mh implementation. */ +char* bli_3mh_get_string( void ); +/* Query the 3mh switch: by datatype (_dt), for scomplex (_c), or for dcomplex (_z). */ +bool_t bli_3mh_is_enabled_dt( num_t dt ); +bool_t bli_3mh_is_enabled_c( void ); +bool_t bli_3mh_is_enabled_z( void ); +/* Enable 3mh: by datatype, per precision, or (no suffix) for both complex datatypes. */ +void bli_3mh_enable_dt( num_t dt ); +void bli_3mh_enable_c( void ); +void bli_3mh_enable_z( void ); +void bli_3mh_enable( void ); +/* Disable 3mh: by datatype, per precision, or (no suffix) for both complex datatypes. */ +void bli_3mh_disable_dt( num_t dt ); +void bli_3mh_disable_c( void ); +void bli_3mh_disable_z( void ); +void bli_3mh_disable( void ); diff --git a/frame/base/bli_4m.c b/frame/base/bli_4m.c index f188b5300..7fe0b698b 100644 --- a/frame/base/bli_4m.c +++ b/frame/base/bli_4m.c @@ -34,23 +34,28 @@ #include "blis.h" +static char* bli_native_str = "native"; +static char* bli_4m_str = "4m"; +/* The two strings above are returned by bli_native_get_string() and bli_4m_get_string() further down. */ // Initialize the 4m enabled/disabled state based on the cpp macros -// BLIS_ENABLE_SCOMPLEX_VIA_4M and BLIS_ENABLE_DCOMPLEX_VIA_4M, which -// are set in bli_kernel_macro_defs.h. -#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M +// which are set in bli_kernel_macro_defs.h. +#ifdef BLIS_ENABLE_VIRTUAL_SCOMPLEX static bool_t bli_will_use_4m_c = TRUE; #else static bool_t bli_will_use_4m_c = FALSE; #endif -#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M +#ifdef BLIS_ENABLE_VIRTUAL_DCOMPLEX static bool_t bli_will_use_4m_z = TRUE; #else static bool_t bli_will_use_4m_z = FALSE; #endif -bool_t bli_4m_is_enabled( num_t dt ) +char* bli_native_get_string( void ) { return bli_native_str; } +char* bli_4m_get_string( void ) { return bli_4m_str; } + +bool_t bli_4m_is_enabled_dt( num_t dt ) { if ( bli_is_scomplex( dt ) ) return bli_4m_is_enabled_c(); else if ( bli_is_dcomplex( dt ) ) return bli_4m_is_enabled_z(); @@ -60,18 +65,18 @@ bool_t bli_4m_is_enabled_c( void ) { return bli_will_use_4m_c; } bool_t bli_4m_is_enabled_z( void ) { return bli_will_use_4m_z; } -void bli_4m_enable( num_t dt ) +void bli_4m_enable_dt( num_t dt ) { if ( bli_is_scomplex( dt ) ) bli_4m_enable_c(); else if ( bli_is_dcomplex( dt ) ) bli_4m_enable_z(); } void bli_4m_enable_c( void ) { bli_will_use_4m_c = TRUE; } void bli_4m_enable_z( void ) { 
bli_will_use_4m_z = TRUE; } -void bli_4m_enable_cz( void ) { bli_will_use_4m_c = +void bli_4m_enable( void ) { bli_will_use_4m_c = bli_will_use_4m_z = TRUE; } -void bli_4m_disable( num_t dt ) +void bli_4m_disable_dt( num_t dt ) { if ( bli_is_scomplex( dt ) ) bli_4m_disable_c(); else if ( bli_is_dcomplex( dt ) ) bli_4m_disable_z(); @@ -79,5 +84,5 @@ void bli_4m_disable( num_t dt ) void bli_4m_disable_c( void ) { bli_will_use_4m_c = FALSE; } void bli_4m_disable_z( void ) { bli_will_use_4m_z = FALSE; } -void bli_4m_disable_cz( void ) { bli_will_use_4m_c = +void bli_4m_disable( void ) { bli_will_use_4m_c = bli_will_use_4m_z = FALSE; } diff --git a/frame/base/bli_4m.h b/frame/base/bli_4m.h index 9791d70b8..db497820e 100644 --- a/frame/base/bli_4m.h +++ b/frame/base/bli_4m.h @@ -32,16 +32,19 @@ */ -bool_t bli_4m_is_enabled( num_t dt ); +char* bli_native_get_string( void ); +char* bli_4m_get_string( void ); +/* Renamed API: the datatype-taking functions now carry a _dt suffix, and the former _cz functions are now the unsuffixed bli_4m_enable() / bli_4m_disable(). */ +bool_t bli_4m_is_enabled_dt( num_t dt ); bool_t bli_4m_is_enabled_c( void ); bool_t bli_4m_is_enabled_z( void ); -void bli_4m_enable( num_t dt ); +void bli_4m_enable_dt( num_t dt ); void bli_4m_enable_c( void ); void bli_4m_enable_z( void ); -void bli_4m_enable_cz( void ); +void bli_4m_enable( void ); -void bli_4m_disable( num_t dt ); +void bli_4m_disable_dt( num_t dt ); void bli_4m_disable_c( void ); void bli_4m_disable_z( void ); -void bli_4m_disable_cz( void ); +void bli_4m_disable( void ); diff --git a/frame/base/bli_4mh.c b/frame/base/bli_4mh.c new file mode 100644 index 000000000..110961b84 --- /dev/null +++ b/frame/base/bli_4mh.c @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +/* Name string handed out by bli_4mh_get_string(). */ +static char* bli_4mh_str = "4mh"; +/* File-local 4mh switches, one for scomplex (_c) and one for dcomplex (_z); both start out FALSE. */ +static bool_t bli_will_use_4mh_c = FALSE; +static bool_t bli_will_use_4mh_z = FALSE; + +/* Returns the implementation name string for 4mh. */ +char* bli_4mh_get_string( void ) { return bli_4mh_str; } +/* Reports the 4mh switch for dt; any datatype other than scomplex or dcomplex yields FALSE. */ +bool_t bli_4mh_is_enabled_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) return bli_4mh_is_enabled_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_4mh_is_enabled_z(); + else return FALSE; +} +bool_t bli_4mh_is_enabled_c( void ) { return bli_will_use_4mh_c; } +bool_t bli_4mh_is_enabled_z( void ) { return bli_will_use_4mh_z; } + +/* Turns 4mh on. The _dt form does nothing unless dt is scomplex or dcomplex; bli_4mh_enable() sets both switches. */ +void bli_4mh_enable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_4mh_enable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_4mh_enable_z(); +} +void bli_4mh_enable_c( void ) { bli_will_use_4mh_c = TRUE; } +void bli_4mh_enable_z( void ) { bli_will_use_4mh_z = TRUE; } +void bli_4mh_enable( void ) { bli_will_use_4mh_c = + bli_will_use_4mh_z = TRUE; } + +/* Turns 4mh off. The _dt form does nothing unless dt is scomplex or dcomplex; bli_4mh_disable() clears both switches. */ +void bli_4mh_disable_dt( num_t dt ) +{ + if ( bli_is_scomplex( dt ) ) bli_4mh_disable_c(); + else if ( bli_is_dcomplex( dt ) ) bli_4mh_disable_z(); +} + +void bli_4mh_disable_c( void ) { bli_will_use_4mh_c = FALSE; } +void bli_4mh_disable_z( void ) { bli_will_use_4mh_z = FALSE; } +void bli_4mh_disable( void ) { bli_will_use_4mh_c = + bli_will_use_4mh_z = FALSE; } diff --git a/frame/base/bli_4mh.h b/frame/base/bli_4mh.h new file mode 100644 index 000000000..050086732 --- /dev/null +++ b/frame/base/bli_4mh.h @@ -0,0 +1,49 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ +/* Name string of the 4mh implementation. */ +char* bli_4mh_get_string( void ); +/* Query the 4mh switch: by datatype (_dt), for scomplex (_c), or for dcomplex (_z). */ +bool_t bli_4mh_is_enabled_dt( num_t dt ); +bool_t bli_4mh_is_enabled_c( void ); +bool_t bli_4mh_is_enabled_z( void ); +/* Enable 4mh: by datatype, per precision, or (no suffix) for both complex datatypes. */ +void bli_4mh_enable_dt( num_t dt ); +void bli_4mh_enable_c( void ); +void bli_4mh_enable_z( void ); +void bli_4mh_enable( void ); +/* Disable 4mh: by datatype, per precision, or (no suffix) for both complex datatypes. */ +void bli_4mh_disable_dt( num_t dt ); +void bli_4mh_disable_c( void ); +void bli_4mh_disable_z( void ); +void bli_4mh_disable( void ); diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index d22c5911b..2f8fa70ac 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -421,7 +421,7 @@ gint_t bli_info_get_dotxf_fuse_fac_c( void ) { return BLIS_DOTXF_FUSE_FAC_C; } gint_t bli_info_get_dotxf_fuse_fac_z( void ) { return BLIS_DOTXF_FUSE_FAC_Z; } -// dotxf +// dotxaxpyf gint_t bli_info_get_dotxaxpyf_fuse_fac( num_t dt ) { @@ -437,6 +437,39 @@ gint_t bli_info_get_dotxaxpyf_fuse_fac_c( void ) { return BLIS_DOTXAXPYF_FUSE_FA gint_t bli_info_get_dotxaxpyf_fuse_fac_z( void ) { return BLIS_DOTXAXPYF_FUSE_FAC_Z; } +// -- Level-3 kernel definitions -- +/* Display names indexed directly by the kimpl_t value that the *_ukernel_impl_type() queries return. NOTE(review): assumes kimpl_t numbers reference, virtual 4m, virtual 3m, optimized as 0..3 in that order - confirm against the enum definition. */ +static char* ukr_type_str[4] = { "refnce", + "virt4m", + "virt3m", + "optmzd" }; + +char* bli_info_get_gemm_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_gemm_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_gemmtrsm_l_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_gemmtrsm_l_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_gemmtrsm_u_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_gemmtrsm_u_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_trsm_l_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_trsm_l_ukernel_impl_type( dt ) ]; +} + +char* bli_info_get_trsm_u_ukr_type( num_t dt ) +{ + return ukr_type_str[ bli_trsm_u_ukernel_impl_type( dt ) ]; +} + + // -- bli_mem_pool_macro_defs.h ------------------------------------------------ @@ -444,3 +477,18 @@ gint_t bli_info_get_mk_pool_size( void ) { return BLIS_MK_POOL_SIZE; } gint_t bli_info_get_kn_pool_size( void ) { return BLIS_KN_POOL_SIZE; } gint_t 
bli_info_get_mn_pool_size( void ) { return BLIS_MN_POOL_SIZE; } + + +// -- BLIS implementation query (level-3) -------------------------------------- +/* Each function calls bli_init() and then forwards to an operation-level query. hemm, herk, her2k, symm, syrk, syr2k and trmm3 all forward to the gemm query; only trmm and trsm have their own. NOTE(review): presumably those operations share gemm's implementation choice - confirm this is intended. */ +char* bli_info_get_gemm_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_hemm_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_herk_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_her2k_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_symm_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_syrk_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_syr2k_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_trmm_impl_string( num_t dt ) { bli_init(); return bli_trmm_query_impl_string( dt ); } +char* bli_info_get_trmm3_impl_string( num_t dt ) { bli_init(); return bli_gemm_query_impl_string( dt ); } +char* bli_info_get_trsm_impl_string( num_t dt ) { bli_init(); return bli_trsm_query_impl_string( dt ); } + diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 7c087e6d7..7e0dbb2a5 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -181,9 +181,32 @@ gint_t bli_info_get_dotxaxpyf_fuse_fac_c( void ); gint_t bli_info_get_dotxaxpyf_fuse_fac_z( void ); +// -- Level-3 kernel definitions -- + +char* bli_info_get_gemm_ukr_type( num_t dt ); +char* bli_info_get_gemmtrsm_l_ukr_type( num_t dt ); +char* bli_info_get_gemmtrsm_u_ukr_type( num_t dt ); +char* bli_info_get_trsm_l_ukr_type( num_t dt ); +char* bli_info_get_trsm_u_ukr_type( num_t dt ); + + // -- bli_mem_pool_macro_defs.h ------------------------------------------------ gint_t bli_info_get_mk_pool_size( void ); gint_t bli_info_get_kn_pool_size( void ); gint_t bli_info_get_mn_pool_size( void ); + +// -- 
BLIS implementation query (level-3) -------------------------------------- + +char* bli_info_get_gemm_impl_string( num_t dt ); +char* bli_info_get_hemm_impl_string( num_t dt ); +char* bli_info_get_herk_impl_string( num_t dt ); +char* bli_info_get_her2k_impl_string( num_t dt ); +char* bli_info_get_symm_impl_string( num_t dt ); +char* bli_info_get_syrk_impl_string( num_t dt ); +char* bli_info_get_syr2k_impl_string( num_t dt ); +char* bli_info_get_trmm_impl_string( num_t dt ); +char* bli_info_get_trmm3_impl_string( num_t dt ); +char* bli_info_get_trsm_impl_string( num_t dt ); + diff --git a/frame/cntl/bli_cntl_init.c b/frame/cntl/bli_cntl_init.c index 3f885cd7e..cec9ffd0f 100644 --- a/frame/cntl/bli_cntl_init.c +++ b/frame/cntl/bli_cntl_init.c @@ -66,6 +66,12 @@ void bli_cntl_init( void ) // Level-3 via 3m bli_gemm3m_cntl_init(); bli_trsm3m_cntl_init(); + + // Level-3 via 4mh + bli_gemm4mh_cntl_init(); + + // Level-3 via 3mh + bli_gemm3mh_cntl_init(); } void bli_cntl_finalize( void ) @@ -100,5 +106,11 @@ void bli_cntl_finalize( void ) // Level-3 via 3m bli_gemm3m_cntl_finalize(); bli_trsm3m_cntl_finalize(); + + // Level-3 via 4mh + bli_gemm4mh_cntl_finalize(); + + // Level-3 via 3mh + bli_gemm3mh_cntl_finalize(); } diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 67502abc2..8b784e908 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -93,14 +93,15 @@ #ifndef BLIS_CGEMM_UKERNEL #define BLIS_CGEMM_UKERNEL BLIS_CGEMM_UKERNEL_REF #ifdef BLIS_SGEMM_UKERNEL -#define BLIS_ENABLE_SCOMPLEX_VIA_4M +#define BLIS_ENABLE_VIRTUAL_SCOMPLEX #endif +#else #endif #ifndef BLIS_ZGEMM_UKERNEL #define BLIS_ZGEMM_UKERNEL BLIS_ZGEMM_UKERNEL_REF #ifdef BLIS_DGEMM_UKERNEL -#define BLIS_ENABLE_DCOMPLEX_VIA_4M +#define BLIS_ENABLE_VIRTUAL_DCOMPLEX #endif #endif diff --git a/frame/include/bli_kernel_pre_macro_defs.h b/frame/include/bli_kernel_pre_macro_defs.h index cfaae8d3c..47d2d3010 100644 --- 
a/frame/include/bli_kernel_pre_macro_defs.h +++ b/frame/include/bli_kernel_pre_macro_defs.h @@ -135,6 +135,24 @@ #define BLIS_ZTRSM3M_U_UKERNEL_REF bli_ztrsm3m_u_ukr_ref // +// Level-3 4mh +// + +// gemm4mh micro-kernels + +#define BLIS_CGEMM4MH_UKERNEL_REF bli_cgemm4mh_ukr_ref +#define BLIS_ZGEMM4MH_UKERNEL_REF bli_zgemm4mh_ukr_ref + +// +// +// Level-3 3mh +// + +// gemm3mh micro-kernels + +#define BLIS_CGEMM3MH_UKERNEL_REF bli_cgemm3mh_ukr_ref +#define BLIS_ZGEMM3MH_UKERNEL_REF bli_zgemm3mh_ukr_ref + // Level-1m // @@ -274,6 +292,46 @@ #define BLIS_CPACKM_16XK_3M_KERNEL_REF bli_cpackm_ref_16xk_3m #define BLIS_ZPACKM_16XK_3M_KERNEL_REF bli_zpackm_ref_16xk_3m +// packm_2xk_rih kernels + +#define BLIS_CPACKM_2XK_RIH_KERNEL_REF bli_cpackm_ref_2xk_rih +#define BLIS_ZPACKM_2XK_RIH_KERNEL_REF bli_zpackm_ref_2xk_rih + +// packm_4xk_rih kernels + +#define BLIS_CPACKM_4XK_RIH_KERNEL_REF bli_cpackm_ref_4xk_rih +#define BLIS_ZPACKM_4XK_RIH_KERNEL_REF bli_zpackm_ref_4xk_rih + +// packm_6xk_rih kernels + +#define BLIS_CPACKM_6XK_RIH_KERNEL_REF bli_cpackm_ref_6xk_rih +#define BLIS_ZPACKM_6XK_RIH_KERNEL_REF bli_zpackm_ref_6xk_rih + +// packm_8xk_rih kernels + +#define BLIS_CPACKM_8XK_RIH_KERNEL_REF bli_cpackm_ref_8xk_rih +#define BLIS_ZPACKM_8XK_RIH_KERNEL_REF bli_zpackm_ref_8xk_rih + +// packm_10xk_rih kernels + +#define BLIS_CPACKM_10XK_RIH_KERNEL_REF bli_cpackm_ref_10xk_rih +#define BLIS_ZPACKM_10XK_RIH_KERNEL_REF bli_zpackm_ref_10xk_rih + +// packm_12xk_rih kernels + +#define BLIS_CPACKM_12XK_RIH_KERNEL_REF bli_cpackm_ref_12xk_rih +#define BLIS_ZPACKM_12XK_RIH_KERNEL_REF bli_zpackm_ref_12xk_rih + +// packm_14xk_rih kernels + +#define BLIS_CPACKM_14XK_RIH_KERNEL_REF bli_cpackm_ref_14xk_rih +#define BLIS_ZPACKM_14XK_RIH_KERNEL_REF bli_zpackm_ref_14xk_rih + +// packm_16xk_rih kernels + +#define BLIS_CPACKM_16XK_RIH_KERNEL_REF bli_cpackm_ref_16xk_rih +#define BLIS_ZPACKM_16XK_RIH_KERNEL_REF bli_zpackm_ref_16xk_rih + // unpack_2xk kernels #define BLIS_SUNPACKM_2XK_KERNEL_REF 
bli_sunpackm_ref_2xk diff --git a/frame/include/bli_kernel_rih_macro_defs.h b/frame/include/bli_kernel_rih_macro_defs.h new file mode 100644 index 000000000..fae464810 --- /dev/null +++ b/frame/include/bli_kernel_rih_macro_defs.h @@ -0,0 +1,168 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_RIH_MACRO_DEFS_H +#define BLIS_KERNEL_RIH_MACRO_DEFS_H + + +// -- Define 4mh/3mh row access bools ------------------------------------------ + +// gemm4mh micro-kernels + +#define BLIS_CGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM4MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + +// gemm3mh micro-kernels + +#define BLIS_CGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM3MH_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + + +// -- Define default 4mh/3mh-specific kernel names ----------------------------- + +// +// Level-3 +// + +// gemm4mh micro-kernels + +#ifndef BLIS_CGEMM4MH_UKERNEL +#define BLIS_CGEMM4MH_UKERNEL BLIS_CGEMM4MH_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM4MH_UKERNEL +#define BLIS_ZGEMM4MH_UKERNEL BLIS_ZGEMM4MH_UKERNEL_REF +#endif + +// gemm3mh micro-kernels + +#ifndef BLIS_CGEMM3MH_UKERNEL +#define BLIS_CGEMM3MH_UKERNEL BLIS_CGEMM3MH_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM3MH_UKERNEL +#define BLIS_ZGEMM3MH_UKERNEL BLIS_ZGEMM3MH_UKERNEL_REF +#endif + +// +// Level-1m +// + +// packm_2xk_rih kernels + +#ifndef BLIS_CPACKM_2XK_RIH_KERNEL +#define BLIS_CPACKM_2XK_RIH_KERNEL BLIS_CPACKM_2XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_RIH_KERNEL +#define BLIS_ZPACKM_2XK_RIH_KERNEL BLIS_ZPACKM_2XK_RIH_KERNEL_REF +#endif + +// packm_4xk_rih kernels + +#ifndef BLIS_CPACKM_4XK_RIH_KERNEL +#define BLIS_CPACKM_4XK_RIH_KERNEL BLIS_CPACKM_4XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_RIH_KERNEL +#define BLIS_ZPACKM_4XK_RIH_KERNEL BLIS_ZPACKM_4XK_RIH_KERNEL_REF +#endif + +// packm_6xk_rih kernels + +#ifndef BLIS_CPACKM_6XK_RIH_KERNEL +#define BLIS_CPACKM_6XK_RIH_KERNEL BLIS_CPACKM_6XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_RIH_KERNEL +#define BLIS_ZPACKM_6XK_RIH_KERNEL BLIS_ZPACKM_6XK_RIH_KERNEL_REF +#endif + +// packm_8xk_rih kernels + +#ifndef 
BLIS_CPACKM_8XK_RIH_KERNEL +#define BLIS_CPACKM_8XK_RIH_KERNEL BLIS_CPACKM_8XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_RIH_KERNEL +#define BLIS_ZPACKM_8XK_RIH_KERNEL BLIS_ZPACKM_8XK_RIH_KERNEL_REF +#endif + +// packm_10xk_rih kernels + +#ifndef BLIS_CPACKM_10XK_RIH_KERNEL +#define BLIS_CPACKM_10XK_RIH_KERNEL BLIS_CPACKM_10XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_RIH_KERNEL +#define BLIS_ZPACKM_10XK_RIH_KERNEL BLIS_ZPACKM_10XK_RIH_KERNEL_REF +#endif + +// packm_12xk_rih kernels + +#ifndef BLIS_CPACKM_12XK_RIH_KERNEL +#define BLIS_CPACKM_12XK_RIH_KERNEL BLIS_CPACKM_12XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_RIH_KERNEL +#define BLIS_ZPACKM_12XK_RIH_KERNEL BLIS_ZPACKM_12XK_RIH_KERNEL_REF +#endif + +// packm_14xk_rih kernels + +#ifndef BLIS_CPACKM_14XK_RIH_KERNEL +#define BLIS_CPACKM_14XK_RIH_KERNEL BLIS_CPACKM_14XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_RIH_KERNEL +#define BLIS_ZPACKM_14XK_RIH_KERNEL BLIS_ZPACKM_14XK_RIH_KERNEL_REF +#endif + +// packm_16xk_rih kernels + +#ifndef BLIS_CPACKM_16XK_RIH_KERNEL +#define BLIS_CPACKM_16XK_RIH_KERNEL BLIS_CPACKM_16XK_RIH_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_RIH_KERNEL +#define BLIS_ZPACKM_16XK_RIH_KERNEL BLIS_ZPACKM_16XK_RIH_KERNEL_REF +#endif + + + +#endif diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 1199f219c..5c4195932 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -221,6 +221,24 @@ \ ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3M ) +#define bli_obj_is_ro_packed( obj ) \ +\ + ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RO ) + +#define bli_obj_is_io_packed( obj ) \ +\ + ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_IO ) + +#define bli_obj_is_rpi_packed( obj ) \ +\ + ( ( (obj).info & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RPI ) + +#define bli_obj_is_rih_packed( obj ) \ +\ + ( bli_obj_is_ro_packed( obj ) || \ + bli_obj_is_io_packed( obj ) || \ + 
bli_obj_is_rpi_packed( obj ) ) + #define bli_obj_pack_buffer_type( obj ) \ \ ( (obj).info & BLIS_PACK_BUFFER_BITS ) diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index e1fd135a5..0a1e1cce8 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -356,7 +356,6 @@ ( bli_does_notrans( trans ) ? ( m == 1 ? (cs) : (rs) ) \ : ( m == 1 ? (rs) : (cs) ) ) -/* #define bli_is_row_stored( rs, cs ) \ \ ( bli_abs( cs ) == 1 ) @@ -364,7 +363,6 @@ #define bli_is_col_stored( rs, cs ) \ \ ( bli_abs( rs ) == 1 ) -*/ #define bli_is_row_stored_f( m, n, rs, cs ) \ \ @@ -530,6 +528,25 @@ \ ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3M ) +#define bli_is_ro_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RO ) + +#define bli_is_io_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_IO ) + +#define bli_is_rpi_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RPI ) + +#define bli_is_rih_packed( schema ) \ +\ + ( bli_is_ro_packed( schema ) || \ + bli_is_io_packed( schema ) || \ + bli_is_rpi_packed( schema ) ) + + // return datatype for char diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index d5134da39..832dd9f48 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -197,6 +197,25 @@ #include "bli_scal2jri3s.h" +// -- 4mh/3mh-specific scalar macros -- + +#include "bli_scal2rihs_mxn_diag.h" +#include "bli_scal2rihs_mxn_uplo.h" +#include "bli_setrihs_mxn_diag.h" + +// ro +#include "bli_scal2ros.h" +#include "bli_scal2jros.h" + +// io +#include "bli_scal2ios.h" +#include "bli_scal2jios.h" + +// rpi +#include "bli_scal2rpis.h" +#include "bli_scal2jrpis.h" + + // -- Miscellaneous macros -- diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 3ee95b16a..c45c3f120 100644 --- a/frame/include/bli_type_defs.h +++ 
b/frame/include/bli_type_defs.h @@ -209,6 +209,12 @@ typedef dcomplex f77_dcomplex; - 100111: packed by 4m column panels - 101010: packed by 3m row panels - 101011: packed by 3m column panels + - 110110: packed real-only row panels + - 110111: packed real-only column panels + - 111010: packed imag-only row panels + - 111011: packed imag-only column panels + - 111110: packed real+imag row panels + - 111111: packed real+imag column panels 22 Packed panel order if upper-stored - 0 == forward order if upper - 1 == reverse order if upper @@ -309,6 +315,9 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_4M ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_3M ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC BLIS_PACK_BIT #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) @@ -318,6 +327,12 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_PACKED_COL_PANELS_4M ( BLIS_PACK_BIT | BLIS_BITVAL_4M | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_3M ( BLIS_PACK_BIT | BLIS_BITVAL_3M | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_3M ( BLIS_PACK_BIT | BLIS_BITVAL_3M | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | 
BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 @@ -427,6 +442,12 @@ typedef enum BLIS_PACKED_COL_PANELS_4M = BLIS_BITVAL_PACKED_COL_PANELS_4M, BLIS_PACKED_ROW_PANELS_3M = BLIS_BITVAL_PACKED_ROW_PANELS_3M, BLIS_PACKED_COL_PANELS_3M = BLIS_BITVAL_PACKED_COL_PANELS_3M, + BLIS_PACKED_ROW_PANELS_RO = BLIS_BITVAL_PACKED_ROW_PANELS_RO, + BLIS_PACKED_COL_PANELS_RO = BLIS_BITVAL_PACKED_COL_PANELS_RO, + BLIS_PACKED_ROW_PANELS_IO = BLIS_BITVAL_PACKED_ROW_PANELS_IO, + BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO, + BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI, + BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI, } pack_t; @@ -453,6 +474,17 @@ typedef enum } packbuf_t; +// -- micro-kernel implementation type -- + +typedef enum +{ + BLIS_REFERENCE_UKERNEL = 0, + BLIS_VIRTUAL4M_UKERNEL, + BLIS_VIRTUAL3M_UKERNEL, + BLIS_OPTIMIZED_UKERNEL, +} kimpl_t; + + // // -- BLIS misc. structure types ----------------------------------------------- // diff --git a/frame/include/blis.h b/frame/include/blis.h index 446ad27ad..591986893 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -84,6 +84,7 @@ extern "C" { #include "bli_kernel_macro_defs.h" #include "bli_kernel_4m_macro_defs.h" #include "bli_kernel_3m_macro_defs.h" +#include "bli_kernel_rih_macro_defs.h" #include "bli_kernel_post_macro_defs.h" #include "bli_kernel_prototypes.h" @@ -113,6 +114,9 @@ extern "C" { #include "bli_info.h" #include "bli_getopt.h" #include "bli_4m.h" +#include "bli_3m.h" +#include "bli_4mh.h" +#include "bli_3mh.h" // Control tree definitions. 
#include "bli_cntl.h" diff --git a/frame/include/level0/io/bli_scal2ios.h b/frame/include/level0/io/bli_scal2ios.h new file mode 100644 index 000000000..268f8ebf0 --- /dev/null +++ b/frame/include/level0/io/bli_scal2ios.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL2IOS_H +#define BLIS_SCAL2IOS_H + +// scal2ios: yi := imaginary part of ( a * x ); the sc/dz variants read only the real part of a. + +#define bli_cscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_cimag(a) * bli_creal(x) + bli_creal(a) * bli_cimag(x); \ +} + +#define bli_zscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_zimag(a) * bli_zreal(x) + bli_zreal(a) * bli_zimag(x); \ +} + +#define bli_scscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_creal(a) * bli_cimag(x); \ +} + +#define bli_dzscal2ios( a, x, yi ) \ +{ \ + (yi) = bli_zreal(a) * bli_zimag(x); \ +} + +#endif + diff --git a/frame/include/level0/io/bli_scal2jios.h b/frame/include/level0/io/bli_scal2jios.h new file mode 100644 index 000000000..55038b5d3 --- /dev/null +++ b/frame/include/level0/io/bli_scal2jios.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2JIOS_H +#define BLIS_SCAL2JIOS_H + +// scal2jios + +#define bli_cscal2jios( a, x, yi ) \ +{ \ + (yi) = bli_cimag(a) * bli_creal(x) - bli_creal(a) * bli_cimag(x); \ +} + +#define bli_zscal2jios( a, x, yi ) \ +{ \ + (yi) = bli_zimag(a) * bli_zreal(x) - bli_zreal(a) * bli_zimag(x); \ +} + + +#endif + diff --git a/frame/include/level0/rih/bli_scal2rihs_mxn_diag.h b/frame/include/level0/rih/bli_scal2rihs_mxn_diag.h new file mode 100644 index 000000000..39f270820 --- /dev/null +++ b/frame/include/level0/rih/bli_scal2rihs_mxn_diag.h @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2RIHS_MXN_DIAG_H +#define BLIS_SCAL2RIHS_MXN_DIAG_H + +// scal2rihs_mxn_diag + +#define bli_cscscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. */ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal2ros( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal2ios( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal2rpis( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#define bli_zdzscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. 
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal2ros( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal2ios( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal2rpis( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/rih/bli_scal2rihs_mxn_uplo.h b/frame/include/level0/rih/bli_scal2rihs_mxn_uplo.h new file mode 100644 index 000000000..38423dfcb --- /dev/null +++ b/frame/include/level0/rih/bli_scal2rihs_mxn_uplo.h @@ -0,0 +1,348 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2RIHS_MXN_UPLO_H +#define BLIS_SCAL2RIHS_MXN_UPLO_H + +// scal2rihs_mxn_uplo + +#define bli_cscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle ro, io, and rpi separately. */ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2jios( 
*(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2jios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ +} + +#define bli_zscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle ro, io, and rpi separately. 
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2jros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2ros( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2jios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2jios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2ios( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* 
if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2jrpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2rpis( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_r + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/rih/bli_setrihs_mxn_diag.h b/frame/include/level0/rih/bli_setrihs_mxn_diag.h new file mode 100644 index 000000000..3fe2a8215 --- /dev/null +++ b/frame/include/level0/rih/bli_setrihs_mxn_diag.h @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SETRIHS_MXN_DIAG_H +#define BLIS_SETRIHS_MXN_DIAG_H + +// setrihs_mxn_diag: write the scalar *(a) to the first min(m,n) diagonal elements of the real-valued panel y_r -- real part for ro, imaginary part for io, and (presumably, via bli_?add3s) real+imaginary for rpi. + +#define bli_csetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ +{ \ + const float a_r = bli_creal( *(a) ); \ + const float a_i = bli_cimag( *(a) ); \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. 
*/ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scopys( (a_r), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scopys( (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_sadd3s( (a_r), \ + (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#define bli_zsetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ +{ \ + const double a_r = bli_zreal( *(a) ); \ + const double a_i = bli_zimag( *(a) ); \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle ro, io, and rpi separately. */ \ + if ( bli_is_ro_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dcopys( (a_r), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else if ( bli_is_io_packed( schema ) ) \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dcopys( (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_rpi_packed( schema ) ) */ \ + { \ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dadd3s( (a_r), \ + (a_i), \ + *(y_r + i*rs_y + i*cs_y) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/ro/bli_scal2jros.h b/frame/include/level0/ro/bli_scal2jros.h new file mode 100644 index 000000000..40cc87044 --- /dev/null +++ b/frame/include/level0/ro/bli_scal2jros.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2JROS_H +#define BLIS_SCAL2JROS_H + +// scal2jros + +#define bli_cscal2jros( a, x, yr ) \ +{ \ + (yr) = bli_creal(a) * bli_creal(x) + bli_cimag(a) * bli_cimag(x); \ +} + +#define bli_zscal2jros( a, x, yr ) \ +{ \ + (yr) = bli_zreal(a) * bli_zreal(x) + bli_zimag(a) * bli_zimag(x); \ +} + +#endif + diff --git a/frame/include/level0/ro/bli_scal2ros.h b/frame/include/level0/ro/bli_scal2ros.h new file mode 100644 index 000000000..95b48c198 --- /dev/null +++ b/frame/include/level0/ro/bli_scal2ros.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL2ROS_H +#define BLIS_SCAL2ROS_H + +// scal2ros: yr := real part of ( a * x ); the sc/dz variants multiply only the real parts of a and x. + +#define bli_cscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_creal(a) * bli_creal(x) - bli_cimag(a) * bli_cimag(x); \ +} + +#define bli_zscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_zreal(a) * bli_zreal(x) - bli_zimag(a) * bli_zimag(x); \ +} + +#define bli_scscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_creal(a) * bli_creal(x); \ +} + +#define bli_dzscal2ros( a, x, yr ) \ +{ \ + (yr) = bli_zreal(a) * bli_zreal(x); \ +} + + +#endif + diff --git a/frame/include/level0/rpi/bli_scal2jrpis.h b/frame/include/level0/rpi/bli_scal2jrpis.h new file mode 100644 index 000000000..bf930ad3f --- /dev/null +++ b/frame/include/level0/rpi/bli_scal2jrpis.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2JRPIS_H +#define BLIS_SCAL2JRPIS_H + +// scal2jrpis + +#define bli_cscal2jrpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \ + (bli_cimag(a)-bli_creal(a)) * bli_cimag(x); \ +} + +#define bli_zscal2jrpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \ + (bli_zimag(a)-bli_zreal(a)) * bli_zimag(x); \ +} + +#endif + diff --git a/frame/include/level0/rpi/bli_scal2rpis.h b/frame/include/level0/rpi/bli_scal2rpis.h new file mode 100644 index 000000000..6c4ee8857 --- /dev/null +++ b/frame/include/level0/rpi/bli_scal2rpis.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2RPIS_H +#define BLIS_SCAL2RPIS_H + +// scal2rpis + +#define bli_cscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \ + (bli_creal(a)-bli_cimag(a)) * bli_cimag(x); \ +} + +#define bli_zscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \ + (bli_zreal(a)-bli_zimag(a)) * bli_zimag(x); \ +} + +#define bli_scscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = bli_creal(a) * bli_creal(x) + \ + bli_creal(a) * bli_cimag(x); \ +} + +#define bli_dzscal2rpis( a, x, yrpi ) \ +{ \ + (yrpi) = bli_zreal(a) * bli_zreal(x) + \ + bli_zreal(a) * bli_zimag(x); \ +} + + +#endif + diff --git a/testsuite/input.general b/testsuite/input.general index 0f52b46c7..df6b80442 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -23,6 +23,11 @@ sdcz # Datatype(s) to test: 100 # Problem size: first to test 300 # Problem size: maximum to test 100 # Problem size: increment between experiments + # Complex level-3 
implementations +0 # 3mh ('1' = enable; '0' = disable) +0 # 3m ('1' = enable; '0' = disable) +0 # 4mh ('1' = enable; '0' = disable) +1 # 4m ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking i # Reaction to test failure: diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 0fadba172..7e749582a 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -51,6 +51,12 @@ char libblis_test_store_chars[ NUM_OPERAND_TYPES ][ MAX_STORE_VALS_PER_TYPE + 1 char libblis_test_param_chars[ NUM_PARAM_TYPES ][ MAX_PARAM_VALS_PER_TYPE + 1 ]; +//#define _3MH +//#define _4MH +//#define _3M +//#define _4M + + int main( int argc, char** argv ) { test_params_t params; @@ -59,6 +65,36 @@ int main( int argc, char** argv ) // Initialize libblis. bli_init(); + // Experimental. Set the complex implementations. +/* +#if defined _3MH + bli_3mh_enable(); + bli_3m_enable(); + bli_4mh_disable(); + bli_4m_enable(); +#elif defined _3M + bli_3mh_disable(); + bli_3m_enable(); + bli_4mh_enable(); + bli_4m_enable(); +#elif defined _4MH + bli_3mh_disable(); + bli_3m_disable(); + bli_4mh_enable(); + bli_4m_enable(); +#elif defined _4M + bli_3mh_disable(); + bli_3m_disable(); + bli_4mh_disable(); + bli_4m_enable(); +#else + bli_3mh_disable(); + bli_3m_disable(); + bli_4mh_disable(); + bli_4m_enable(); +#endif +*/ + // Initialize some strings. libblis_test_init_strings(); @@ -377,6 +413,22 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->p_inc) ); + // Read whether to enable 3mh. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_3mh) ); + + // Read whether to enable 3m. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_3m) ); + + // Read whether to enable 4mh. 
+ libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_4mh) ); + + // Read whether to enable 4m. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->enable_4m) ); + // Read the requested error-checking level. libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->error_checking_level) ); @@ -404,6 +456,16 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params // Close the file. fclose( input_stream ); + // Enable/disable the alternative complex implementations. + if ( params->enable_3mh ) bli_3mh_enable(); + else bli_3mh_disable(); + if ( params->enable_3m ) bli_3m_enable(); + else bli_3m_disable(); + if ( params->enable_4mh ) bli_4mh_enable(); + else bli_4mh_disable(); + if ( params->enable_4m ) bli_4m_enable(); + else bli_4m_disable(); + // Output the parameter struct. libblis_test_output_params_struct( stdout, params ); } @@ -595,108 +657,185 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS kernel header ---\n" ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); - libblis_test_fprintf_c( os, " sizes (bytes) %5u %5u %5u %5u\n", sizeof(float), + libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); + libblis_test_fprintf_c( os, " sizes (bytes) %7u %7u %7u %7u\n", sizeof(float), sizeof(double), sizeof(scomplex), sizeof(dcomplex) ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "complex via 4m\n" ); - libblis_test_fprintf_c( os, " enabled for scomplex? %d\n", ( int )bli_info_get_enable_scomplex_via_4m() ); - libblis_test_fprintf_c( os, " enabled for dcomplex? 
%d\n", ( int )bli_info_get_enable_dcomplex_via_4m() ); - libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 def cache blkszes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 def cache blkszes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_mc_s(), ( int )bli_info_get_default_mc_d(), ( int )bli_info_get_default_mc_c(), ( int )bli_info_get_default_mc_z() ); - libblis_test_fprintf_c( os, " k dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " k dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_kc_s(), ( int )bli_info_get_default_kc_d(), ( int )bli_info_get_default_kc_c(), ( int )bli_info_get_default_kc_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_nc_s(), ( int )bli_info_get_default_nc_d(), ( int )bli_info_get_default_nc_c(), ( int )bli_info_get_default_nc_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 max cache blkszes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 max cache blkszes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_maximum_mc_s(), ( int )bli_info_get_maximum_mc_d(), ( int )bli_info_get_maximum_mc_c(), ( int )bli_info_get_maximum_mc_z() ); - libblis_test_fprintf_c( os, " k dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " k dimension %7d %7d %7d %7d\n", ( int )bli_info_get_maximum_kc_s(), ( int )bli_info_get_maximum_kc_d(), ( int )bli_info_get_maximum_kc_c(), ( int )bli_info_get_maximum_kc_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_maximum_nc_s(), ( int )bli_info_get_maximum_nc_d(), ( int 
)bli_info_get_maximum_nc_c(), ( int )bli_info_get_maximum_nc_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 register blocksizes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 register blocksizes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_mr_s(), ( int )bli_info_get_default_mr_d(), ( int )bli_info_get_default_mr_c(), ( int )bli_info_get_default_mr_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_nr_s(), ( int )bli_info_get_default_nr_d(), ( int )bli_info_get_default_nr_c(), ( int )bli_info_get_default_nr_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-3 pack register blksz s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-3 pack register blksz s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_packdim_mr_s(), ( int )bli_info_get_packdim_mr_d(), ( int )bli_info_get_packdim_mr_c(), ( int )bli_info_get_packdim_mr_z() ); - libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_packdim_nr_s(), ( int )bli_info_get_packdim_nr_d(), ( int )bli_info_get_packdim_nr_c(), ( int )bli_info_get_packdim_nr_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" ); - libblis_test_fprintf_c( os, " m dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" ); + libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_l2_mc_s(), ( int )bli_info_get_default_l2_mc_d(), ( int )bli_info_get_default_l2_mc_c(), ( int )bli_info_get_default_l2_mc_z() ); - 
libblis_test_fprintf_c( os, " n dimension %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " n dimension %7d %7d %7d %7d\n", ( int )bli_info_get_default_l2_nc_s(), ( int )bli_info_get_default_l2_nc_d(), ( int )bli_info_get_default_l2_nc_c(), ( int )bli_info_get_default_l2_nc_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "level-1f fusing factors s d c z \n" ); - libblis_test_fprintf_c( os, " default %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, "level-1f fusing factors s d c z \n" ); + libblis_test_fprintf_c( os, " default %7d %7d %7d %7d\n", ( int )bli_info_get_default_l1f_fuse_fac_s(), ( int )bli_info_get_default_l1f_fuse_fac_d(), ( int )bli_info_get_default_l1f_fuse_fac_c(), ( int )bli_info_get_default_l1f_fuse_fac_z() ); - libblis_test_fprintf_c( os, " axpyf %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " axpyf %7d %7d %7d %7d\n", ( int )bli_info_get_axpyf_fuse_fac_s(), ( int )bli_info_get_axpyf_fuse_fac_d(), ( int )bli_info_get_axpyf_fuse_fac_c(), ( int )bli_info_get_axpyf_fuse_fac_z() ); - libblis_test_fprintf_c( os, " dotxf %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " dotxf %7d %7d %7d %7d\n", ( int )bli_info_get_dotxf_fuse_fac_s(), ( int )bli_info_get_dotxf_fuse_fac_d(), ( int )bli_info_get_dotxf_fuse_fac_c(), ( int )bli_info_get_dotxf_fuse_fac_z() ); - libblis_test_fprintf_c( os, " dotxaxpyf %5d %5d %5d %5d\n", + libblis_test_fprintf_c( os, " dotxaxpyf %7d %7d %7d %7d\n", ( int )bli_info_get_dotxaxpyf_fuse_fac_s(), ( int )bli_info_get_dotxaxpyf_fuse_fac_d(), ( int )bli_info_get_dotxaxpyf_fuse_fac_c(), ( int )bli_info_get_dotxaxpyf_fuse_fac_z() ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "micro-kernel types s d c z\n" ); + libblis_test_fprintf_c( os, " gemm %7s %7s %7s %7s\n", + bli_info_get_gemm_ukr_type( BLIS_FLOAT ), + bli_info_get_gemm_ukr_type( BLIS_DOUBLE ), + bli_info_get_gemm_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_gemm_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " 
gemmtrsm_l %7s %7s %7s %7s\n", + bli_info_get_gemmtrsm_l_ukr_type( BLIS_FLOAT ), + bli_info_get_gemmtrsm_l_ukr_type( BLIS_DOUBLE ), + bli_info_get_gemmtrsm_l_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_gemmtrsm_l_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " gemmtrsm_u %7s %7s %7s %7s\n", + bli_info_get_gemmtrsm_u_ukr_type( BLIS_FLOAT ), + bli_info_get_gemmtrsm_u_ukr_type( BLIS_DOUBLE ), + bli_info_get_gemmtrsm_u_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_gemmtrsm_u_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trsm_l %7s %7s %7s %7s\n", + bli_info_get_trsm_l_ukr_type( BLIS_FLOAT ), + bli_info_get_trsm_l_ukr_type( BLIS_DOUBLE ), + bli_info_get_trsm_l_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_trsm_l_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trsm_u %7s %7s %7s %7s\n", + bli_info_get_trsm_u_ukr_type( BLIS_FLOAT ), + bli_info_get_trsm_u_ukr_type( BLIS_DOUBLE ), + bli_info_get_trsm_u_ukr_type( BLIS_SCOMPLEX ), + bli_info_get_trsm_u_ukr_type( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "--- BLIS implementation details ---\n" ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "level-3 implementations s d c z\n" ); + libblis_test_fprintf_c( os, " gemm %7s %7s %7s %7s\n", + bli_info_get_gemm_impl_string( BLIS_FLOAT ), + bli_info_get_gemm_impl_string( BLIS_DOUBLE ), + bli_info_get_gemm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_gemm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " hemm %7s %7s %7s %7s\n", + bli_info_get_hemm_impl_string( BLIS_FLOAT ), + bli_info_get_hemm_impl_string( BLIS_DOUBLE ), + bli_info_get_hemm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_hemm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " herk %7s %7s %7s %7s\n", + bli_info_get_herk_impl_string( BLIS_FLOAT ), + bli_info_get_herk_impl_string( BLIS_DOUBLE ), + bli_info_get_herk_impl_string( BLIS_SCOMPLEX ), + bli_info_get_herk_impl_string( BLIS_DCOMPLEX ) ); + 
libblis_test_fprintf_c( os, " her2k %7s %7s %7s %7s\n", + bli_info_get_her2k_impl_string( BLIS_FLOAT ), + bli_info_get_her2k_impl_string( BLIS_DOUBLE ), + bli_info_get_her2k_impl_string( BLIS_SCOMPLEX ), + bli_info_get_her2k_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " symm %7s %7s %7s %7s\n", + bli_info_get_symm_impl_string( BLIS_FLOAT ), + bli_info_get_symm_impl_string( BLIS_DOUBLE ), + bli_info_get_symm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_symm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " syrk %7s %7s %7s %7s\n", + bli_info_get_syrk_impl_string( BLIS_FLOAT ), + bli_info_get_syrk_impl_string( BLIS_DOUBLE ), + bli_info_get_syrk_impl_string( BLIS_SCOMPLEX ), + bli_info_get_syrk_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " syr2k %7s %7s %7s %7s\n", + bli_info_get_syr2k_impl_string( BLIS_FLOAT ), + bli_info_get_syr2k_impl_string( BLIS_DOUBLE ), + bli_info_get_syr2k_impl_string( BLIS_SCOMPLEX ), + bli_info_get_syr2k_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trmm %7s %7s %7s %7s\n", + bli_info_get_trmm_impl_string( BLIS_FLOAT ), + bli_info_get_trmm_impl_string( BLIS_DOUBLE ), + bli_info_get_trmm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_trmm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trmm3 %7s %7s %7s %7s\n", + bli_info_get_trmm3_impl_string( BLIS_FLOAT ), + bli_info_get_trmm3_impl_string( BLIS_DOUBLE ), + bli_info_get_trmm3_impl_string( BLIS_SCOMPLEX ), + bli_info_get_trmm3_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, " trsm %7s %7s %7s %7s\n", + bli_info_get_trsm_impl_string( BLIS_FLOAT ), + bli_info_get_trsm_impl_string( BLIS_DOUBLE ), + bli_info_get_trsm_impl_string( BLIS_SCOMPLEX ), + bli_info_get_trsm_impl_string( BLIS_DCOMPLEX ) ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf( os, "\n" ); // Output the contents of the param struct. 
@@ -719,6 +858,10 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "problem size: first to test %u\n", params->p_first ); libblis_test_fprintf_c( os, "problem size: max to test %u\n", params->p_max ); libblis_test_fprintf_c( os, "problem size increment %u\n", params->p_inc ); + libblis_test_fprintf_c( os, "enable 3mh? %u\n", params->enable_3mh ); + libblis_test_fprintf_c( os, "enable 3m? %u\n", params->enable_3m ); + libblis_test_fprintf_c( os, "enable 4mh? %u\n", params->enable_4mh ); + libblis_test_fprintf_c( os, "enable 4m? %u\n", params->enable_4m ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); libblis_test_fprintf_c( os, "output in matlab format? %u\n", params->output_matlab_format ); diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 1609b0c66..31dbf429d 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -160,6 +160,10 @@ typedef struct unsigned int p_first; unsigned int p_max; unsigned int p_inc; + unsigned int enable_3mh; + unsigned int enable_3m; + unsigned int enable_4mh; + unsigned int enable_4m; char reaction_to_failure; unsigned int output_matlab_format; unsigned int output_files;