From c2c91e09b4893cb81314774557f728a95080f81e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Oct 2016 21:15:26 -0700 Subject: [PATCH 01/23] never use libm with Intel compilers Intel compilers include a highly optimized math library (libimf) that should be used instead of GNU libm. yes, this change is for ALL targets, including those that are not supported by the Intel compiler. there is no harm in doing this, and it is future-proof in the event that the Intel compilers support other architectures. --- config/armv7a/make_defs.mk | 2 ++ config/armv8a/make_defs.mk | 2 ++ config/bulldozer/make_defs.mk | 2 ++ config/carrizo/make_defs.mk | 2 ++ config/cortex-a15/make_defs.mk | 2 ++ config/cortex-a9/make_defs.mk | 2 ++ config/dunnington/make_defs.mk | 2 ++ config/haswell/make_defs.mk | 2 ++ config/knl/make_defs.mk | 6 +++++- config/loongson3a/make_defs.mk | 2 ++ config/mic/make_defs.mk | 4 ++++ config/piledriver/make_defs.mk | 2 ++ config/pnacl/make_defs.mk | 2 ++ config/power7/make_defs.mk | 2 ++ config/reference/make_defs.mk | 2 ++ config/sandybridge/make_defs.mk | 2 ++ config/template/make_defs.mk | 2 ++ 17 files changed, 39 insertions(+), 1 deletion(-) diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 40b6c179a..2b4125f3a 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 654a9ff92..3dc88e913 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 78f47d908..90d14d56b 100644 --- a/config/bulldozer/make_defs.mk +++ 
b/config/bulldozer/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index aaecb2d2c..fd6b84cb0 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index ec5360da4..52ab7a7c9 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index ec5360da4..52ab7a7c9 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index fed36506b..f8faa3b5b 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 1640a40b9..4c144846d 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 
e0385e6d5..6a750223d 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -95,7 +95,11 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -lmemkind +ifeq ($(CC_VENDOR),icc) +LDFLAGS := -lmemkind +else +LDFLAGS := -lmemkind -lm +endif diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index bb1248d37..2c7e9c58c 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 21af9e2e2..339112570 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -77,7 +77,11 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifeq ($(CC_VENDOR),icc) +LDFLAGS := -mmic +else LDFLAGS := -mmic -lm +endif diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index e241789dd..db46bd124 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index e957cf429..9e2a3b4c5 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -63,7 +63,9 @@ ARFLAGS := rcs # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif # --- Determine the finalizer and related flags --- FINALIZER := pnacl-finalize diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index d03857a44..da4e5bff1 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the 
linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 736e5ee4d..f2f86ba07 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index 082a73f92..0a779b188 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 37de32882..98f3222e0 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif From 126482a3b609b9ad7026ba348f6c4bf6a29be8a1 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 25 Nov 2016 18:29:49 -0600 Subject: [PATCH 02/23] Implemented the 1m method. Details: - Implemented the 1m method for inducing complex domain matrix multiplication. 1m support has been added to all level-3 operations, including trsm, and is now the default induced method when native complex domain gemm microkernels are omitted from the configuration. - Updated _cntx_init() operations to take a datatype parameter. 
This was needed for the corresponding function for 1m (because 1m requires us to choose between column-oriented and row-oriented execution, which requires us to query the context for the storage preference of the gemm microkernel, which requires knowing the datatype) but I decided that it made sense for consistency to add the parameter to all other cntx initialization functions as well, even though those functions don't use the parameter. - Updated bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs() to take a second scalar for each blocksize entry. The semantic meaning of the two scalars now is that the first will scale the default blocksize while the second will scale the maximum blocksize. This allows scaling the two independently, and was needed to support 1m, which requires scaling for a register blocksize but not the register storage blocksize (i.e., "packdim") analogue. - Deprecated bli_blksz_reduce_dt_to() and defined two new functions, bli_blksz_reduce_def_to() and bli_blksz_reduce_max_to(), for reducing default and maximum blocksizes to some desired blocksize multiple. These functions are needed in the updated definitions of bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs(). - Added support for the 1e and 1r packing schemas to packm, including 1e/1r packing kernels. - Added a minor optimization to bli_gemm_ker_var2() that allows, under certain circumstances (specifically, real domain beta and row- or column-stored matrix C), the real domain macrokernel and microkernel to be called directly, rather than using the virtual microkernel via the complex domain macrokernel, which carries a slight additional amount of overhead. - Added 1m support to the testsuite. - Added 1m support to Makefile and runme.sh in test/3m4m. Also simplified some code in the test_gemm.c driver.
--- config/haswell/bli_kernel.h | 44 +- frame/1/bli_l1v_cntx.c | 22 +- frame/1/bli_l1v_cntx.h | 2 +- frame/1/bli_l1v_tapi.c | 20 +- frame/1d/bli_l1d_cntx.c | 4 +- frame/1d/bli_l1d_cntx.h | 2 +- frame/1d/bli_l1d_tapi.c | 10 +- frame/1f/bli_l1f_cntx.c | 20 +- frame/1f/bli_l1f_cntx.h | 2 +- frame/1f/bli_l1f_tapi.c | 10 +- frame/1m/bli_l1m_cntx.c | 10 +- frame/1m/bli_l1m_cntx.h | 2 +- frame/1m/bli_l1m_ft.h | 1 + frame/1m/bli_l1m_tapi.c | 25 +- frame/1m/packm/bli_packm.h | 2 + frame/1m/packm/bli_packm_blk_var1.c | 6 + frame/1m/packm/bli_packm_cntx.c | 2 +- frame/1m/packm/bli_packm_cntx.h | 2 +- frame/1m/packm/bli_packm_cxk_1er.c | 489 +++++++ frame/1m/packm/bli_packm_cxk_1er.h | 55 + frame/1m/packm/bli_packm_struc_cxk_1er.c | 610 ++++++++ frame/1m/packm/bli_packm_struc_cxk_1er.h | 117 ++ .../1m/packm/ukernels/bli_packm_cxk_1e_ref.c | 1099 +++++++++++++++ .../1m/packm/ukernels/bli_packm_cxk_1e_ref.h | 62 + .../1m/packm/ukernels/bli_packm_cxk_1r_ref.c | 1254 +++++++++++++++++ .../1m/packm/ukernels/bli_packm_cxk_1r_ref.h | 61 + frame/2/bli_l2_cntx.c | 50 +- frame/2/bli_l2_cntx.h | 2 +- frame/2/bli_l2_tapi.c | 39 +- frame/3/bli_l3_cntx.c | 8 +- frame/3/bli_l3_cntx.h | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 20 + frame/base/bli_blksz.c | 61 + frame/base/bli_blksz.h | 25 + frame/base/bli_cntx.c | 147 +- frame/base/bli_cntx.h | 8 +- frame/base/bli_gks.c | 173 ++- frame/base/bli_memsys.c | 6 +- frame/include/bli_param_macro_defs.h | 13 + frame/include/bli_scalar_macro_defs.h | 37 + frame/include/bli_type_defs.h | 17 +- frame/include/level0/1e/bli_copy1es.h | 53 + frame/include/level0/1e/bli_copyj1es.h | 53 + frame/include/level0/1e/bli_invert1es.h | 53 + frame/include/level0/1e/bli_scal1es.h | 53 + frame/include/level0/1e/bli_scal21es.h | 65 + frame/include/level0/1e/bli_scal2j1es.h | 65 + .../level0/1m/bli_invert1ms_mxn_diag.h | 126 ++ frame/include/level0/1m/bli_scal1ms_mxn.h | 124 ++ .../include/level0/1m/bli_scal21ms_mxn_diag.h | 126 ++ 
.../include/level0/1m/bli_scal21ms_mxn_uplo.h | 296 ++++ frame/include/level0/1m/bli_set1ms_mxn.h | 164 +++ frame/include/level0/1m/bli_set1ms_mxn_diag.h | 130 ++ frame/include/level0/1m/bli_set1ms_mxn_uplo.h | 198 +++ .../include/level0/1m/bli_seti01ms_mxn_diag.h | 114 ++ frame/include/level0/1r/bli_copy1rs.h | 51 + frame/include/level0/1r/bli_copyj1rs.h | 51 + frame/include/level0/1r/bli_invert1rs.h | 43 + frame/include/level0/1r/bli_scal1rs.h | 61 + frame/include/level0/1r/bli_scal21rs.h | 61 + frame/include/level0/1r/bli_scal2j1rs.h | 61 + frame/ind/bli_ind.c | 7 +- frame/ind/bli_ind.h | 3 + frame/ind/bli_l3_ind.c | 4 + frame/ind/cntx/bli_gemmind_cntx.c | 290 ++-- frame/ind/cntx/bli_gemmind_cntx.h | 45 +- frame/ind/cntx/bli_trsmind_cntx.c | 127 +- frame/ind/cntx/bli_trsmind_cntx.h | 26 +- frame/ind/include/bli_kernel_1m_macro_defs.h | 107 ++ frame/ind/include/bli_kernel_ind_macro_defs.h | 2 + .../include/bli_kernel_ind_pre_macro_defs.h | 29 + frame/ind/include/bli_packm_1er_macro_defs.h | 241 ++++ frame/ind/include/bli_packm_3mis_macro_defs.h | 3 - frame/ind/include/bli_packm_4mi_macro_defs.h | 3 - .../include/bli_packm_ind_pre_macro_defs.h | 97 ++ frame/ind/include/bli_packm_rih_macro_defs.h | 3 - frame/ind/misc/bli_l3_ind_opt.h | 78 + ...li_l3_3m4m_oapi.c => bli_l3_3m4m1m_oapi.c} | 39 +- frame/ind/oapi/bli_l3_ind_oapi.h | 1 + frame/ind/oapi/bli_l3_nat_oapi.c | 15 +- frame/ind/tapi/bli_l3_ind_tapi.c | 10 + frame/ind/tapi/bli_l3_ind_tapi.h | 10 + frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c | 179 +++ frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h | 1 + .../ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c | 244 ++++ .../ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h | 3 + frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c | 448 ++++++ .../ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h | 3 + .../trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c | 240 ---- .../trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c | 222 --- .../trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c | 215 --- .../trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c | 203 
--- .../ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c | 160 --- .../ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c | 160 --- .../ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c | 169 --- .../ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c | 152 -- frame/util/bli_util_tapi.c | 14 +- test/3m4m/Makefile | 33 +- test/3m4m/runme.sh | 6 +- test/3m4m/test_gemm.c | 24 +- testsuite/input.general | 5 +- testsuite/src/test_axpy2v.c | 2 +- testsuite/src/test_axpyf.c | 2 +- testsuite/src/test_dotaxpyv.c | 2 +- testsuite/src/test_dotxaxpyf.c | 2 +- testsuite/src/test_dotxf.c | 2 +- testsuite/src/test_gemm.c | 2 - testsuite/src/test_gemm_ukr.c | 2 +- testsuite/src/test_gemmtrsm_ukr.c | 2 +- testsuite/src/test_libblis.c | 68 +- testsuite/src/test_trsm_ukr.c | 2 +- version | 2 +- 112 files changed, 8158 insertions(+), 2040 deletions(-) create mode 100644 frame/1m/packm/bli_packm_cxk_1er.c create mode 100644 frame/1m/packm/bli_packm_cxk_1er.h create mode 100644 frame/1m/packm/bli_packm_struc_cxk_1er.c create mode 100644 frame/1m/packm/bli_packm_struc_cxk_1er.h create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h create mode 100644 frame/include/level0/1e/bli_copy1es.h create mode 100644 frame/include/level0/1e/bli_copyj1es.h create mode 100644 frame/include/level0/1e/bli_invert1es.h create mode 100644 frame/include/level0/1e/bli_scal1es.h create mode 100644 frame/include/level0/1e/bli_scal21es.h create mode 100644 frame/include/level0/1e/bli_scal2j1es.h create mode 100644 frame/include/level0/1m/bli_invert1ms_mxn_diag.h create mode 100644 frame/include/level0/1m/bli_scal1ms_mxn.h create mode 100644 frame/include/level0/1m/bli_scal21ms_mxn_diag.h create mode 100644 frame/include/level0/1m/bli_scal21ms_mxn_uplo.h create mode 100644 frame/include/level0/1m/bli_set1ms_mxn.h create mode 100644 
frame/include/level0/1m/bli_set1ms_mxn_diag.h create mode 100644 frame/include/level0/1m/bli_set1ms_mxn_uplo.h create mode 100644 frame/include/level0/1m/bli_seti01ms_mxn_diag.h create mode 100644 frame/include/level0/1r/bli_copy1rs.h create mode 100644 frame/include/level0/1r/bli_copyj1rs.h create mode 100644 frame/include/level0/1r/bli_invert1rs.h create mode 100644 frame/include/level0/1r/bli_scal1rs.h create mode 100644 frame/include/level0/1r/bli_scal21rs.h create mode 100644 frame/include/level0/1r/bli_scal2j1rs.h create mode 100644 frame/ind/include/bli_kernel_1m_macro_defs.h create mode 100644 frame/ind/include/bli_packm_1er_macro_defs.h create mode 100644 frame/ind/misc/bli_l3_ind_opt.h rename frame/ind/oapi/{bli_l3_3m4m_oapi.c => bli_l3_3m4m1m_oapi.c} (92%) create mode 100644 frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c create mode 100644 frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c create mode 100644 frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 6eee7c483..ce18dc266 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -51,17 +51,6 @@ // -- sgemm micro-kernel -- -#if 1 -#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 -#define BLIS_DEFAULT_MC_S 144 -#define BLIS_DEFAULT_KC_S 256 -#define BLIS_DEFAULT_NC_S 4080 -#define BLIS_DEFAULT_MR_S 6 -#define BLIS_DEFAULT_NR_S 16 - -#define 
BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS -#endif - #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 @@ -73,6 +62,17 @@ #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif +#if 1 +#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 +#define BLIS_DEFAULT_MC_S 144 +#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_NC_S 4080 +#define BLIS_DEFAULT_MR_S 6 +#define BLIS_DEFAULT_NR_S 16 + +#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 @@ -84,17 +84,6 @@ // -- dgemm micro-kernel -- -#if 1 -#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 -#define BLIS_DEFAULT_MC_D 72 -#define BLIS_DEFAULT_KC_D 256 -#define BLIS_DEFAULT_NC_D 4080 -#define BLIS_DEFAULT_MR_D 6 -#define BLIS_DEFAULT_NR_D 8 - -#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS -#endif - #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 @@ -106,6 +95,17 @@ #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif +#if 1 +#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 +#define BLIS_DEFAULT_MC_D 72 +#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_NC_D 4080 +#define BLIS_DEFAULT_MR_D 6 +#define BLIS_DEFAULT_NR_D 8 + +#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index bdbb0063f..149c20320 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -41,7 +41,7 @@ #undef GENFRONT #define GENFRONT( opname, kertype ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ @@ -68,15 +68,15 @@ GENFRONT( swapv, BLIS_SWAPV_KER ) #undef GENFRONT #define GENFRONT( opname, kertype, dep1, dep2, dep3, dep4 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ 
bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(dep1,_cntx_init)( cntx ); \ - PASTEMAC(dep2,_cntx_init)( cntx ); \ - PASTEMAC(dep3,_cntx_init)( cntx ); \ - PASTEMAC(dep4,_cntx_init)( cntx ); \ + PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep2,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep3,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep4,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -93,12 +93,12 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -116,13 +116,13 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv ) #undef GENFRONT #define GENFRONT( opname, kertype, dep1, dep2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(dep1,_cntx_init)( cntx ); \ - PASTEMAC(dep2,_cntx_init)( cntx ); \ + PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ diff --git a/frame/1/bli_l1v_cntx.h b/frame/1/bli_l1v_cntx.h index 95cd4a131..85756363b 100644 --- a/frame/1/bli_l1v_cntx.h +++ b/frame/1/bli_l1v_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addv ) diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 74a548eea..6abf002f5 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -53,7 +53,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -88,7 +88,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -123,7 +123,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -160,7 +160,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -198,7 +198,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, 
kerid, cntx_p ); \ \ @@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -274,7 +274,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -306,7 +306,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -340,7 +340,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -373,7 +373,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ diff --git a/frame/1d/bli_l1d_cntx.c b/frame/1d/bli_l1d_cntx.c index d285995b1..443dc20f7 100644 --- a/frame/1d/bli_l1d_cntx.c +++ b/frame/1d/bli_l1d_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ diff --git a/frame/1d/bli_l1d_cntx.h b/frame/1d/bli_l1d_cntx.h index 50db79738..e5ab92f51 100644 --- a/frame/1d/bli_l1d_cntx.h +++ b/frame/1d/bli_l1d_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addd ) diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index 5ef92603a..c8a67a138 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -90,7 +90,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -166,7 +166,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -222,7 +222,7 @@ void PASTEMAC(ch,opname) \ x1 = x + offx; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -276,7 +276,7 @@ void PASTEMAC(ch,opname) \ x1 = x + offx; \ \ /* Initialize a local context if the given context is NULL. 
*/ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -349,7 +349,7 @@ void PASTEMAC(ch,opname) \ incx = 2*incx; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(chr,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx_p ); \ diff --git a/frame/1f/bli_l1f_cntx.c b/frame/1f/bli_l1f_cntx.c index 379cbce7d..58ca4a07c 100644 --- a/frame/1f/bli_l1f_cntx.c +++ b/frame/1f/bli_l1f_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -63,13 +63,13 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ @@ -86,12 +86,12 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -114,13 +114,13 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ diff --git a/frame/1f/bli_l1f_cntx.h b/frame/1f/bli_l1f_cntx.h index 86b3af25f..bea56ca40 100644 --- a/frame/1f/bli_l1f_cntx.h +++ b/frame/1f/bli_l1f_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( axpy2v ) diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index a7efd91f8..8c77a2465 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -99,7 +99,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -142,7 +142,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -190,7 +190,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = 
bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ diff --git a/frame/1m/bli_l1m_cntx.c b/frame/1m/bli_l1m_cntx.c index 8569416fd..7eb3dcd4c 100644 --- a/frame/1m/bli_l1m_cntx.c +++ b/frame/1m/bli_l1m_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ @@ -64,13 +64,13 @@ GENFRONT( subm, subv ) #undef GENFRONT #define GENFRONT( opname, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ diff --git a/frame/1m/bli_l1m_cntx.h b/frame/1m/bli_l1m_cntx.h index 46524fa0b..79e0524e8 100644 --- a/frame/1m/bli_l1m_cntx.h +++ b/frame/1m/bli_l1m_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addm ) diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index 4361c9fac..2790bd006 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -91,6 +91,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ ); INSERT_GENTDEF( packm_cxk_ker ) +INSERT_GENTDEF( packm_cxk_1er_ker ) // packm_3mis_ker diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index c4dc5f9a8..13da24e59 
100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -54,12 +54,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -118,12 +119,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -187,7 +189,8 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ @@ -195,7 +198,7 @@ void PASTEMAC(ch,opname) \ if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -256,12 +259,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. 
*/ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If alpha is zero, then we set the output matrix to zero. This seemingly minor optimization is important because it will clear @@ -344,12 +348,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 7a44ecb9f..991487dfd 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -48,9 +48,11 @@ #include "bli_packm_struc_cxk_4mi.h" #include "bli_packm_struc_cxk_3mis.h" #include "bli_packm_struc_cxk_rih.h" +#include "bli_packm_struc_cxk_1er.h" #include "bli_packm_cxk.h" #include "bli_packm_cxk_4mi.h" #include "bli_packm_cxk_3mis.h" #include "bli_packm_cxk_rih.h" +#include "bli_packm_cxk_1er.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 4ce7b1504..055d30f1f 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -90,6 +90,12 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = // 0111 row/col panels: real+imaginary only { { NULL, bli_cpackm_struc_cxk_rih, NULL, bli_zpackm_struc_cxk_rih, } }, +// 1000 row/col panels: 1m-expanded (1e) + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, +// 1001 row/col panels: 1m-reordered (1r) + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, }; diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 4f570400a..2f4e0b030 100644 --- 
a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -39,7 +39,7 @@ // Define context initialization functions. // -void bli_packm_cntx_init( cntx_t* cntx ) +void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) { bli_cntx_obj_create( cntx ); diff --git a/frame/1m/packm/bli_packm_cntx.h b/frame/1m/packm/bli_packm_cntx.h index 1ab4df826..2210a777b 100644 --- a/frame/1m/packm/bli_packm_cntx.h +++ b/frame/1m/packm/bli_packm_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( packm ) diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/packm/bli_packm_cxk_1er.c new file mode 100644 index 000000000..352ae8353 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_1er.c @@ -0,0 +1,489 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_cxk_1er_ker_vft + +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 32 + +static FUNCPTR_T ftypes_e[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_1E_KERNEL, + NULL, BLIS_ZPACKM_2XK_1E_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_1E_KERNEL, + NULL, BLIS_ZPACKM_4XK_1E_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_1E_KERNEL, + NULL, BLIS_ZPACKM_6XK_1E_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_1E_KERNEL, + NULL, BLIS_ZPACKM_8XK_1E_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_1E_KERNEL, + NULL, BLIS_ZPACKM_10XK_1E_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* 
micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_1E_KERNEL, + NULL, BLIS_ZPACKM_12XK_1E_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_1E_KERNEL, + NULL, BLIS_ZPACKM_14XK_1E_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_1E_KERNEL, + NULL, BLIS_ZPACKM_16XK_1E_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 18 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 19 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 20 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 21 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 22 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 23 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 24 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 25 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 26 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 27 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 28 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 29 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 30 */ + { + NULL, BLIS_CPACKM_30XK_1E_KERNEL, + NULL, BLIS_ZPACKM_30XK_1E_KERNEL, + }, + /* micro-panel width = 31 */ + { + NULL, NULL, NULL, NULL, + }, +}; + +static FUNCPTR_T ftypes_r[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_1R_KERNEL, + NULL, BLIS_ZPACKM_2XK_1R_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, BLIS_CPACKM_3XK_1R_KERNEL, + NULL, BLIS_ZPACKM_3XK_1R_KERNEL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_1R_KERNEL, + 
NULL, BLIS_ZPACKM_4XK_1R_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_1R_KERNEL, + NULL, BLIS_ZPACKM_6XK_1R_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_1R_KERNEL, + NULL, BLIS_ZPACKM_8XK_1R_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_1R_KERNEL, + NULL, BLIS_ZPACKM_10XK_1R_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_1R_KERNEL, + NULL, BLIS_ZPACKM_12XK_1R_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_1R_KERNEL, + NULL, BLIS_ZPACKM_14XK_1R_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_1R_KERNEL, + NULL, BLIS_ZPACKM_16XK_1R_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 18 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 19 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 20 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 21 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 22 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 23 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 24 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 25 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 26 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 27 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 28 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 29 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 30 */ + { + NULL, 
BLIS_CPACKM_30XK_1R_KERNEL, + NULL, BLIS_ZPACKM_30XK_1R_KERNEL, + }, + /* micro-panel width = 31 */ + { + NULL, NULL, NULL, NULL, + }, +}; + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + num_t dt; \ + FUNCPTR_T f; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the micro-panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) \ + { \ + if ( bli_is_1e_packed( schema ) ) f = ftypes_e[panel_dim][dt]; \ + else /*( bli_is_1r_packed( schema ) )*/ f = ftypes_r[panel_dim][dt]; \ + } \ + else f = NULL; \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. */ \ + if ( f != NULL ) \ + { \ + f \ + ( \ + conja, \ + panel_len, \ + kappa, \ + a, inca, lda, \ + p, ldp \ + ); \ + } \ + else \ + { \ + dim_t i, j; \ +\ + if ( bli_is_1e_packed( schema ) ) \ + { \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict a_ri = ( ctype* )a; \ + ctype* restrict p_ri = ( ctype* )p; \ + ctype* restrict p_ir = ( ctype* )p + ldp/2; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage. 
*/ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \ + ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal2j1es)( *kappa_cast, \ + *alpha11_ri, \ + *pi11_ri, \ + *pi11_ir ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \ + ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal21es)( *kappa_cast, \ + *alpha11_ri, \ + *pi11_ri, \ + *pi11_ir ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + 1; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + ctype_r* restrict p_i = ( ctype_r* )p + ldp; \ + const dim_t inca2 = 2*inca; \ + const dim_t lda2 = 2*lda; \ + const dim_t ldp2 = 2*ldp; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage. 
*/ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \ +\ + PASTEMAC(ch,scal2jris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \ +\ + PASTEMAC(ch,scal2ris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_cxk_1er.h b/frame/1m/packm/bli_packm_cxk_1er.h new file mode 100644 index 000000000..bd87216d0 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_1er.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_packm_cxk_1e_ref.h" +#include "bli_packm_cxk_1r_ref.h" + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c new file mode 100644 index 000000000..6ed34808f --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -0,0 +1,610 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* Determine the dimensions and relative strides of the micro-panel + based on its pack schema. */ \ + if ( bli_is_col_packed( schema ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_row_packed( schema ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices. 
*/ \ + PASTEMAC(ch,packm_herm_cxk_1er) \ + ( \ + strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + schema, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp, \ + cntx \ + ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk_1er) \ + ( \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp, \ + cntx \ + ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases.
*/ \ + { \ + if ( m_panel != m_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t offm = m_panel; \ + dim_t offn = 0; \ + dim_t m_edge = m_panel_max - m_panel; \ + dim_t n_edge = n_panel_max; \ +\ + PASTEMAC(ch,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t offm = 0; \ + dim_t offn = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - n_panel; \ +\ + PASTEMAC(ch,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this micro-panel is an edge case in both panel dimension and + length, then it must be a bottom-right corner case, which + typically only happens for micro-panels being packed for trsm. + (It also happens for trmm if kr > 1.) Here, we set the part of + the diagonal that extends into the zero-padded region to + identity. This prevents NaNs and Infs from creeping into the + computation. If this code does execute for trmm, it is okay, + because those 1.0's that extend into the bottom-right region + end up getting multiplied by the 0.0's in the zero-padded region + of the other matrix.
*/ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t offm = m_panel; \ + dim_t offn = n_panel; \ + dim_t m_edge = m_panel_max - m_panel; \ + dim_t n_edge = n_panel_max - n_panel; \ +\ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + one, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ + } \ + } \ +\ +\ +/* + if ( bli_is_1r_packed( schema ) ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ + \ + if ( bli_is_1e_packed( schema ) ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +} + +INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + doff_t diagoffc_abs; \ + dim_t j; \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + row_stored = bli_is_col_packed( schema ); \ + col_stored = bli_is_row_packed( schema ); \ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect.
*/ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype* restrict c10; \ + ctype* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp, \ + cntx \ + ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp, \ + cntx \ + ); \ +\ + /* Pack the stored triangle of c11 to p11. 
*/ \ + { \ + dim_t j = diagoffc_abs; \ + ctype* restrict c11 = c + (j )*ldc; \ + ctype* restrict p11 = p + (j )*ldp; \ +\ + PASTEMAC(ch,scal21ms_mxn_uplo) \ + ( \ + schema, \ + uploc, \ + conjc, \ + panel_dim, \ + kappa, \ + c11, rs_c, cs_c, \ + p11, rs_p, cs_p, ldp \ + ); \ +\ + /* If we are packing a micro-panel with Hermitian structure, + we must take special care of the diagonal. Now, if kappa + were guaranteed to be unit, all we would need to do is + explicitly zero out the imaginary part of the diagonal of + p11, in case the diagonal of the source matrix contained + garbage (non-zero) imaginary values. HOWEVER, since kappa + can be non-unit, things become a little more complicated. + In general, we must re-apply the kappa scalar to ONLY the + real part of the diagonal of the source matrix and save + the result to the diagonal of p11. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + ctype_r* restrict c11_r = ( ctype_r* )c11; \ + const dim_t rs_c2 = 2*rs_c; \ + const dim_t cs_c2 = 2*cs_c; \ +\ + PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \ + ( \ + schema, \ + panel_dim, \ + panel_dim, \ + kappa, \ + c11_r, rs_c2, cs_c2, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + doff_t diagoffp_abs = bli_abs( diagoffp ); \ + ctype* p11 = p + (diagoffp_abs )*ldp; \ +\ +\ + /* Pack the panel. 
*/ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ +\ +\ + /* Tweak the panel according to its triangular structure. */ \ + { \ + /* If the diagonal of c is implicitly unit, explicitly set the + diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + 0, \ + 0, \ + panel_dim, \ + panel_dim, \ + kappa, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ +\ +\ + /* If requested, invert the diagonal of the packed panel. */ \ + if ( invdiag == TRUE ) \ + { \ + PASTEMAC(ch,invert1ms_mxn_diag) \ + ( \ + schema, \ + 0, \ + 0, \ + panel_dim, \ + panel_dim, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ +\ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel. */ \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + uplo_t uplop = uploc; \ + doff_t diagoffp11_0 = 0; \ + dim_t p11_0_dim = panel_dim - 1; \ +\ + bli_toggle_uplo( uplop ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp11_0 ); \ +\ + /* Note that this macro works a little differently than the setm + operation. Here, we pass in the dimensions of only p11, rather + than the whole micro-panel, and furthermore we pass in the + "shrunken" dimensions of p11, corresponding to the toggling + and shrinking of the diagonal above. The macro will do the + right thing, incrementing the pointer to p11 by the appropriate + leading dimension (cs_p or rs_p), and setting only the lower + or upper triangle to zero.
*/ \ + PASTEMAC(ch,set1ms_mxn_uplo) \ + ( \ + schema, \ + diagoffp11_0, \ + uplop, \ + p11_0_dim, \ + p11_0_dim, \ + zero, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h new file mode 100644 index 000000000..b0b1d0a2f --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_struc_cxk_1er ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_1er ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + 
dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_1er ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c new file mode 100644 index 000000000..9f2acdce8 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c @@ -0,0 +1,1099 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 
1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_2xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + 
pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_4xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + 
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), 
*(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_6xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + 
PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), 
*(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_8xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri 
+ 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), 
*(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_10xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( 
PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( 
*(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( 
*kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_12xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( 
*(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + 
PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 
2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_14xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri 
+ 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + 
PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + 
PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, 
*(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_16xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri 
+13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( 
*(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir 
+= ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, 
*(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), 
*(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri 
+29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_30xk_1e_ref ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h new file mode 100644 index 000000000..beebdafdc --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Redefine level-1m kernel API names to induce prototypes. + +#undef packm_2xk_ker_name +#define packm_2xk_ker_name packm_2xk_1e_ref +// 1e format should probably never have an odd-numbered register blocking. +//#undef packm_3xk_ker_name +//#define packm_3xk_ker_name packm_3xk_1e_ref +#undef packm_4xk_ker_name +#define packm_4xk_ker_name packm_4xk_1e_ref +#undef packm_6xk_ker_name +#define packm_6xk_ker_name packm_6xk_1e_ref +#undef packm_8xk_ker_name +#define packm_8xk_ker_name packm_8xk_1e_ref +#undef packm_10xk_ker_name +#define packm_10xk_ker_name packm_10xk_1e_ref +#undef packm_12xk_ker_name +#define packm_12xk_ker_name packm_12xk_1e_ref +#undef packm_14xk_ker_name +#define packm_14xk_ker_name packm_14xk_1e_ref +#undef packm_16xk_ker_name +#define packm_16xk_ker_name packm_16xk_1e_ref +#undef packm_30xk_ker_name +#define packm_30xk_ker_name packm_30xk_1e_ref + +// Include the level-1m kernel API template. + +#include "bli_l1m_ker.h" + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c new file mode 100644 index 000000000..6e30ca5bc --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c @@ -0,0 +1,1254 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_2xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += 
lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_3xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ 
+ for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_4xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += 
ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); 
\ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_6xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_8xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + 
const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 
2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 
8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_10xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t 
inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r 
+ 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_12xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 
5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), 
*(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + 
PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_14xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), 
*(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i 
+ 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, 
*kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, 
*kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_16xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), 
*(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + 
PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +27*inca2), *(alpha1_i 
+27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), 
*(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +28*inca2), *(alpha1_i 
+28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i 
+13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), 
*(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_30xk_1r_ref ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h new file mode 100644 index 000000000..a6e3f0aef --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Redefine level-1m kernel API names to induce prototypes. + +#undef packm_2xk_ker_name +#define packm_2xk_ker_name packm_2xk_1r_ref +#undef packm_3xk_ker_name +#define packm_3xk_ker_name packm_3xk_1r_ref +#undef packm_4xk_ker_name +#define packm_4xk_ker_name packm_4xk_1r_ref +#undef packm_6xk_ker_name +#define packm_6xk_ker_name packm_6xk_1r_ref +#undef packm_8xk_ker_name +#define packm_8xk_ker_name packm_8xk_1r_ref +#undef packm_10xk_ker_name +#define packm_10xk_ker_name packm_10xk_1r_ref +#undef packm_12xk_ker_name +#define packm_12xk_ker_name packm_12xk_1r_ref +#undef packm_14xk_ker_name +#define packm_14xk_ker_name packm_14xk_1r_ref +#undef packm_16xk_ker_name +#define packm_16xk_ker_name packm_16xk_1r_ref +#undef packm_30xk_ker_name +#define packm_30xk_ker_name packm_30xk_1r_ref + +// Include the level-1m kernel API template. + +#include "bli_l1m_ker.h" + diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index 841217365..fdfe27a85 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -41,7 +41,7 @@ #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -50,20 +50,20 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ operation. 
*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \ - bli_axpyf_cntx_init( cntx ); \ - bli_dotxf_cntx_init( cntx ); \ + bli_axpyf_cntx_init( dt, cntx ); \ + bli_dotxf_cntx_init( dt, cntx ); \ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ - bli_dotxv_cntx_init( cntx ); \ - bli_scalv_cntx_init( cntx ); \ - bli_setv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ + bli_dotxv_cntx_init( dt, cntx ); \ + bli_scalv_cntx_init( dt, cntx ); \ + bli_setv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ @@ -88,7 +88,7 @@ GENFRONT( trsv ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -96,10 +96,10 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ /* Initialize the context with kernels employed by the current operation. */ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. 
*/ \ @@ -122,7 +122,7 @@ GENFRONT( syr ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -133,22 +133,22 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXAXPYF_KER, cntx );*/ \ - bli_dotaxpyv_cntx_init( cntx ); \ - bli_axpyf_cntx_init( cntx ); \ - bli_dotxf_cntx_init( cntx ); \ - bli_dotxaxpyf_cntx_init( cntx ); \ + bli_dotaxpyv_cntx_init( dt, cntx ); \ + bli_axpyf_cntx_init( dt, cntx ); \ + bli_dotxf_cntx_init( dt, cntx ); \ + bli_dotxaxpyf_cntx_init( dt, cntx ); \ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ - bli_dotxv_cntx_init( cntx ); \ - bli_scalv_cntx_init( cntx ); \ - bli_setv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ + bli_dotxv_cntx_init( dt, cntx ); \ + bli_scalv_cntx_init( dt, cntx ); \ + bli_setv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ @@ -173,7 +173,7 @@ GENFRONT( symv ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -182,11 +182,11 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ operation. 
*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPY2V_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ - bli_axpy2v_cntx_init( cntx ); \ - bli_axpyv_cntx_init( cntx ); \ + bli_axpy2v_cntx_init( dt, cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ diff --git a/frame/2/bli_l2_cntx.h b/frame/2/bli_l2_cntx.h index 8b6566f55..a3bafa0c8 100644 --- a/frame/2/bli_l2_cntx.h +++ b/frame/2/bli_l2_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( gemv ) diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c index 24558fd9d..f2681d7d8 100644 --- a/frame/2/bli_l2_tapi.c +++ b/frame/2/bli_l2_tapi.c @@ -55,8 +55,9 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ - dim_t m_y, n_x; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ + dim_t m_y, n_x; \ \ /* Determine the dimensions of y and x. */ \ bli_set_dims_with_trans( transa, m, n, m_y, n_x ); \ @@ -65,7 +66,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( m_y ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. */ \ @@ -135,13 +136,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x or y has zero elements, or if alpha is zero, return early. 
*/ \ if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -188,10 +190,11 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. */ \ @@ -261,8 +264,9 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ - ctype alpha_local; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ + ctype alpha_local; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \ @@ -273,7 +277,7 @@ void PASTEMAC(ch,opname) \ PASTEMAC2(chr,ch,copys)( *alpha, alpha_local ); \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -324,13 +328,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. 
*/ \ PASTECH2(ch,ftname,_ft) f; \ @@ -383,13 +388,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -444,10 +450,11 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, return early. */ \ if ( bli_zero_dim1( m ) ) return; \ diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 634e4c1ab..8b4b01572 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -38,7 +38,7 @@ // Define context initialization functions. // -void bli_gemm_cntx_init( cntx_t* cntx ) +void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. bli_cntx_obj_clear( cntx ); @@ -49,7 +49,7 @@ void bli_gemm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), given the execution method. 
@@ -74,7 +74,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsm_cntx_init( cntx_t* cntx ) +void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. bli_cntx_obj_clear( cntx ); @@ -92,7 +92,7 @@ void bli_trsm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), given the execution method. diff --git a/frame/3/bli_l3_cntx.h b/frame/3/bli_l3_cntx.h index 21b756656..223fa5e25 100644 --- a/frame/3/bli_l3_cntx.h +++ b/frame/3/bli_l3_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( gemm ) diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 8af29594d..c27a0b67c 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -109,6 +109,26 @@ void bli_gemm_ker_var2 buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. +#if 1 + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } +#endif + // Index into the type combination array to extract the correct // function pointer. 
f = ftypes[dt_exec]; diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 833dadb42..0f8e38688 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -85,6 +85,7 @@ void bli_blksz_obj_free // ----------------------------------------------------------------------------- +#if 0 void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, @@ -116,6 +117,66 @@ void bli_blksz_reduce_dt_to bli_blksz_set_def( blksz_def, dt_bs, blksz ); bli_blksz_set_max( blksz_max, dt_bs, blksz ); } +#endif + +// ----------------------------------------------------------------------------- + +void bli_blksz_reduce_def_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) +{ + dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); + + dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); + + // If the blocksize multiple is zero, we do nothing. + if ( bmult_val == 0 ) return; + + // Round the default blocksize value down to its nearest + // multiple of bmult_val. (Notice that we + // ignore the "max" entry in the bmult object since that would + // correspond to the packing dimension, which plays no role + // as a blocksize multiple.) + blksz_def = ( blksz_def / bmult_val ) * bmult_val; + + // Make sure the new blocksize value is at least the blocksize + // multiple. + if ( blksz_def == 0 ) blksz_def = bmult_val; + + // Store the new blocksize back to the object. + bli_blksz_set_def( blksz_def, dt_bs, blksz ); +} + +// ----------------------------------------------------------------------------- + +void bli_blksz_reduce_max_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) +{ + dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); + + dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); + + // If the blocksize multiple is zero, we do nothing. + if ( bmult_val == 0 ) return; + + // Round the blocksize value down to its nearest multiple + // of bmult_val.
(Notice that we ignore the "max" entry in the + // bmult object since that would correspond to the packing + // dimension, which plays no role as a blocksize multiple.) + blksz_max = ( blksz_max / bmult_val ) * bmult_val; + + // Make sure the new blocksize value is at least the blocksize + // multiple. + if ( blksz_max == 0 ) blksz_max = bmult_val; + + // Store the new blocksize back to the object. + bli_blksz_set_max( blksz_max, dt_bs, blksz ); +} // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index daffb3772..cfe2023e1 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -89,11 +89,23 @@ (b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \ } +#define bli_blksz_scale_def( num, den, dt, b ) \ +{ \ + (b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \ +} + +#define bli_blksz_scale_max( num, den, dt, b ) \ +{ \ + (b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \ +} + +#if 0 #define bli_blksz_scale_dt_by( num, den, dt, b ) \ { \ (b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \ (b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \ } +#endif // ----------------------------------------------------------------------------- @@ -121,12 +133,25 @@ void bli_blksz_obj_free // ----------------------------------------------------------------------------- +#if 0 void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); +#endif +void bli_blksz_reduce_def_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); + +void bli_blksz_reduce_max_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index bd9972332..e4299eb49 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -386,27 +386,27 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... 
) { /* Example prototypes: - void - bli_cntx_set_blkszs( + void bli_cntx_set_blkszs + ( + ind_t method = BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, + bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, + bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, + ... + cntx_t* cntx + ); - ind_t method = BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, - ... - cntx_t* cntx ); - - void - bli_cntx_set_blkszs( - - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t scalr0, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t scalr1, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t scalr2, - ... - cntx_t* cntx ); + void bli_cntx_set_blkszs + ( + ind_t method != BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, double def_scalr0, double max_scalr0, + bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, double def_scalr1, double max_scalr1, + bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, double def_scalr2, double max_scalr2, + ... + cntx_t* cntx + ); */ va_list args; dim_t i; @@ -414,7 +414,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bszid_t* bszids; blksz_t** blkszs; bszid_t* bmults; - dim_t* scalrs; + double* dsclrs; + double* msclrs; cntx_t* cntx; @@ -426,7 +427,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) ); bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); - scalrs = bli_malloc_intl( n_bs * sizeof( dim_t ) ); + dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -444,9 +446,9 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ...
) // - the address of the blksz_t object, and // - the bszid_t of the multiple we need to associate with // the blksz_t object. - const bszid_t bs_id = va_arg( args, bszid_t ); - blksz_t* blksz = va_arg( args, blksz_t* ); - const bszid_t bm_id = va_arg( args, bszid_t ); + bszid_t bs_id = va_arg( args, bszid_t ); + blksz_t* blksz = va_arg( args, blksz_t* ); + bszid_t bm_id = va_arg( args, bszid_t ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; @@ -464,18 +466,21 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the address of the blksz_t object, and // - the bszid_t of the multiple we need to associate with // the blksz_t object. - // - the scalar we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes. - const bszid_t bs_id = va_arg( args, bszid_t ); - blksz_t* blksz = va_arg( args, blksz_t* ); - const bszid_t bm_id = va_arg( args, bszid_t ); - const dim_t scalr = va_arg( args, dim_t ); + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = va_arg( args, bszid_t ); + blksz_t* blksz = va_arg( args, blksz_t* ); + bszid_t bm_id = va_arg( args, bszid_t ); + double dsclr = va_arg( args, double ); + double msclr = va_arg( args, double ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; blkszs[ i ] = blksz; bmults[ i ] = bm_id; - scalrs[ i ] = scalr; + dsclrs[ i ] = dsclr; + msclrs[ i ] = msclr; } } @@ -510,12 +515,12 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blksz_t* pointer, blocksize // multiple id, and blocksize scalar. 
- const bszid_t bs_id = bszids[ i ]; - const bszid_t bm_id = bmults[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; - blksz_t* blksz = blkszs[ i ]; + blksz_t* blksz = blkszs[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Copy the blksz_t object contents into the appropriate // location within the context's blksz_t array. Do the same @@ -534,14 +539,15 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blksz_t pointer, blocksize // multiple id, and blocksize scalar. - const bszid_t bs_id = bszids[ i ]; - const bszid_t bm_id = bmults[ i ]; - const dim_t scalr = scalrs[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; + double dsclr = dsclrs[ i ]; + double msclr = msclrs[ i ]; - blksz_t* blksz = blkszs[ i ]; - blksz_t* bmult = blkszs[ i ]; + blksz_t* blksz = blkszs[ i ]; + blksz_t* bmult = blkszs[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Copy the real domain values of the source blksz_t object into // the context, duplicating into the complex domain fields. @@ -550,20 +556,50 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - // The next steps apply only to cache blocksizes, and not register - // blocksizes (ie: they only apply to blocksizes for which the - // blocksize multiple id is different than the blocksize id) and - // only when the scalar provided is non-unit. - if ( bs_id != bm_id && scalr != 1 ) + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Scale the complex domain values in the blocksize object. 
- bli_blksz_scale_dt_by( 1, scalr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_dt_by( 1, scalr, BLIS_DCOMPLEX, cntx_blksz ); + // Scale the complex domain default blocksize values in the + // blocksize object. + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - // Finally, round the newly-scaled blocksizes down to their - // respective multiples. - bli_blksz_reduce_dt_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_reduce_dt_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } + } + + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the complex domain maximum blocksize values in the + // blocksize object. + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); + + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). 
+ bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } } // Copy the blocksize multiple id into the context. @@ -575,7 +611,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_free_intl( blkszs ); bli_free_intl( bszids ); bli_free_intl( bmults ); - bli_free_intl( scalrs ); + bli_free_intl( dsclrs ); + bli_free_intl( msclrs ); } #endif diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 38bff6720..9c97c3312 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -488,13 +488,13 @@ void bli_cntx_print( cntx_t* cntx ); // pointer is NULL. When initializing, the context address that should // be used (local or external) is assigned to cntx_p. -#define bli_cntx_init_local_if( opname, cntx, cntx_p ) \ +#define bli_cntx_init_local_if( opname, dt, cntx, cntx_p ) \ \ cntx_t _cntx_l; \ \ if ( bli_is_null( cntx ) ) \ { \ - PASTEMAC(opname,_cntx_init)( &_cntx_l ); \ + PASTEMAC(opname,_cntx_init)( dt, &_cntx_l ); \ cntx_p = &_cntx_l; \ } \ else \ @@ -510,13 +510,13 @@ void bli_cntx_print( cntx_t* cntx ); } -#define bli_cntx_init_local_if2( opname, suf, cntx, cntx_p ) \ +#define bli_cntx_init_local_if2( opname, suf, dt, cntx, cntx_p ) \ \ cntx_t _cntx_l; \ \ if ( bli_is_null( cntx ) ) \ { \ - PASTEMAC2(opname,suf,_cntx_init)( &_cntx_l ); \ + PASTEMAC2(opname,suf,_cntx_init)( dt, &_cntx_l ); \ cntx_p = &_cntx_l; \ } \ else \ diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 7f3f897d5..32f99a832 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -94,48 +94,47 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { /* Example prototypes: - void - bli_gks_cntx_set_blkszs( + void bli_gks_cntx_set_blkszs + ( + ind_t method = BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, bszid_t bm0_id, + bszid_t bs1_id, bszid_t bm1_id, + bszid_t bs2_id, bszid_t bm2_id, + ... 
+ cntx_t* cntx + ); - ind_t method = BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, bszid_t bm0_id, - bszid_t bs1_id, bszid_t bm1_id, - bszid_t bs2_id, bszid_t bm2_id, - ... - cntx_t* cntx ); - - void - bli_gks_cntx_set_blkszs( - - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, bszid_t bm0_id, dim_t scalr0, - bszid_t bs1_id, bszid_t bm1_id, dim_t scalr1, - bszid_t bs2_id, bszid_t bm2_id, dim_t scalr2, - ... - cntx_t* cntx ); + void bli_gks_cntx_set_blkszs + ( + ind_t method != BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, bszid_t bm0_id, double def_scalr0, double max_scalr0, + bszid_t bs1_id, bszid_t bm1_id, double def_scalr1, double max_scalr1, + bszid_t bs2_id, bszid_t bm2_id, double def_scalr2, double max_scalr2, + ... + cntx_t* cntx + ); */ va_list args; dim_t i; bszid_t* bszids; bszid_t* bmults; - double* scalrs; + double* dsclrs; + double* msclrs; cntx_t* cntx; blksz_t* cntx_blkszs; bszid_t* cntx_bmults; - bszid_t bs_id; - bszid_t bm_id; - double scalr; // Allocate some temporary local arrays. bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); - scalrs = bli_malloc_intl( n_bs * sizeof( double ) ); + dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -152,8 +151,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the bszid_t of the blocksize we're about to process, // - the bszid_t of the multiple we need to associate with // the blksz_t object. - bs_id = va_arg( args, bszid_t ); - bm_id = va_arg( args, bszid_t ); + bszid_t bs_id = va_arg( args, bszid_t ); + bszid_t bm_id = va_arg( args, bszid_t ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; @@ -169,16 +168,19 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the bszid_t of the blocksize we're about to process, // - the bszid_t of the multiple we need to associate with // the blksz_t object.
- // - the scalar we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes. - bs_id = va_arg( args, bszid_t ); - bm_id = va_arg( args, bszid_t ); - scalr = va_arg( args, double ); + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = va_arg( args, bszid_t ); + bszid_t bm_id = va_arg( args, bszid_t ); + double dsclr = va_arg( args, double ); + double msclr = va_arg( args, double ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; bmults[ i ] = bm_id; - scalrs[ i ] = scalr; + dsclrs[ i ] = dsclr; + msclrs[ i ] = msclr; } } @@ -210,10 +212,10 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) for ( i = 0; i < n_bs; ++i ) { // Read the current blocksize id, blocksize multiple id. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Query the blocksizes (blksz_t) associated with bs_id and save // them directly into the appropriate location in the context's @@ -231,41 +233,75 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blocksize multiple id, // and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - double scalr = scalrs[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; + double dsclr = dsclrs[ i ]; + double msclr = msclrs[ i ]; - blksz_t blksz; - blksz_t bmult; + blksz_t blksz_l; + blksz_t bmult_l; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* blksz = &blksz_l; + blksz_t* bmult = &bmult_l; + + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Query the blocksizes (blksz_t) associated with bs_id and bm_id // and use them to populate a pair of local blksz_t objects. 
- bli_gks_get_blksz( bs_id, &blksz ); - bli_gks_get_blksz( bm_id, &bmult ); + bli_gks_get_blksz( bs_id, blksz ); + bli_gks_get_blksz( bm_id, bmult ); // Copy the real domain values of the source blksz_t object into // the context, duplicating into the complex domain fields. - bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_FLOAT, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DOUBLE, cntx_blksz ); - bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz ); + bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - // The next steps apply only to cache blocksizes, and not register - // blocksizes (ie: they only apply to blocksizes for which the - // blocksize multiple id is different than the blocksize id) and - // only when the scalar provided is non-unit. - if ( bs_id != bm_id && scalr != 1.0 ) + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Scale the complex domain values in the blocksize object. - bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_DCOMPLEX, cntx_blksz ); + // Scale the complex domain default blocksize values in the + // blocksize object. + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - // Finally, round the newly-scaled blocksizes down to their - // respective multiples. 
- bli_blksz_reduce_dt_to( BLIS_FLOAT, &bmult, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_reduce_dt_to( BLIS_DOUBLE, &bmult, BLIS_DCOMPLEX, cntx_blksz ); + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } + } + + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the complex domain maximum blocksize values in the + // blocksize object. + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); + + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } } // Copy the blocksize multiple id into the context. @@ -276,7 +312,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // Free the temporary local arrays. 
bli_free_intl( bszids ); bli_free_intl( bmults ); - bli_free_intl( scalrs ); + bli_free_intl( dsclrs ); + bli_free_intl( msclrs ); } @@ -337,6 +374,18 @@ static func_t bli_gks_l3_ind_ukrs[BLIS_NUM_IND_METHODS] /* trsm_l */ { { NULL, BLIS_CTRSM4M1_L_UKERNEL, NULL, BLIS_ZTRSM4M1_L_UKERNEL, } }, /* trsm_u */ { { NULL, BLIS_CTRSM4M1_U_UKERNEL, NULL, BLIS_ZTRSM4M1_U_UKERNEL, } }, }, +/* 1m */ { +/* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM1M_UKERNEL, + BLIS_DGEMM_UKERNEL, BLIS_ZGEMM1M_UKERNEL, } }, +/* gemmtrsm_l */ { { NULL, BLIS_CGEMMTRSM1M_L_UKERNEL, + NULL, BLIS_ZGEMMTRSM1M_L_UKERNEL, } }, +/* gemmtrsm_u */ { { NULL, BLIS_CGEMMTRSM1M_U_UKERNEL, + NULL, BLIS_ZGEMMTRSM1M_U_UKERNEL, } }, +/* trsm_l */ { { NULL, BLIS_CTRSM1M_L_UKERNEL, + NULL, BLIS_ZTRSM1M_L_UKERNEL, } }, +/* trsm_u */ { { NULL, BLIS_CTRSM1M_U_UKERNEL, + NULL, BLIS_ZTRSM1M_U_UKERNEL, } }, + }, /* nat */ { /* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM_UKERNEL, BLIS_DGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL, } }, @@ -565,6 +614,8 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr, // -- packm structure-aware kernel structure ----------------------------------- // +// IF ENABLED: NEEDS UPDATING FOR 1M. + static func_t bli_gks_packm_struc_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { /* float (0) scomplex (1) double (2) dcomplex (3) */ diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index e66aafa63..06cbae587 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -61,8 +61,10 @@ void bli_memsys_init( void ) if ( bli_memsys_is_init == TRUE ) return; // Create and initialize a context for gemm so we have something - // to pass into bli_membrk_init_pools(). - bli_gemm_cntx_init( &cntx ); + // to pass into bli_membrk_init_pools(). We use BLIS_DOUBLE for + // the datatype, but the dt argument is actually only used when + // initializing contexts for induced methods. 
+ bli_gemm_cntx_init( BLIS_DOUBLE, &cntx ); #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (mem)" ) diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 50ddd5d1f..f0a208886 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -654,6 +654,19 @@ bli_is_io_packed( schema ) || \ bli_is_rpi_packed( schema ) ) +#define bli_is_1r_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ) + +#define bli_is_1e_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ) + +#define bli_is_1m_packed( schema ) \ +\ + ( bli_is_1r_packed( schema ) || \ + bli_is_1e_packed( schema ) ) + #define bli_is_nat_packed( schema ) \ \ ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ) diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 1069a40b4..de8dbf370 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -225,6 +225,43 @@ #include "bli_scal2jrpis.h" +// -- 1m-specific scalar macros -- + +#include "bli_invert1ms_mxn_diag.h" + +#include "bli_scal1ms_mxn.h" + +#include "bli_scal21ms_mxn_diag.h" +#include "bli_scal21ms_mxn_uplo.h" + +#include "bli_set1ms_mxn.h" +#include "bli_set1ms_mxn_diag.h" +#include "bli_set1ms_mxn_uplo.h" +#include "bli_seti01ms_mxn_diag.h" + +// 1e +#include "bli_copy1es.h" +#include "bli_copyj1es.h" + +#include "bli_invert1es.h" + +#include "bli_scal1es.h" + +#include "bli_scal21es.h" +#include "bli_scal2j1es.h" + +// 1r +#include "bli_copy1rs.h" +#include "bli_copyj1rs.h" + +#include "bli_invert1rs.h" + +#include "bli_scal1rs.h" + +#include "bli_scal21rs.h" +#include "bli_scal2j1rs.h" + + // -- Miscellaneous macros -- diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index d3548031c..c4cfd3514 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -224,6 +224,10 @@ typedef dcomplex f77_dcomplex; - 1 0110 
11: packed imag-only column panels - 1 0111 10: packed real+imag row panels - 1 0111 11: packed real+imag column panels + - 1 1000 10: packed by 1m expanded row panels + - 1 1000 11: packed by 1m expanded column panels + - 1 1001 10: packed by 1m reordered row panels + - 1 1001 11: packed by 1m reordered column panels 23 Packed panel order if upper-stored - 0 == forward order if upper - 1 == reverse order if upper @@ -329,6 +333,8 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) @@ -348,6 +354,10 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 @@ -469,13 +479,17 @@ typedef enum BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO, 
BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI, BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI, + BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, + BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R, } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the // schema pair for "4ms" (4m separated), because its bit value has // been reserved, even though we don't use it. -#define BLIS_NUM_PACK_SCHEMA_TYPES 8 +#define BLIS_NUM_PACK_SCHEMA_TYPES 10 // -- Pack order type -- @@ -575,6 +589,7 @@ typedef enum BLIS_4MH, BLIS_4M1B, BLIS_4M1A, + BLIS_1M, BLIS_NAT, } ind_t; diff --git a/frame/include/level0/1e/bli_copy1es.h b/frame/include/level0/1e/bli_copy1es.h new file mode 100644 index 000000000..22eec1565 --- /dev/null +++ b/frame/include/level0/1e/bli_copy1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPY1ES_H +#define BLIS_COPY1ES_H + +// copy1es + +#define bli_ccopy1es( a, bri, bir ) \ +{ \ + bli_ccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ +} + +#define bli_zcopy1es( a, bri, bir ) \ +{ \ + bli_zcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_copyj1es.h b/frame/include/level0/1e/bli_copyj1es.h new file mode 100644 index 000000000..14c401354 --- /dev/null +++ b/frame/include/level0/1e/bli_copyj1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYJ1ES_H +#define BLIS_COPYJ1ES_H + +// copyj1es + +#define bli_ccopyj1es( a, bri, bir ) \ +{ \ + bli_ccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ +} + +#define bli_zcopyj1es( a, bri, bir ) \ +{ \ + bli_zcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_invert1es.h b/frame/include/level0/1e/bli_invert1es.h new file mode 100644 index 000000000..2fe5c3f24 --- /dev/null +++ b/frame/include/level0/1e/bli_invert1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1ES_H +#define BLIS_INVERT1ES_H + +// invert1es + +#define bli_cinvert1es( bri, bir ) \ +{ \ + bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ +} + +#define bli_zinvert1es( bri, bir ) \ +{ \ + bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal1es.h b/frame/include/level0/1e/bli_scal1es.h new file mode 100644 index 000000000..46ee20a0d --- /dev/null +++ b/frame/include/level0/1e/bli_scal1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1ES_H +#define BLIS_SCAL1ES_H + +// scal1es + +#define bli_cscal1es( a, yri, yir ) \ +{ \ + bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ + bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal1es( a, yri, yir ) \ +{ \ + bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal21es.h b/frame/include/level0/1e/bli_scal21es.h new file mode 100644 index 000000000..7e0a752bc --- /dev/null +++ b/frame/include/level0/1e/bli_scal21es.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL21ES_H +#define BLIS_SCAL21ES_H + +// scal21es + +#define bli_cscal21es( a, x, yri, yir ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal21es( a, x, yri, yir ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#define bli_scscal21es( a, x, yri, yir ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_dzscal21es( a, x, yri, yir ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal2j1es.h b/frame/include/level0/1e/bli_scal2j1es.h new file mode 100644 index 000000000..b10004f61 --- /dev/null +++ b/frame/include/level0/1e/bli_scal2j1es.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL2J1ES_H +#define BLIS_SCAL2J1ES_H + +// scal2j1es + +#define bli_cscal2j1es( a, x, yri, yir ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal2j1es( a, x, yri, yir ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#define bli_scscal2j1es( a, x, yri, yir ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_dzscal2j1es( a, x, yri, yir ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1m/bli_invert1ms_mxn_diag.h b/frame/include/level0/1m/bli_invert1ms_mxn_diag.h new file mode 100644 index 000000000..7abf891ef --- /dev/null +++ b/frame/include/level0/1m/bli_invert1ms_mxn_diag.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1MS_MXN_DIAG_H +#define BLIS_INVERT1MS_MXN_DIAG_H + +// invert1ms_mxn_diag + +#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + scomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal1ms_mxn.h b/frame/include/level0/1m/bli_scal1ms_mxn.h new file mode 100644 index 000000000..a0a9c595f --- /dev/null +++ b/frame/include/level0/1m/bli_scal1ms_mxn.h @@ -0,0 +1,124 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1MS_MXN_H +#define BLIS_SCAL1MS_MXN_H + +// scal1ms_mxn + +#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_ri = y; \ + scomplex* restrict y_ir = y + ld_y/2; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_cscal1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_r = y_cast; \ + float* restrict y_i = y_cast + ld_y; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_cscal1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_ri = y; \ + dcomplex* restrict y_ir = y + ld_y/2; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_zscal1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, + which steps in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_r = y_cast; \ + double* restrict y_i = y_cast + ld_y; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_zscal1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_diag.h b/frame/include/level0/1m/bli_scal21ms_mxn_diag.h new file mode 100644 index 000000000..a8975f731 --- /dev/null +++ b/frame/include/level0/1m/bli_scal21ms_mxn_diag.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL21MS_MXN_DIAG_H +#define BLIS_SCAL21MS_MXN_DIAG_H + +// scal21ms_mxn_diag + +#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y; \ + scomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal21es( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast; \ + float* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal21rs( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y; \ + dcomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal21es( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast; \ + double* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal21rs( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h b/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h new file mode 100644 index 000000000..ccd5d4ef8 --- /dev/null +++ b/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h @@ -0,0 +1,296 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL21MS_MXN_UPLO_H +#define BLIS_SCAL21MS_MXN_UPLO_H + +// scal21ms_mxn_uplo + +#define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_ri = y; \ + scomplex* restrict y_ir = y + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_r = y_cast; \ + float* restrict y_i = y_cast + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + } \ +} + +#define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_ri = y; \ + dcomplex* restrict y_ir = y + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_r = y_cast; \ + double* restrict y_i = y_cast + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn.h b/frame/include/level0/1m/bli_set1ms_mxn.h new file mode 100644 index 000000000..9f701c919 --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn.h @@ -0,0 +1,164 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_H +#define BLIS_SET1MS_MXN_H + +// set1ms_mxn + +#define bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + inc_t offm_local = offm; \ + inc_t offn_local = offn; \ + dim_t m_local = m; \ + dim_t n_local = n; \ + inc_t rs_y1 = rs_y; \ + inc_t cs_y1 = cs_y; \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ + dim_t i, j; \ +\ + /* Optimization: The loops walk through y with unit stride if y is + column-stored. If y is row-stored, swap the dimensions and strides + to preserve unit stride movement. */ \ + if ( cs_y == 1 ) \ + { \ + bli_swap_incs( offm_local, offn_local ); \ + bli_swap_dims( m_local, n_local ); \ + bli_swap_incs( rs_y1, cs_y1 ); \ + bli_swap_incs( rs_y2, cs_y2 ); \ + } \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1; \ + scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1 + ld_y/2; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_off_ri + i*rs_y1 + j*cs_y1), \ + *(y_off_ir + i*rs_y1 + j*cs_y1) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2 + ld_y; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + j*cs_y2), \ + *(y_off_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#define bli_zset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + inc_t offm_local = offm; \ + inc_t offn_local = offn; \ + dim_t m_local = m; \ + dim_t n_local = n; \ + inc_t rs_y1 = rs_y; \ + inc_t cs_y1 = cs_y; \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ + dim_t i, j; \ +\ + /* Optimization: The loops walk through y with unit stride if y is + column-stored. If y is row-stored, swap the dimensions and strides + to preserve unit stride movement. */ \ + if ( cs_y == 1 ) \ + { \ + bli_swap_incs( offm_local, offn_local ); \ + bli_swap_dims( m_local, n_local ); \ + bli_swap_incs( rs_y1, cs_y1 ); \ + bli_swap_incs( rs_y2, cs_y2 ); \ + } \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1; \ + dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1 + ld_y/2; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_off_ri + i*rs_y1 + j*cs_y1), \ + *(y_off_ir + i*rs_y1 + j*cs_y1) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2 + ld_y; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + j*cs_y2), \ + *(y_off_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn_diag.h b/frame/include/level0/1m/bli_set1ms_mxn_diag.h new file mode 100644 index 000000000..63262dd18 --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn_diag.h @@ -0,0 +1,130 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_DIAG_H +#define BLIS_SET1MS_MXN_DIAG_H + +// set1ms_mxn_diag + +#define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + scomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn_uplo.h b/frame/include/level0/1m/bli_set1ms_mxn_uplo.h new file mode 100644 index 000000000..e89f9a34d --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn_uplo.h @@ -0,0 +1,198 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_UPLO_H +#define BLIS_SET1MS_MXN_UPLO_H + +// set1ms_mxn_uplo + +#define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + doff_t diagoff_abs = bli_abs( diagoff ); \ + inc_t offdiag_inc; \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + /* Set the off-diagonal increment. 
*/ \ + if ( diagoff > 0 ) offdiag_inc = cs_y; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \ +\ + scomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ + scomplex* restrict y_ri = y0; \ + scomplex* restrict y_ir = y0 + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + /* Set the off-diagonal increment. */ \ + if ( diagoff > 0 ) offdiag_inc = cs_y2; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \ +\ + float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ + float* restrict y_r = y0; \ + float* restrict y_i = y0 + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ +} + +#define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + doff_t diagoff_abs = bli_abs( diagoff ); \ + inc_t offdiag_inc; \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + /* Set the off-diagonal increment. */ \ + if ( diagoff > 0 ) offdiag_inc = cs_y; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \ +\ + dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ + dcomplex* restrict y_ri = y0; \ + dcomplex* restrict y_ir = y0 + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + /* Set the off-diagonal increment. 
*/ \ + if ( diagoff > 0 ) offdiag_inc = cs_y2; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \ +\ + double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ + double* restrict y_r = y0; \ + double* restrict y_i = y0 + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_seti01ms_mxn_diag.h b/frame/include/level0/1m/bli_seti01ms_mxn_diag.h new file mode 100644 index 000000000..39be51ca5 --- /dev/null +++ b/frame/include/level0/1m/bli_seti01ms_mxn_diag.h @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SETI01MS_MXN_DIAG_H +#define BLIS_SETI01MS_MXN_DIAG_H + +// seti01ms_mxn_diag + +#define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y; \ + scomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ + bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y; \ + dcomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ + bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1r/bli_copy1rs.h b/frame/include/level0/1r/bli_copy1rs.h new file mode 100644 index 000000000..d60cf9d86 --- /dev/null +++ b/frame/include/level0/1r/bli_copy1rs.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPY1RS_H +#define BLIS_COPY1RS_H + +// copy1rs + +#define bli_ccopy1rs( a, br, bi ) \ +{ \ + bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ +} + +#define bli_zcopy1rs( a, br, bi ) \ +{ \ + bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_copyj1rs.h b/frame/include/level0/1r/bli_copyj1rs.h new file mode 100644 index 000000000..8cc82f558 --- /dev/null +++ b/frame/include/level0/1r/bli_copyj1rs.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYJ1RS_H +#define BLIS_COPYJ1RS_H + +// copyj1rs + +#define bli_ccopyj1rs( a, br, bi ) \ +{ \ + bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ +} + +#define bli_zcopyj1rs( a, br, bi ) \ +{ \ + bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_invert1rs.h b/frame/include/level0/1r/bli_invert1rs.h new file mode 100644 index 000000000..3b3a6950c --- /dev/null +++ b/frame/include/level0/1r/bli_invert1rs.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1RS_H +#define BLIS_INVERT1RS_H + +// invert1rs + +#define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) +#define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) + +#endif diff --git a/frame/include/level0/1r/bli_scal1rs.h b/frame/include/level0/1r/bli_scal1rs.h new file mode 100644 index 000000000..ec65ab664 --- /dev/null +++ b/frame/include/level0/1r/bli_scal1rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1RS_H +#define BLIS_SCAL1RS_H + +// scal1rs + +#define bli_cscal1rs( a, yr, yi ) \ +{ \ + bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ +} + +#define bli_zscal1rs( a, yr, yi ) \ +{ \ + bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ +} + +#define bli_scscal1rs( a, yr, yi ) \ +{ \ + bli_scscalris( bli_sreal(a), bli_simag(a), yr, yi ); \ +} + +#define bli_dzscal1rs( a, yr, yi ) \ +{ \ + bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_scal21rs.h b/frame/include/level0/1r/bli_scal21rs.h new file mode 100644 index 000000000..44d4f083f --- /dev/null +++ b/frame/include/level0/1r/bli_scal21rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL21RS_H +#define BLIS_SCAL21RS_H + +// scal21rs + +#define bli_cscal21rs( a, x, yr, yi ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_zscal21rs( a, x, yr, yi ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#define bli_scscal21rs( a, x, yr, yi ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_dzscal21rs( a, x, yr, yi ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_scal2j1rs.h b/frame/include/level0/1r/bli_scal2j1rs.h new file mode 100644 index 000000000..6a356932f --- /dev/null +++ b/frame/include/level0/1r/bli_scal2j1rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2J1RS_H +#define BLIS_SCAL2J1RS_H + +// scal2j1rs + +#define bli_cscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_zscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#define bli_scscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_scscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_dzscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_dzscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#endif + diff --git a/frame/ind/bli_ind.c b/frame/ind/bli_ind.c index e715b2aad..f0aec685b 100644 --- a/frame/ind/bli_ind.c +++ b/frame/ind/bli_ind.c @@ -45,6 +45,7 @@ static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = /* 4mh */ "4mh", /* 4m1b */ "4m1b", /* 4m1a */ "4m1a", +/* 1m */ "1m", /* nat */ "native", }; @@ -56,10 +57,12 @@ void bli_ind_init( void ) if ( bli_ind_is_initialized() ) return; #ifdef BLIS_ENABLE_INDUCED_SCOMPLEX - bli_ind_enable_dt( BLIS_4M1A, BLIS_SCOMPLEX ); + //bli_ind_enable_dt( BLIS_4M1A, BLIS_SCOMPLEX ); + bli_ind_enable_dt( BLIS_1M, BLIS_SCOMPLEX ); #endif #ifdef BLIS_ENABLE_INDUCED_DCOMPLEX - bli_ind_enable_dt( BLIS_4M1A, BLIS_DCOMPLEX ); + //bli_ind_enable_dt( BLIS_4M1A, BLIS_DCOMPLEX ); + bli_ind_enable_dt( BLIS_1M, BLIS_DCOMPLEX ); #endif // Mark API as initialized. 
diff --git a/frame/ind/bli_ind.h b/frame/ind/bli_ind.h index b34941d91..e0ceb383b 100644 --- a/frame/ind/bli_ind.h +++ b/frame/ind/bli_ind.h @@ -44,6 +44,9 @@ // level-3 typed APIs #include "bli_l3_ind_tapi.h" +// level-3 misc. optimizations +#include "bli_l3_ind_opt.h" + // level-3 cntx initialization #include "bli_gemmind_cntx.h" #include "bli_trsmind_cntx.h" diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index e2d1a0f86..e694f5384 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -51,6 +51,8 @@ static void* bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = NULL, NULL, NULL, NULL, NULL }, /* 4m1 */ { bli_gemm4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1, bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 }, +/* 1m */ { bli_gemm1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m, + bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m }, /* nat */ { bli_gemmnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat, bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat }, }; @@ -76,6 +78,8 @@ static bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, +/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, + {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} }, }; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index a484cf1a1..ce40bb105 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -34,23 +34,35 @@ #include "blis.h" -typedef void (*cntx_ft)( 
cntx_t* cntx ); +typedef void (*cntx_init_ft)( num_t dt, cntx_t* cntx ); +typedef void (*cntx_finalize_ft)( cntx_t* cntx ); -static void* bli_gemmind_cntx_fp[BLIS_NUM_IND_METHODS][2] = +static void* bli_gemmind_cntx_init_fp[BLIS_NUM_IND_METHODS] = { - /* _cntx_init _cntx_finalize */ -/* 3mh */ { bli_gemm3mh_cntx_init, bli_gemm3mh_cntx_finalize }, -/* 3m3 */ { bli_gemm3m3_cntx_init, bli_gemm3m3_cntx_finalize }, -/* 3m2 */ { bli_gemm3m2_cntx_init, bli_gemm3m2_cntx_finalize }, -/* 3m1 */ { bli_gemm3m1_cntx_init, bli_gemm3m1_cntx_finalize }, -/* 4mh */ { bli_gemm4mh_cntx_init, bli_gemm4mh_cntx_finalize }, -/* 4mb */ { bli_gemm4mb_cntx_init, bli_gemm4mb_cntx_finalize }, -/* 4m1 */ { bli_gemm4m1_cntx_init, bli_gemm4m1_cntx_finalize }, -/* nat */ { bli_gemmnat_cntx_init, bli_gemmnat_cntx_finalize } +/* 3mh */ bli_gemm3mh_cntx_init, +/* 3m3 */ bli_gemm3m3_cntx_init, +/* 3m2 */ bli_gemm3m2_cntx_init, +/* 3m1 */ bli_gemm3m1_cntx_init, +/* 4mh */ bli_gemm4mh_cntx_init, +/* 4mb */ bli_gemm4mb_cntx_init, +/* 4m1 */ bli_gemm4m1_cntx_init, +/* 1m */ bli_gemm1m_cntx_init, +/* nat */ bli_gemmnat_cntx_init +}; + +static void* bli_gemmind_cntx_finalize_fp[BLIS_NUM_IND_METHODS] = +{ +/* 3mh */ bli_gemm3mh_cntx_finalize, +/* 3m3 */ bli_gemm3m3_cntx_finalize, +/* 3m2 */ bli_gemm3m2_cntx_finalize, +/* 3m1 */ bli_gemm3m1_cntx_finalize, +/* 4mh */ bli_gemm4mh_cntx_finalize, +/* 4mb */ bli_gemm4mb_cntx_finalize, +/* 4m1 */ bli_gemm4m1_cntx_finalize, +/* 1m */ bli_gemm1m_cntx_finalize, +/* nat */ bli_gemmnat_cntx_finalize }; -#define BLIS_CNTX_INIT_INDEX 0 -#define BLIS_CNTX_FINALIZE_INDEX 1 // ----------------------------------------------------------------------------- @@ -62,7 +74,7 @@ void bli_gemmind_cntx_init_avail( num_t dt, cntx_t* cntx ) { ind_t method = bli_ind_oper_find_avail( BLIS_GEMM, dt ); - bli_gemmind_cntx_init( method, cntx ); + bli_gemmind_cntx_init( method, dt, cntx ); } void bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ) @@ -77,16 +89,16 @@ void 
bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ) // Execute the context initialization/finalization function associated // with a given induced method. -void bli_gemmind_cntx_init( ind_t method, cntx_t* cntx ) +void bli_gemmind_cntx_init( ind_t method, num_t dt, cntx_t* cntx ) { - cntx_ft func = bli_gemmind_cntx_init_get_func( method ); + cntx_init_ft func = bli_gemmind_cntx_init_get_func( method ); - func( cntx ); + func( dt, cntx ); } void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ) { - cntx_ft func = bli_gemmind_cntx_finalize_get_func( method ); + cntx_finalize_ft func = bli_gemmind_cntx_finalize_get_func( method ); func( cntx ); } @@ -95,17 +107,17 @@ void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ) void* bli_gemmind_cntx_init_get_func( ind_t method ) { - return bli_gemmind_cntx_fp[ method ][ BLIS_CNTX_INIT_INDEX ]; + return bli_gemmind_cntx_init_fp[ method ]; } void* bli_gemmind_cntx_finalize_get_func( ind_t method ) { - return bli_gemmind_cntx_fp[ method ][ BLIS_CNTX_FINALIZE_INDEX ]; + return bli_gemmind_cntx_finalize_fp[ method ]; } // ----------------------------------------------------------------------------- -void bli_gemm3m1_cntx_init( cntx_t* cntx ) +void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M1; @@ -122,18 +134,21 @@ void bli_gemm3m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 3.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 3.0, 3.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, @@ -151,7 +166,7 @@ void bli_gemm3m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3m2_cntx_init( cntx_t* cntx ) +void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M2; @@ -168,18 +183,21 @@ void bli_gemm3m2_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 3.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 3.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 3.0, 3.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 3.0, 3.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. 
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS, @@ -197,7 +215,7 @@ void bli_gemm3m2_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3m3_cntx_init( cntx_t* cntx ) +void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M3; @@ -214,18 +232,21 @@ void bli_gemm3m3_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 3.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 3.0, 3.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() @@ -256,7 +277,7 @@ void bli_gemm3m3_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3mh_cntx_init( cntx_t* cntx ) +void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3MH; @@ -273,18 +294,21 @@ void bli_gemm3mh_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() @@ -318,7 +342,7 @@ void bli_gemm3mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4m1_cntx_init( cntx_t* cntx ) +void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1A; @@ -335,18 +359,21 @@ void bli_gemm4m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 2.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. 
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, @@ -364,7 +391,7 @@ void bli_gemm4m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4mb_cntx_init( cntx_t* cntx ) +void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1B; @@ -381,18 +408,21 @@ void bli_gemm4mb_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 2.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 2.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 2.0, 2.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, @@ -410,7 +440,7 @@ void bli_gemm4mb_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4mh_cntx_init( cntx_t* cntx ) +void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4MH; @@ -427,18 +457,21 @@ void bli_gemm4mh_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() @@ -477,9 +510,82 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemmnat_cntx_init( cntx_t* cntx ) +void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) { - bli_gemm_cntx_init( cntx ); + const ind_t method = BLIS_1M; + + // Clear the context fields. + bli_cntx_obj_clear( cntx ); + + // Initialize the context with the current architecture's native + // level-3 gemm micro-kernel, and its output preferences. + bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); + + // Initialize the context with the virtual micro-kernel associated with + // the current induced method. + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); + + // Initialize the context with packm-related kernels. + bli_packm_cntx_init( dt, cntx ); + + if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 2.0, 2.0, // halve mc... 
+ BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1R, + cntx ); + } + else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, // halve nc... + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1E, + cntx ); + } +} + +void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ) +{ +} + +void bli_gemm1m_cntx_finalize( cntx_t* cntx ) +{ +} + +// ----------------------------------------------------------------------------- + +void bli_gemmnat_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm_cntx_init( dt, cntx ); } void bli_gemmnat_cntx_stage( dim_t stage, cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_gemmind_cntx.h b/frame/ind/cntx/bli_gemmind_cntx.h index c70da7b36..f49744c3f 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.h +++ b/frame/ind/cntx/bli_gemmind_cntx.h @@ -32,67 +32,48 @@ */ -#if 0 -// -// Generate prototypes for _cntx_init(), _cntx_stage(), and _cntx_finalize() -// for each induced method (including native execution) based on gemm. 
-// - -#undef GENPROT -#define GENPROT( opname, imeth ) \ -\ -void PASTEMAC2(opname,imeth,_cntx_init)( void ); \ -void PASTEMAC2(opname,imeth,_cntx_stage)( dim_t stage, cntx_t* cntx ); \ -void PASTEMAC2(opname,imeth,_cntx_finalize)( void ); - -GENPROT( gemm, nat ) -GENPROT( gemm, 3mh ) -GENPROT( gemm, 3m3 ) -GENPROT( gemm, 3m2 ) -GENPROT( gemm, 3m1 ) -GENPROT( gemm, 4mh ) -GENPROT( gemm, 4mb ) -GENPROT( gemm, 4m1 ) -#endif - -void bli_gemmnat_cntx_init( cntx_t* cntx ); +void bli_gemmnat_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemmnat_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemmnat_cntx_finalize( cntx_t* cntx ); -void bli_gemm3mh_cntx_init( cntx_t* cntx ); +void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3mh_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m3_cntx_init( cntx_t* cntx ); +void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m3_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m2_cntx_init( cntx_t* cntx ); +void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m2_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m1_cntx_init( cntx_t* cntx ); +void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m1_cntx_finalize( cntx_t* cntx ); -void bli_gemm4mh_cntx_init( cntx_t* cntx ); +void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4mh_cntx_finalize( cntx_t* cntx ); -void bli_gemm4mb_cntx_init( cntx_t* cntx ); +void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4mb_cntx_finalize( cntx_t* cntx ); -void bli_gemm4m1_cntx_init( cntx_t* cntx ); +void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4m1_cntx_stage( 
dim_t stage, cntx_t* cntx ); void bli_gemm4m1_cntx_finalize( cntx_t* cntx ); +void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ); +void bli_gemm1m_cntx_finalize( cntx_t* cntx ); + // ----------------------------------------------------------------------------- void bli_gemmind_cntx_init_avail( num_t dt, cntx_t* cntx ); void bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ); -void bli_gemmind_cntx_init( ind_t method, cntx_t* cntx ); +void bli_gemmind_cntx_init( ind_t method, num_t dt, cntx_t* cntx ); void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ); void* bli_gemmind_cntx_init_get_func( ind_t method ); diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 85212ba90..4cb0bf6ba 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -36,7 +36,7 @@ // ----------------------------------------------------------------------------- -void bli_trsm3m1_cntx_init( cntx_t* cntx ) +void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M1; @@ -57,18 +57,21 @@ void bli_trsm3m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 3.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 3.0, 3.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for native execution. 
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, @@ -82,7 +85,7 @@ void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsm4m1_cntx_init( cntx_t* cntx ) +void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1A; @@ -103,18 +106,21 @@ void bli_trsm4m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 2.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for native execution. bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, @@ -128,9 +134,86 @@ void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsmnat_cntx_init( cntx_t* cntx ) +void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) { - bli_trsm_cntx_init( cntx ); + const ind_t method = BLIS_1M; + + // Clear the context fields. + bli_cntx_obj_clear( cntx ); + + // Initialize the context with the current architecture's native + // level-3 gemm micro-kernel, and its output preferences. + bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); + + // Initialize the context with the virtual micro-kernels associated with + // the current induced method. 
+ bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMMTRSM_L_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMMTRSM_U_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_L_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); + + // Initialize the context with packm-related kernels. + bli_packm_cntx_init( dt, cntx ); + + if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 2.0, 2.0, // halve mc... + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1R, + cntx ); + } + else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, // halve nc... + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. 
+ bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1E, + cntx ); + } +} + +void bli_trsm1m_cntx_stage( dim_t stage, cntx_t* cntx ) +{ +} + +void bli_trsm1m_cntx_finalize( cntx_t* cntx ) +{ +} + +// ----------------------------------------------------------------------------- + +void bli_trsmnat_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_trsm_cntx_init( dt, cntx ); } void bli_trsmnat_cntx_finalize( cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_trsmind_cntx.h b/frame/ind/cntx/bli_trsmind_cntx.h index 3d3c883f9..49f7f0600 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.h +++ b/frame/ind/cntx/bli_trsmind_cntx.h @@ -32,29 +32,15 @@ */ -/* -// -// Generate prototypes for _cntx_init(), _cntx_stage(), and _cntx_finalize() -// for each induced method (including native execution) based on trsm. -// - -#undef GENPROT -#define GENPROT( opname, imeth ) \ -\ -void PASTEMAC2(opname,imeth,_cntx_init)( void ); \ -void PASTEMAC2(opname,imeth,_cntx_finalize)( void ); - -GENPROT( trsm, nat ) -GENPROT( trsm, 3m1 ) -GENPROT( trsm, 4m1 ) -*/ - -void bli_trsmnat_cntx_init( cntx_t* cntx ); +void bli_trsmnat_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsmnat_cntx_finalize( cntx_t* cntx ); -void bli_trsm4m1_cntx_init( cntx_t* cntx ); +void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsm4m1_cntx_finalize( cntx_t* cntx ); -void bli_trsm3m1_cntx_init( cntx_t* cntx ); +void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsm3m1_cntx_finalize( cntx_t* cntx ); +void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_trsm1m_cntx_finalize( cntx_t* cntx ); + diff --git a/frame/ind/include/bli_kernel_1m_macro_defs.h b/frame/ind/include/bli_kernel_1m_macro_defs.h new file mode 100644 index 000000000..4fc0ccb06 --- /dev/null +++ b/frame/ind/include/bli_kernel_1m_macro_defs.h @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_1M_MACRO_DEFS_H +#define BLIS_KERNEL_1M_MACRO_DEFS_H + + +// -- Define row access bools -------------------------------------------------- + +// gemm1m micro-kernels + +#define BLIS_CGEMM1M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM1M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + + +// -- Define default 1m-specific kernel names ---------------------------------- + +// +// Level-3 +// + +// gemm1m micro-kernels + +#ifndef BLIS_CGEMM1M_UKERNEL +#define BLIS_CGEMM1M_UKERNEL BLIS_CGEMM1M_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM1M_UKERNEL +#define BLIS_ZGEMM1M_UKERNEL BLIS_ZGEMM1M_UKERNEL_REF +#endif + +// gemmtrsm1m_l micro-kernels + +#ifndef BLIS_CGEMMTRSM1M_L_UKERNEL +#define BLIS_CGEMMTRSM1M_L_UKERNEL BLIS_CGEMMTRSM1M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM1M_L_UKERNEL +#define BLIS_ZGEMMTRSM1M_L_UKERNEL BLIS_ZGEMMTRSM1M_L_UKERNEL_REF +#endif + +// gemmtrsm1m_u micro-kernels + +#ifndef BLIS_CGEMMTRSM1M_U_UKERNEL +#define BLIS_CGEMMTRSM1M_U_UKERNEL BLIS_CGEMMTRSM1M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM1M_U_UKERNEL +#define BLIS_ZGEMMTRSM1M_U_UKERNEL BLIS_ZGEMMTRSM1M_U_UKERNEL_REF +#endif + +// trsm1m_l micro-kernels + +#ifndef BLIS_CTRSM1M_L_UKERNEL +#define BLIS_CTRSM1M_L_UKERNEL BLIS_CTRSM1M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM1M_L_UKERNEL +#define BLIS_ZTRSM1M_L_UKERNEL BLIS_ZTRSM1M_L_UKERNEL_REF +#endif + +// trsm1m_u micro-kernels + +#ifndef BLIS_CTRSM1M_U_UKERNEL +#define BLIS_CTRSM1M_U_UKERNEL BLIS_CTRSM1M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM1M_U_UKERNEL +#define BLIS_ZTRSM1M_U_UKERNEL BLIS_ZTRSM1M_U_UKERNEL_REF +#endif + + + +#endif diff --git a/frame/ind/include/bli_kernel_ind_macro_defs.h b/frame/ind/include/bli_kernel_ind_macro_defs.h index 7f43857f0..55eeb010b 100644 --- a/frame/ind/include/bli_kernel_ind_macro_defs.h +++ b/frame/ind/include/bli_kernel_ind_macro_defs.h @@ -41,9 +41,11 @@ #include
"bli_kernel_4mh_macro_defs.h" #include "bli_kernel_4mb_macro_defs.h" #include "bli_kernel_4m1_macro_defs.h" +#include "bli_kernel_1m_macro_defs.h" // Storage format headers #include "bli_packm_3mis_macro_defs.h" #include "bli_packm_4mi_macro_defs.h" #include "bli_packm_rih_macro_defs.h" +#include "bli_packm_1er_macro_defs.h" diff --git a/frame/ind/include/bli_kernel_ind_pre_macro_defs.h b/frame/ind/include/bli_kernel_ind_pre_macro_defs.h index b6020489e..47fbb4a28 100644 --- a/frame/ind/include/bli_kernel_ind_pre_macro_defs.h +++ b/frame/ind/include/bli_kernel_ind_pre_macro_defs.h @@ -140,6 +140,35 @@ #define BLIS_CTRSM4M1_U_UKERNEL_REF bli_ctrsm4m1_u_ukr_ref #define BLIS_ZTRSM4M1_U_UKERNEL_REF bli_ztrsm4m1_u_ukr_ref +// +// Level-3 1m +// + +// gemm1m micro-kernels + +#define BLIS_CGEMM1M_UKERNEL_REF bli_cgemm1m_ukr_ref +#define BLIS_ZGEMM1M_UKERNEL_REF bli_zgemm1m_ukr_ref + +// gemmtrsm1m_l micro-kernels + +#define BLIS_CGEMMTRSM1M_L_UKERNEL_REF bli_cgemmtrsm1m_l_ukr_ref +#define BLIS_ZGEMMTRSM1M_L_UKERNEL_REF bli_zgemmtrsm1m_l_ukr_ref + +// gemmtrsm1m_u micro-kernels + +#define BLIS_CGEMMTRSM1M_U_UKERNEL_REF bli_cgemmtrsm1m_u_ukr_ref +#define BLIS_ZGEMMTRSM1M_U_UKERNEL_REF bli_zgemmtrsm1m_u_ukr_ref + +// trsm1m_l micro-kernels + +#define BLIS_CTRSM1M_L_UKERNEL_REF bli_ctrsm1m_l_ukr_ref +#define BLIS_ZTRSM1M_L_UKERNEL_REF bli_ztrsm1m_l_ukr_ref + +// trsm1m_u micro-kernels + +#define BLIS_CTRSM1M_U_UKERNEL_REF bli_ctrsm1m_u_ukr_ref +#define BLIS_ZTRSM1M_U_UKERNEL_REF bli_ztrsm1m_u_ukr_ref + #endif diff --git a/frame/ind/include/bli_packm_1er_macro_defs.h b/frame/ind/include/bli_packm_1er_macro_defs.h new file mode 100644 index 000000000..fe550d1c5 --- /dev/null +++ b/frame/ind/include/bli_packm_1er_macro_defs.h @@ -0,0 +1,241 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_1ER_MACRO_DEFS_H +#define BLIS_KERNEL_1ER_MACRO_DEFS_H + + +// -- Define default 1e/1r-specific kernel names ------------------------------- + +// +// 1e +// + +// packm_2xk_1e kernels + +#ifndef BLIS_CPACKM_2XK_1E_KERNEL +#define BLIS_CPACKM_2XK_1E_KERNEL BLIS_CPACKM_2XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_1E_KERNEL +#define BLIS_ZPACKM_2XK_1E_KERNEL BLIS_ZPACKM_2XK_1E_KERNEL_REF +#endif + +// packm_4xk_1e kernels + +#ifndef BLIS_CPACKM_4XK_1E_KERNEL +#define BLIS_CPACKM_4XK_1E_KERNEL BLIS_CPACKM_4XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_1E_KERNEL +#define BLIS_ZPACKM_4XK_1E_KERNEL BLIS_ZPACKM_4XK_1E_KERNEL_REF +#endif + +// packm_6xk_1e kernels + +#ifndef BLIS_CPACKM_6XK_1E_KERNEL +#define BLIS_CPACKM_6XK_1E_KERNEL BLIS_CPACKM_6XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_1E_KERNEL +#define BLIS_ZPACKM_6XK_1E_KERNEL BLIS_ZPACKM_6XK_1E_KERNEL_REF +#endif + +// packm_8xk_1e kernels + +#ifndef BLIS_CPACKM_8XK_1E_KERNEL +#define BLIS_CPACKM_8XK_1E_KERNEL BLIS_CPACKM_8XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_1E_KERNEL +#define BLIS_ZPACKM_8XK_1E_KERNEL BLIS_ZPACKM_8XK_1E_KERNEL_REF +#endif + +// packm_10xk_1e kernels + +#ifndef BLIS_CPACKM_10XK_1E_KERNEL +#define BLIS_CPACKM_10XK_1E_KERNEL BLIS_CPACKM_10XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_1E_KERNEL +#define BLIS_ZPACKM_10XK_1E_KERNEL BLIS_ZPACKM_10XK_1E_KERNEL_REF +#endif + +// packm_12xk_1e kernels + +#ifndef BLIS_CPACKM_12XK_1E_KERNEL +#define BLIS_CPACKM_12XK_1E_KERNEL BLIS_CPACKM_12XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_1E_KERNEL +#define BLIS_ZPACKM_12XK_1E_KERNEL BLIS_ZPACKM_12XK_1E_KERNEL_REF +#endif + +// packm_14xk_1e kernels + +#ifndef BLIS_CPACKM_14XK_1E_KERNEL +#define BLIS_CPACKM_14XK_1E_KERNEL BLIS_CPACKM_14XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_1E_KERNEL +#define BLIS_ZPACKM_14XK_1E_KERNEL BLIS_ZPACKM_14XK_1E_KERNEL_REF +#endif + +// packm_16xk_1e kernels + +#ifndef 
BLIS_CPACKM_16XK_1E_KERNEL +#define BLIS_CPACKM_16XK_1E_KERNEL BLIS_CPACKM_16XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_1E_KERNEL +#define BLIS_ZPACKM_16XK_1E_KERNEL BLIS_ZPACKM_16XK_1E_KERNEL_REF +#endif + +// packm_30xk_1e kernels + +#ifndef BLIS_CPACKM_30XK_1E_KERNEL +#define BLIS_CPACKM_30XK_1E_KERNEL BLIS_CPACKM_30XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_30XK_1E_KERNEL +#define BLIS_ZPACKM_30XK_1E_KERNEL BLIS_ZPACKM_30XK_1E_KERNEL_REF +#endif + +// +// 1r +// + +// packm_2xk_1r kernels + +#ifndef BLIS_CPACKM_2XK_1R_KERNEL +#define BLIS_CPACKM_2XK_1R_KERNEL BLIS_CPACKM_2XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_1R_KERNEL +#define BLIS_ZPACKM_2XK_1R_KERNEL BLIS_ZPACKM_2XK_1R_KERNEL_REF +#endif + +// packm_3xk_1r kernels + +#ifndef BLIS_CPACKM_3XK_1R_KERNEL +#define BLIS_CPACKM_3XK_1R_KERNEL BLIS_CPACKM_3XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_3XK_1R_KERNEL +#define BLIS_ZPACKM_3XK_1R_KERNEL BLIS_ZPACKM_3XK_1R_KERNEL_REF +#endif + +// packm_4xk_1r kernels + +#ifndef BLIS_CPACKM_4XK_1R_KERNEL +#define BLIS_CPACKM_4XK_1R_KERNEL BLIS_CPACKM_4XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_1R_KERNEL +#define BLIS_ZPACKM_4XK_1R_KERNEL BLIS_ZPACKM_4XK_1R_KERNEL_REF +#endif + +// packm_6xk_1r kernels + +#ifndef BLIS_CPACKM_6XK_1R_KERNEL +#define BLIS_CPACKM_6XK_1R_KERNEL BLIS_CPACKM_6XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_1R_KERNEL +#define BLIS_ZPACKM_6XK_1R_KERNEL BLIS_ZPACKM_6XK_1R_KERNEL_REF +#endif + +// packm_8xk_1r kernels + +#ifndef BLIS_CPACKM_8XK_1R_KERNEL +#define BLIS_CPACKM_8XK_1R_KERNEL BLIS_CPACKM_8XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_1R_KERNEL +#define BLIS_ZPACKM_8XK_1R_KERNEL BLIS_ZPACKM_8XK_1R_KERNEL_REF +#endif + +// packm_10xk_1r kernels + +#ifndef BLIS_CPACKM_10XK_1R_KERNEL +#define BLIS_CPACKM_10XK_1R_KERNEL BLIS_CPACKM_10XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_1R_KERNEL +#define BLIS_ZPACKM_10XK_1R_KERNEL BLIS_ZPACKM_10XK_1R_KERNEL_REF +#endif + +// packm_12xk_1r 
kernels + +#ifndef BLIS_CPACKM_12XK_1R_KERNEL +#define BLIS_CPACKM_12XK_1R_KERNEL BLIS_CPACKM_12XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_1R_KERNEL +#define BLIS_ZPACKM_12XK_1R_KERNEL BLIS_ZPACKM_12XK_1R_KERNEL_REF +#endif + +// packm_14xk_1r kernels + +#ifndef BLIS_CPACKM_14XK_1R_KERNEL +#define BLIS_CPACKM_14XK_1R_KERNEL BLIS_CPACKM_14XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_1R_KERNEL +#define BLIS_ZPACKM_14XK_1R_KERNEL BLIS_ZPACKM_14XK_1R_KERNEL_REF +#endif + +// packm_16xk_1r kernels + +#ifndef BLIS_CPACKM_16XK_1R_KERNEL +#define BLIS_CPACKM_16XK_1R_KERNEL BLIS_CPACKM_16XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_1R_KERNEL +#define BLIS_ZPACKM_16XK_1R_KERNEL BLIS_ZPACKM_16XK_1R_KERNEL_REF +#endif + +// packm_30xk_1r kernels + +#ifndef BLIS_CPACKM_30XK_1R_KERNEL +#define BLIS_CPACKM_30XK_1R_KERNEL BLIS_CPACKM_30XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_30XK_1R_KERNEL +#define BLIS_ZPACKM_30XK_1R_KERNEL BLIS_ZPACKM_30XK_1R_KERNEL_REF +#endif + + + +#endif diff --git a/frame/ind/include/bli_packm_3mis_macro_defs.h b/frame/ind/include/bli_packm_3mis_macro_defs.h index 3abe40218..654172467 100644 --- a/frame/ind/include/bli_packm_3mis_macro_defs.h +++ b/frame/ind/include/bli_packm_3mis_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default 3mis-specific kernel names -------------------------------- -// -// Level-1m -// // packm_2xk_3mis kernels diff --git a/frame/ind/include/bli_packm_4mi_macro_defs.h b/frame/ind/include/bli_packm_4mi_macro_defs.h index 2f36de349..f5a617737 100644 --- a/frame/ind/include/bli_packm_4mi_macro_defs.h +++ b/frame/ind/include/bli_packm_4mi_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default 4mi-specific kernel names --------------------------------- -// -// Level-1m -// // packm_2xk_4mi kernels diff --git a/frame/ind/include/bli_packm_ind_pre_macro_defs.h b/frame/ind/include/bli_packm_ind_pre_macro_defs.h index ee5070e49..1bec1c5fd 100644 --- a/frame/ind/include/bli_packm_ind_pre_macro_defs.h +++ 
b/frame/ind/include/bli_packm_ind_pre_macro_defs.h @@ -177,5 +177,102 @@ +// packm_2xk_1e kernels + +#define BLIS_CPACKM_2XK_1E_KERNEL_REF bli_cpackm_2xk_1e_ref +#define BLIS_ZPACKM_2XK_1E_KERNEL_REF bli_zpackm_2xk_1e_ref + +// packm_4xk_1e kernels + +#define BLIS_CPACKM_4XK_1E_KERNEL_REF bli_cpackm_4xk_1e_ref +#define BLIS_ZPACKM_4XK_1E_KERNEL_REF bli_zpackm_4xk_1e_ref + +// packm_6xk_1e kernels + +#define BLIS_CPACKM_6XK_1E_KERNEL_REF bli_cpackm_6xk_1e_ref +#define BLIS_ZPACKM_6XK_1E_KERNEL_REF bli_zpackm_6xk_1e_ref + +// packm_8xk_1e kernels + +#define BLIS_CPACKM_8XK_1E_KERNEL_REF bli_cpackm_8xk_1e_ref +#define BLIS_ZPACKM_8XK_1E_KERNEL_REF bli_zpackm_8xk_1e_ref + +// packm_10xk_1e kernels + +#define BLIS_CPACKM_10XK_1E_KERNEL_REF bli_cpackm_10xk_1e_ref +#define BLIS_ZPACKM_10XK_1E_KERNEL_REF bli_zpackm_10xk_1e_ref + +// packm_12xk_1e kernels + +#define BLIS_CPACKM_12XK_1E_KERNEL_REF bli_cpackm_12xk_1e_ref +#define BLIS_ZPACKM_12XK_1E_KERNEL_REF bli_zpackm_12xk_1e_ref + +// packm_14xk_1e kernels + +#define BLIS_CPACKM_14XK_1E_KERNEL_REF bli_cpackm_14xk_1e_ref +#define BLIS_ZPACKM_14XK_1E_KERNEL_REF bli_zpackm_14xk_1e_ref + +// packm_16xk_1e kernels + +#define BLIS_CPACKM_16XK_1E_KERNEL_REF bli_cpackm_16xk_1e_ref +#define BLIS_ZPACKM_16XK_1E_KERNEL_REF bli_zpackm_16xk_1e_ref + +// packm_30xk_1e kernels + +#define BLIS_CPACKM_30XK_1E_KERNEL_REF bli_cpackm_30xk_1e_ref +#define BLIS_ZPACKM_30XK_1E_KERNEL_REF bli_zpackm_30xk_1e_ref + +// packm_2xk_1r kernels + +#define BLIS_CPACKM_2XK_1R_KERNEL_REF bli_cpackm_2xk_1r_ref +#define BLIS_ZPACKM_2XK_1R_KERNEL_REF bli_zpackm_2xk_1r_ref + +// packm_3xk_1r kernels + +#define BLIS_CPACKM_3XK_1R_KERNEL_REF bli_cpackm_3xk_1r_ref +#define BLIS_ZPACKM_3XK_1R_KERNEL_REF bli_zpackm_3xk_1r_ref + +// packm_4xk_1r kernels + +#define BLIS_CPACKM_4XK_1R_KERNEL_REF bli_cpackm_4xk_1r_ref +#define BLIS_ZPACKM_4XK_1R_KERNEL_REF bli_zpackm_4xk_1r_ref + +// packm_6xk_1r kernels + +#define BLIS_CPACKM_6XK_1R_KERNEL_REF bli_cpackm_6xk_1r_ref 
+#define BLIS_ZPACKM_6XK_1R_KERNEL_REF bli_zpackm_6xk_1r_ref + +// packm_8xk_1r kernels + +#define BLIS_CPACKM_8XK_1R_KERNEL_REF bli_cpackm_8xk_1r_ref +#define BLIS_ZPACKM_8XK_1R_KERNEL_REF bli_zpackm_8xk_1r_ref + +// packm_10xk_1r kernels + +#define BLIS_CPACKM_10XK_1R_KERNEL_REF bli_cpackm_10xk_1r_ref +#define BLIS_ZPACKM_10XK_1R_KERNEL_REF bli_zpackm_10xk_1r_ref + +// packm_12xk_1r kernels + +#define BLIS_CPACKM_12XK_1R_KERNEL_REF bli_cpackm_12xk_1r_ref +#define BLIS_ZPACKM_12XK_1R_KERNEL_REF bli_zpackm_12xk_1r_ref + +// packm_14xk_1r kernels + +#define BLIS_CPACKM_14XK_1R_KERNEL_REF bli_cpackm_14xk_1r_ref +#define BLIS_ZPACKM_14XK_1R_KERNEL_REF bli_zpackm_14xk_1r_ref + +// packm_16xk_1r kernels + +#define BLIS_CPACKM_16XK_1R_KERNEL_REF bli_cpackm_16xk_1r_ref +#define BLIS_ZPACKM_16XK_1R_KERNEL_REF bli_zpackm_16xk_1r_ref + +// packm_30xk_1r kernels + +#define BLIS_CPACKM_30XK_1R_KERNEL_REF bli_cpackm_30xk_1r_ref +#define BLIS_ZPACKM_30XK_1R_KERNEL_REF bli_zpackm_30xk_1r_ref + + + #endif diff --git a/frame/ind/include/bli_packm_rih_macro_defs.h b/frame/ind/include/bli_packm_rih_macro_defs.h index 543d197a0..c5c883e7d 100644 --- a/frame/ind/include/bli_packm_rih_macro_defs.h +++ b/frame/ind/include/bli_packm_rih_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default rih-specific kernel names --------------------------------- -// -// Level-1m -// // packm_2xk_rih kernels diff --git a/frame/ind/misc/bli_l3_ind_opt.h b/frame/ind/misc/bli_l3_ind_opt.h new file mode 100644 index 000000000..6a0be1885 --- /dev/null +++ b/frame/ind/misc/bli_l3_ind_opt.h @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_IND_OPT_H +#define BLIS_L3_IND_OPT_H + +#define bli_l3_ind_recast_1m_params( dt_exec, schema_a, c, \ + m, n, k, \ + pd_a, ps_a, \ + pd_b, ps_b, \ + rs_c, cs_c ) \ +{ \ + obj_t beta; \ +\ + /* Detach the beta scalar from c so that we can test its imaginary + component. */ \ + bli_obj_scalar_detach( c, &beta ); \ +\ + /* If beta is in the real domain, and c is row- or column-stored, + then we may proceed with the optimization. The optimization reassigns dt_exec via bli_datatype_proj_to_real() and rescales the dimension, panel and stride arguments in place, so every one of those macro arguments must be a modifiable lvalue. The multiply-by-1 statements are no-ops that keep the two branches listing the same parameters. Any schema_a that is not 1e-packed takes the else branch, since the 1r test there is commented out.
*/ \ + if ( bli_obj_imag_equals( &beta, &BLIS_ZERO ) && \ + !bli_is_gen_stored( rs_c, cs_c ) ) \ + { \ + dt_exec = bli_datatype_proj_to_real( dt_exec ); \ +\ + if ( bli_is_1e_packed( schema_a ) ) \ + { \ + m *= 2; \ + n *= 1; \ + k *= 2; \ + pd_a *= 2; ps_a *= 2; \ + pd_b *= 1; ps_b *= 2; \ + rs_c *= 1; cs_c *= 2; \ + } \ + else /* if ( bli_is_1r_packed( schema_a ) ) */ \ + { \ + m *= 1; \ + n *= 2; \ + k *= 2; \ + pd_a *= 1; ps_a *= 2; \ + pd_b *= 2; ps_b *= 2; \ + rs_c *= 2; cs_c *= 1; \ + } \ + } \ +} + +#endif diff --git a/frame/ind/oapi/bli_l3_3m4m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c similarity index 92% rename from frame/ind/oapi/bli_l3_3m4m_oapi.c rename to frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 40348e627..cb966d71c 100644 --- a/frame/ind/oapi/bli_l3_3m4m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -49,10 +49,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -63,7 +64,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -92,6 +93,7 @@ GENFRONT( gemm, gemm, 3m1, 1 ) GENFRONT( gemm, gemm, 4mh, 4 ) GENFRONT( gemm, gemm, 4mb, 1 ) GENFRONT( gemm, gemm, 4m1, 1 ) +GENFRONT( gemm, gemm, 1m, 1 ) // her2k GENFRONT( her2k, gemm, 3mh, 3 ) @@ -101,6 +103,7 @@ GENFRONT( her2k, gemm, 3m1, 1 ) GENFRONT( her2k, gemm, 4mh, 4 ) //GENFRONT( her2k, gemm, 4mb, 1 ) // Not implemented.
GENFRONT( her2k, gemm, 4m1, 1 ) +GENFRONT( her2k, gemm, 1m, 1 ) // syr2k GENFRONT( syr2k, gemm, 3mh, 3 ) @@ -110,6 +113,7 @@ GENFRONT( syr2k, gemm, 3m1, 1 ) GENFRONT( syr2k, gemm, 4mh, 4 ) //GENFRONT( syr2k, gemm, 4mb, 1 ) // Not implemented. GENFRONT( syr2k, gemm, 4m1, 1 ) +GENFRONT( syr2k, gemm, 1m, 1 ) // -- hemm/symm/trmm3 ---------------------------------------------------------- @@ -128,10 +132,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -142,7 +147,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -171,6 +176,7 @@ GENFRONT( hemm, gemm, 3m1, 1 ) GENFRONT( hemm, gemm, 4mh, 4 ) //GENFRONT( hemm, gemm, 4mb, 1 ) // Not implemented. GENFRONT( hemm, gemm, 4m1, 1 ) +GENFRONT( hemm, gemm, 1m, 1 ) // symm GENFRONT( symm, gemm, 3mh, 3 ) @@ -180,6 +186,7 @@ GENFRONT( symm, gemm, 3m1, 1 ) GENFRONT( symm, gemm, 4mh, 4 ) //GENFRONT( symm, gemm, 4mb, 1 ) // Not implemented. GENFRONT( symm, gemm, 4m1, 1 ) +GENFRONT( symm, gemm, 1m, 1 ) // trmm3 GENFRONT( trmm3, gemm, 3mh, 3 ) @@ -189,6 +196,7 @@ GENFRONT( trmm3, gemm, 3m1, 1 ) GENFRONT( trmm3, gemm, 4mh, 4 ) //GENFRONT( trmm3, gemm, 4mb, 1 ) // Not implemented. 
GENFRONT( trmm3, gemm, 4m1, 1 ) +GENFRONT( trmm3, gemm, 1m, 1 ) // -- herk/syrk ---------------------------------------------------------------- @@ -205,10 +213,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -219,7 +228,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -248,6 +257,7 @@ GENFRONT( herk, gemm, 3m1, 1 ) GENFRONT( herk, gemm, 4mh, 4 ) //GENFRONT( herk, gemm, 4mb, 1 ) // Not implemented. GENFRONT( herk, gemm, 4m1, 1 ) +GENFRONT( herk, gemm, 1m, 1 ) // syrk GENFRONT( syrk, gemm, 3mh, 3 ) @@ -257,6 +267,7 @@ GENFRONT( syrk, gemm, 3m1, 1 ) GENFRONT( syrk, gemm, 4mh, 4 ) //GENFRONT( syrk, gemm, 4mb, 1 ) // Not implemented. GENFRONT( syrk, gemm, 4m1, 1 ) +GENFRONT( syrk, gemm, 1m, 1 ) // -- trmm --------------------------------------------------------------------- @@ -273,6 +284,8 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ +\ cntx_t* cntx_p; \ dim_t i; \ \ @@ -285,7 +298,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -310,6 +323,7 @@ GENFRONT( trmm, gemm, 3m1, 1 ) //GENFRONT( trmm, gemm, 4mh, 4 ) // Unimplementable. //GENFRONT( trmm, gemm, 4mb, 1 ) // Unimplementable. 
GENFRONT( trmm, gemm, 4m1, 1 ) +GENFRONT( trmm, gemm, 1m, 1 ) // -- trsm --------------------------------------------------------------------- @@ -326,6 +340,8 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ +\ cntx_t* cntx_p; \ \ /* If the objects are in the real domain, execute the native @@ -337,7 +353,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ { \ /* NOTE: trsm cannot be implemented via any induced method that @@ -360,4 +376,5 @@ GENFRONT( trsm, trsm, 3m1, 1 ) //GENFRONT( trmm, trsm, 4mh, 4 ) // Unimplementable. //GENFRONT( trmm, trsm, 4mb, 1 ) // Unimplementable. GENFRONT( trsm, trsm, 4m1, 1 ) +GENFRONT( trsm, trsm, 1m, 1 ) diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 62fa794fa..7f8ae194c 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -55,6 +55,7 @@ GENPROT( nat ) GENPROT( ind ) GENPROT( 3m1 ) GENPROT( 4m1 ) +GENPROT( 1m ) // diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 68b664d65..c783714fe 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -55,10 +55,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -92,10 +93,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. 
*/ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -127,10 +129,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -161,10 +164,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -194,10 +198,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. 
*/ \ diff --git a/frame/ind/tapi/bli_l3_ind_tapi.c b/frame/ind/tapi/bli_l3_ind_tapi.c index 1c4ba3ba9..d4425b5f6 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.c +++ b/frame/ind/tapi/bli_l3_ind_tapi.c @@ -90,6 +90,7 @@ INSERT_GENTFUNC_BASIC0( gemm3m1 ) INSERT_GENTFUNC_BASIC0( gemm4mh ) INSERT_GENTFUNC_BASIC0( gemm4mb ) INSERT_GENTFUNC_BASIC0( gemm4m1 ) +INSERT_GENTFUNC_BASIC0( gemm1m ) // -- hemm --------------------------------------------------------------------- @@ -149,6 +150,7 @@ INSERT_GENTFUNC_BASIC0( hemm3mh ) INSERT_GENTFUNC_BASIC0( hemm3m1 ) INSERT_GENTFUNC_BASIC0( hemm4mh ) INSERT_GENTFUNC_BASIC0( hemm4m1 ) +INSERT_GENTFUNC_BASIC0( hemm1m ) // -- herk --------------------------------------------------------------------- @@ -200,6 +202,7 @@ INSERT_GENTFUNCR_BASIC0( herk3mh ) INSERT_GENTFUNCR_BASIC0( herk3m1 ) INSERT_GENTFUNCR_BASIC0( herk4mh ) INSERT_GENTFUNCR_BASIC0( herk4m1 ) +INSERT_GENTFUNCR_BASIC0( herk1m ) // -- her2k -------------------------------------------------------------------- @@ -258,6 +261,7 @@ INSERT_GENTFUNCR_BASIC0( her2k3mh ) INSERT_GENTFUNCR_BASIC0( her2k3m1 ) INSERT_GENTFUNCR_BASIC0( her2k4mh ) INSERT_GENTFUNCR_BASIC0( her2k4m1 ) +INSERT_GENTFUNCR_BASIC0( her2k1m ) // -- symm --------------------------------------------------------------------- @@ -317,6 +321,7 @@ INSERT_GENTFUNC_BASIC0( symm3mh ) INSERT_GENTFUNC_BASIC0( symm3m1 ) INSERT_GENTFUNC_BASIC0( symm4mh ) INSERT_GENTFUNC_BASIC0( symm4m1 ) +INSERT_GENTFUNC_BASIC0( symm1m ) // -- syrk --------------------------------------------------------------------- @@ -367,6 +372,7 @@ INSERT_GENTFUNC_BASIC0( syrk3mh ) INSERT_GENTFUNC_BASIC0( syrk3m1 ) INSERT_GENTFUNC_BASIC0( syrk4mh ) INSERT_GENTFUNC_BASIC0( syrk4m1 ) +INSERT_GENTFUNC_BASIC0( syrk1m ) // -- syr2k -------------------------------------------------------------------- @@ -424,6 +430,7 @@ INSERT_GENTFUNC_BASIC0( syr2k3mh ) INSERT_GENTFUNC_BASIC0( syr2k3m1 ) INSERT_GENTFUNC_BASIC0( syr2k4mh ) INSERT_GENTFUNC_BASIC0( syr2k4m1 ) 
+INSERT_GENTFUNC_BASIC0( syr2k1m ) // -- trmm3 -------------------------------------------------------------------- @@ -485,6 +492,7 @@ INSERT_GENTFUNC_BASIC0( trmm33mh ) INSERT_GENTFUNC_BASIC0( trmm33m1 ) INSERT_GENTFUNC_BASIC0( trmm34mh ) INSERT_GENTFUNC_BASIC0( trmm34m1 ) +INSERT_GENTFUNC_BASIC0( trmm31m ) // -- trmm --------------------------------------------------------------------- @@ -534,6 +542,7 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( trmm3m1 ) INSERT_GENTFUNC_BASIC0( trmm4m1 ) +INSERT_GENTFUNC_BASIC0( trmm1m ) // -- trsm --------------------------------------------------------------------- @@ -583,4 +592,5 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( trsm3m1 ) INSERT_GENTFUNC_BASIC0( trsm4m1 ) +INSERT_GENTFUNC_BASIC0( trsm1m ) diff --git a/frame/ind/tapi/bli_l3_ind_tapi.h b/frame/ind/tapi/bli_l3_ind_tapi.h index 029166c6c..7aa886b3d 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.h +++ b/frame/ind/tapi/bli_l3_ind_tapi.h @@ -58,6 +58,7 @@ INSERT_GENTPROT_BASIC( gemm3m1 ) INSERT_GENTPROT_BASIC( gemm4mh ) INSERT_GENTPROT_BASIC( gemm4mb ) INSERT_GENTPROT_BASIC( gemm4m1 ) +INSERT_GENTPROT_BASIC( gemm1m ) #undef GENTPROT @@ -83,6 +84,7 @@ INSERT_GENTPROT_BASIC( hemm3mh ) INSERT_GENTPROT_BASIC( hemm3m1 ) INSERT_GENTPROT_BASIC( hemm4mh ) INSERT_GENTPROT_BASIC( hemm4m1 ) +INSERT_GENTPROT_BASIC( hemm1m ) #undef GENTPROTR @@ -107,6 +109,7 @@ INSERT_GENTPROTR_BASIC( her2k3mh ) INSERT_GENTPROTR_BASIC( her2k3m1 ) INSERT_GENTPROTR_BASIC( her2k4mh ) INSERT_GENTPROTR_BASIC( her2k4m1 ) +INSERT_GENTPROTR_BASIC( her2k1m ) #undef GENTPROTR @@ -129,6 +132,7 @@ INSERT_GENTPROTR_BASIC( herk3mh ) INSERT_GENTPROTR_BASIC( herk3m1 ) INSERT_GENTPROTR_BASIC( herk4mh ) INSERT_GENTPROTR_BASIC( herk4m1 ) +INSERT_GENTPROTR_BASIC( herk1m ) #undef GENTPROT @@ -154,6 +158,7 @@ INSERT_GENTPROT_BASIC( symm3mh ) INSERT_GENTPROT_BASIC( symm3m1 ) INSERT_GENTPROT_BASIC( symm4mh ) INSERT_GENTPROT_BASIC( symm4m1 ) +INSERT_GENTPROT_BASIC( symm1m ) #undef GENTPROT @@ -178,6 +183,7 
@@ INSERT_GENTPROT_BASIC( syr2k3mh ) INSERT_GENTPROT_BASIC( syr2k3m1 ) INSERT_GENTPROT_BASIC( syr2k4mh ) INSERT_GENTPROT_BASIC( syr2k4m1 ) +INSERT_GENTPROT_BASIC( syr2k1m ) #undef GENTPROT @@ -200,6 +206,7 @@ INSERT_GENTPROT_BASIC( syrk3mh ) INSERT_GENTPROT_BASIC( syrk3m1 ) INSERT_GENTPROT_BASIC( syrk4mh ) INSERT_GENTPROT_BASIC( syrk4m1 ) +INSERT_GENTPROT_BASIC( syrk1m ) #undef GENTPROT @@ -226,6 +233,7 @@ INSERT_GENTPROT_BASIC( trmm33mh ) INSERT_GENTPROT_BASIC( trmm33m1 ) INSERT_GENTPROT_BASIC( trmm34mh ) INSERT_GENTPROT_BASIC( trmm34m1 ) +INSERT_GENTPROT_BASIC( trmm31m ) #undef GENTPROT @@ -247,6 +255,7 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( trmm3m1 ) INSERT_GENTPROT_BASIC( trmm4m1 ) +INSERT_GENTPROT_BASIC( trmm1m ) #undef GENTPROT @@ -268,4 +277,5 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( trsm3m1 ) INSERT_GENTPROT_BASIC( trsm4m1 ) +INSERT_GENTPROT_BASIC( trsm1m ) diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c new file mode 100644 index 000000000..f686aa7ac --- /dev/null +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -0,0 +1,179 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t row_pref = !col_pref; \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] /* NOTE(review): ct holds ctype elements but its element count is computed with sizeof( ctype_r ); confirm the intended buffer size. */ \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r =
&PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r beta_use; \ +\ + ctype_r* c_use; \ + inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride OR if for some reason the storage of c is not the + preferred storage of the micro-kernel, then we compute the + alpha*a*b product into temporary storage and then accumulate that + result into c afterwards. */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ + else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements. NOTE(review): col_pref above is queried with the complex datatype dt rather than dt_r; confirm that it reflects the real micro-kernel preference. */ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + beta_use = *zero_r; \ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ + else \ + { \ + /* In a typical case, we use the real part of beta and accumulate + directly into the output matrix c.
*/ \ + beta_use = beta_r; \ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ + } \ +\ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. The real micro-kernel is invoked with k2 = 2 * k, and with a and b reinterpreted as arrays of real elements (a_r, b_r). */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + &beta_use, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ +\ + /* If necessary, accumulate the final result in ct back to c. The caller-supplied complex beta is passed to xpbys here; beta_use was set to zero for the micro-kernel call in this case. */ \ + if ( using_ct ) \ + { \ + dim_t i, j; \ +\ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) + diff --git a/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h b/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h index d7d5a258f..9b2dd5e5a 100644 --- a/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h +++ b/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h @@ -55,4 +55,5 @@ INSERT_GENTPROTCO_BASIC( gemm3m1_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4mh_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4mb_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4m1_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemm1m_ukr_ref ) diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c new file mode 100644 index 000000000..7d746304c --- /dev/null +++ b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c @@ -0,0 +1,244 @@ +/* + + BLIS + An
object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ +/* Reference gemmtrsm micro-kernel for the 1m method: computes bt = -1.0 * a1x * bx1 with the real-domain gemm micro-kernel, accumulates b11 = alpha * b11 + bt according to the 1e/1r packing schema of b, and then calls the virtual trsm micro-kernel on a11, b11 and c11. */ \ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a1x, \ + ctype* restrict a11, \ + ctype* restrict bx1, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ +\ + PASTECH(ch,trsm_ukr_ft) \ + ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ +\ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t mr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ + const dim_t nr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ +\ + ctype bt[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_bt; \ + inc_t cs_bt; \ +\ + inc_t rs_bt_r; \ + inc_t cs_bt_r; \ +\ + const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const pack_t schema_b = bli_cntx_schema_b( cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype_r* restrict a1x_r = ( ctype_r* )a1x; \ +\ + ctype_r* restrict bx1_r = ( ctype_r* )bx1; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + const ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ + const ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ +\ + ctype_r* b_use; \ + inc_t rs_b_use; \ + inc_t cs_b_use; \ +\ +\ + /* Handle alphas with non-zero imaginary components.
*/ \ + /* NOTE: This branch should never execute because alphas with + non-zero imaginary components should be applied during + packing, and so the only alphas we should see here are + those exclusively in the real domain, either because the + value originally had no imaginary component (e.g. 4.0) or + because a 1.0 was sent in as a placeholder since the alpha + was applied during packing. */ \ + if ( 0 ) \ + if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ + { \ + bli_abort(); \ +\ +/* + ctype_r* restrict one_r = PASTEMAC(chr,1); \ +\ + const inc_t ld_b = rs_b; \ +\ + PASTEMAC(ch,scal1ms_mxn)( schema_b, \ + mr, \ + nr, \ + alpha, \ + b11, rs_b, cs_b, ld_b ); \ +\ + alpha_r = *one_r; \ +*/ \ + } \ +\ +\ + { \ + /* Set the strides for the temporary bt matrix based on the native + real domain micro-kernel storage preferences. */ \ + if ( col_pref ) { rs_bt = 1; cs_bt = mr; \ + rs_bt_r = 1; cs_bt_r = mr_r; } \ + else { rs_bt = nr; cs_bt = 1; \ + rs_bt_r = nr_r; cs_bt_r = 1; } \ +\ + b_use = ( ctype_r* )bt; \ + rs_b_use = rs_bt_r; \ + cs_b_use = cs_bt_r; \ + } \ +\ +\ + /* Since b11 is stored in the 1e or 1r schema, we cannot update it + directly, and instead must compute the matrix product in a local + temporary microtile and then accumulate it into b11 according to + its schema.
*/ \ +\ +\ + /* lower: bt = -1.0 * a10 * b01; + upper: bt = -1.0 * a12 * b21; */ \ + rgemm_ukr \ + ( \ + k2, \ + minus_one_r, \ + a1x_r, \ + bx1_r, \ + zero_r, \ + b_use, rs_b_use, cs_b_use, \ + data, \ + cntx \ + ); \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t ld_b = rs_b; \ +\ + ctype* restrict b11_ri = ( ctype* )b11; \ + ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \ +\ + dim_t i, j; \ +\ + /* b11 = alpha * b11 + bt; */ \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype* restrict beta11_ri = b11_ri + i*rs_b + j*cs_b; \ + ctype_r* restrict beta11_r = &PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r* restrict beta11_i = &PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype* restrict beta11_ir = b11_ir + i*rs_b + j*cs_b; \ +\ + PASTEMAC2(chr,ch,xpbyris)( *beta11t_r, \ + *beta11t_i, \ + alpha_r, \ + alpha_i, /* alpha_i not referenced */ \ + *beta11_r, \ + *beta11_i ); \ +\ + PASTEMAC(ch,sets)( -*beta11_i, \ + *beta11_r, *beta11_ir ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t ld_b = rs_b; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = cs_b; \ +\ + ctype_r* restrict b11_r = ( ctype_r* )b11; \ + ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \ +\ + dim_t i, j; \ +\ + /* b11 = alpha * b11 + bt; */ \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2; \ + ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2; \ +\ + PASTEMAC2(chr,ch,xpbyris)( *beta11t_r, \ + *beta11t_i, \ + alpha_r, \ + alpha_i, /* alpha_i not referenced */ \ +
*beta11_r, \ + *beta11_i ); \ + } \ + } \ +\ +\ + /* b11 = inv(a11) * b11; + c11 = b11; */ \ + ctrsm_vir_ukr \ + ( \ + a11, \ + b11, \ + c11, rs_c, cs_c, \ + data, \ + cntx \ + ); \ +} + +INSERT_GENTFUNCCO_BASIC2( gemmtrsm1m_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) +INSERT_GENTFUNCCO_BASIC2( gemmtrsm1m_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) + diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h b/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h index 7ec51ad8d..615482e41 100644 --- a/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h +++ b/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h @@ -55,3 +55,6 @@ INSERT_GENTPROTCO_BASIC( gemmtrsm4m1_u_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemmtrsm3m1_l_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemmtrsm3m1_u_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemmtrsm1m_l_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemmtrsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c new file mode 100644 index 000000000..92da659ca --- /dev/null +++ b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c @@ -0,0 +1,448 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Reference trsm micro-kernel (lower variant) for the 1m method: forward substitution over rows i = 0..m-1 of the packed micro-panel b using the packed block a, whose diagonal holds inverted elements; each solved element is written to c and stored back into b in its 1e or 1r layout. */ \ +void PASTEMAC(ch,varname) \ + ( \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t m = mr; \ + const dim_t n = nr; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + const inc_t ld_a = cs_a; \ + const inc_t ld_b = rs_b; \ +\ + const pack_t schema_b = bli_cntx_schema_b( cntx ); \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t rs_a2 = 1 * rs_a; \ + const inc_t cs_a2 = 2 * cs_a; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ +\ + ctype* restrict b_ri = (
ctype* )b; \ + ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ + ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ + ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \ + ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \ + ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \ + ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \ + ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel.
*/ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ + PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + } \ + } \ + } \ + else /* ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 1 * cs_b; \ +\ + ctype* restrict a_ri = ( ctype* )a; \ + /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ + ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ + ctype* restrict a10t_ri = a_ri + (i )*rs_a + (0 )*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_r = b_r + (0 )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_i = b_i + (0 )*rs_b2 + (0 )*cs_b2; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b01_r = B0_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b01_i = B0_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype* restrict alpha10_ri = a10t_ri + (l )*cs_a; \ +
ctype_r* restrict alpha10_r = &PASTEMAC(ch,real)( *alpha10_ri ); \ + ctype_r* restrict alpha10_i = &PASTEMAC(ch,imag)( *alpha10_ri ); \ + ctype_r* restrict beta01_r = b01_r + (l )*rs_b2; \ + ctype_r* restrict beta01_i = b01_i + (l )*rs_b2; \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *beta11_r, \ + *beta11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm1m_l_ukr_ref ) + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Reference trsm micro-kernel (upper variant) for the 1m method: backward substitution over rows i = m-1..0 of the packed micro-panel b using the packed block a, whose diagonal holds inverted elements; each solved element is written to c and stored back into b in its 1e or 1r layout. */ \ +void PASTEMAC(ch,varname) \ + ( \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t m = mr; \ + const dim_t n = nr; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + const inc_t ld_a = cs_a; \ + const inc_t ld_b = rs_b; \ +\ + const pack_t schema_b =
bli_cntx_schema_b( cntx ); \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t rs_a2 = 1 * rs_a; \ + const inc_t cs_a2 = 2 * cs_a; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ +\ + ctype* restrict b_ri = ( ctype* )b; \ + ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \ + ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \ + ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ + ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \ + ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \ + ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ +
rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ + PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 1 * cs_b; \ +\ + ctype* restrict a_ri = ( ctype* )a; \ + /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ + ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ + ctype* restrict a12t_ri = a_ri + (i )*rs_a + (i+1)*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_r = b_r + (i+1)*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_i = b_i + (i+1)*rs_b2 + (0 )*cs_b2; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b21_r = B2_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b21_i = B2_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c +
(i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype* restrict alpha12_ri = a12t_ri + (l )*cs_a; \ + ctype_r* restrict alpha12_r = &PASTEMAC(ch,real)( *alpha12_ri ); \ + ctype_r* restrict alpha12_i = &PASTEMAC(ch,imag)( *alpha12_ri ); \ + ctype_r* restrict beta21_r = b21_r + (l )*rs_b2; \ + ctype_r* restrict beta21_i = b21_i + (l )*rs_b2; \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11.
*/ \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *beta11_r, \ + *beta11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h b/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h index abad11caf..77d502a3c 100644 --- a/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h +++ b/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h @@ -51,3 +51,6 @@ INSERT_GENTPROTCO_BASIC( trsm4m1_u_ukr_ref ) INSERT_GENTPROTCO_BASIC( trsm3m1_l_ukr_ref ) INSERT_GENTPROTCO_BASIC( trsm3m1_u_ukr_ref ) +INSERT_GENTPROTCO_BASIC( trsm1m_l_ukr_ref ) +INSERT_GENTPROTCO_BASIC( trsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c deleted file mode 100644 index 5fc8e012c..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission.
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a10, \ - ctype* restrict a11, \ - ctype* restrict b01, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - ctype_r ab_r[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - 
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a10_r = ( ctype_r* )a10; \ - ctype_r* restrict a10_i = ( ctype_r* )a10 + is_a; \ - ctype_r* restrict a10_ri = ( ctype_r* )a10 + 2*is_a; \ -\ - ctype_r* restrict b01_r = ( ctype_r* )b01; \ - ctype_r* restrict b01_i = ( ctype_r* )b01 + is_b; \ - ctype_r* restrict b01_ri = ( ctype_r* )b01 + 2*is_b; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ - ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( + a10.r * b01.r - a10.i * b01.i ); - b11.i = alpha.r * b11.i - ( a10.ri * b01.ri - a10.r * b01.r - a10.i * b01.i ); */ \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ -\ - /* ab.r = a10.r * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_r, \ - b01_r, \ - zero_r, \ - ab_r, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_ri, b01_ri, *data ); \ -\ - /* ab.i = a10.i * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_i, \ - b01_i, \ - zero_r, \ - ab_i, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a10.ri * b01.ri; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_ri, \ - b01_ri, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11.r = alpha.r * b11.r - ab.r; - b11.r = b11.r + ab.i; - b11.i = b11.i + ab.r; - b11.i = b11.i + ab.i; */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ - ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ -\ - PASTEMAC(chr,scals)( alpha_r, beta11_r ); \ -\ - PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11_r, \ - beta11_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Update the ri part of b11. 
*/ \ - PASTEMAC(chr,add3s)( beta11_r, \ - beta11_i, \ - *(b11_ri + i*rs_b + j*cs_b) ); \ - } \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -\ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r after", m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i after", m, n, \ - b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_r", k, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_i", k, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r", m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i", m, n, \ - b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m1_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c deleted file mode 100644 index 9d82ba8c9..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a12, \ - ctype* restrict a11, \ - ctype* restrict b21, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - ctype_r ab_r[ 
BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a12_r = ( ctype_r* )a12; \ - ctype_r* restrict a12_i = ( ctype_r* )a12 + is_a; \ - ctype_r* restrict a12_ri = ( ctype_r* )a12 + 2*is_a; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ - ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*is_b; \ -\ - ctype_r* restrict b21_r = ( ctype_r* )b21; \ - ctype_r* restrict b21_i = ( ctype_r* )b21 + is_b; \ - ctype_r* restrict b21_ri = ( ctype_r* )b21 + 2*is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( + a12.r * b21.r - a12.i * b21.i ); - b11.i = alpha.r * b11.i - ( a12.ri * b21.ri - a12.r * b21.r - a12.i * b21.i ); */ \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ -\ - /* ab.r = a12.r * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_r, \ - b21_r, \ - zero_r, \ - ab_r, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_ri, b21_ri, *data ); \ -\ - /* ab.i = a12.i * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_i, \ - b21_i, \ - zero_r, \ - ab_i, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a12.ri * b21.ri; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_ri, \ - b21_ri, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11.r = alpha.r * b11.r - ab.r; - b11.r = b11.r + ab.i; - b11.i = b11.i + ab.r; - b11.i = b11.i + ab.i; */ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ - ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ -\ - PASTEMAC(chr,scals)( alpha_r, beta11_r ); \ -\ - PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11_r, \ - beta11_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Update the ri part of b11. 
*/ \ - PASTEMAC(chr,add3s)( beta11_r, \ - beta11_i, \ - *(b11_ri + i*rs_b + j*cs_b) ); \ - } \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m1_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c deleted file mode 100644 index c979d5cbf..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a10, \ - ctype* restrict a11, \ - ctype* restrict b01, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a10_r = ( ctype_r* )a10; \ - ctype_r* restrict a10_i = ( ctype_r* )a10 + is_a; \ -\ - ctype_r* restrict b01_r = ( ctype_r* )b01; \ - ctype_r* restrict b01_i = ( ctype_r* )b01 + is_b; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - 
ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -/* -printf( "gemmtrsm4m1_l_ukr: is_a = %lu is_b = %lu\n", is_a, is_b ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_r", m, k+m, \ - a10_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_i", m, k+m, \ - a10_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( a10.r * b01.r - a10.i * b01.i ); - b11.i = alpha.r * b11.i - ( a10.r * b01.i + a10.i * b01.r ); */ \ -\ - bli_auxinfo_set_next_ab( a10_r, b01_i, *data ); \ -\ - /* b11.r = alpha.r * b11.r - a10.r * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_r, \ - b01_r, \ - &alpha_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_r, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a10.r * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_r, \ - b01_i, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ -\ - /* b11.i = 1.0 * b11.i - a10.i * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_i, \ - b01_r, \ - one_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.r = 1.0 * b11.r + a10.i * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_i, \ - b01_i, \ - one_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r post-gemm", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i post-gemm", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r after", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i after", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m1_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c 
b/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c deleted file mode 100644 index 9d1d1927e..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a12, \ - ctype* restrict a11, \ - ctype* restrict b21, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a12_r = ( ctype_r* )a12; \ - ctype_r* restrict a12_i = ( ctype_r* )a12 + is_a; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ -\ - ctype_r* restrict b21_r = ( ctype_r* )b21; \ - ctype_r* restrict b21_i = ( ctype_r* )b21 + is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_r", m, k+m, \ - a11_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_i", m, k+m, \ - a11_r+is_a, 1, 
PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_r", k+m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_i", k+m, n, \ - b11_r+is_b, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ - \ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. */ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( a12.r * b21.r - a12.i * b21.i ); - b11.i = alpha.r * b11.r - ( a12.r * b21.i + a12.i * b21.r ); */ \ -\ - bli_auxinfo_set_next_ab( a12_r, b21_i, *data ); \ -\ - /* b11.r = alpha.r * b11.r - a12.r * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_r, \ - b21_r, \ - &alpha_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_r, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a12.r * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_r, \ - b21_i, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ -\ - /* b11.i = 1.0 * b11.i - a12.i * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_i, \ - b21_r, \ - one_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.r = 1.0 * b11.r + a12.i * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_i, \ - b21_i, \ - one_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, 
\ - cntx \ - ); \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m1_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c deleted file mode 100644 index 62fff68e0..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a10t_r = a_r + (i )*rs_a + 
(0 )*cs_a; \ - ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ - ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ - ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ - ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha10_r, \ - *alpha10_i, \ - *beta01_r, \ - *beta01_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ -\ - /* Update the ri part of the packed panel. */ \ - PASTEMAC(chr,add3s)( beta11c_r, \ - beta11c_i, \ - *beta11_ri ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm3m1_l_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c deleted file mode 100644 index af916ed33..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - iter - 1; \ - n_behind = iter; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a12t_r = a_r + 
(i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ - ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ - ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha12_r, \ - *alpha12_i, \ - *beta21_r, \ - *beta21_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ -\ - /* Update the ri part of the packed panel. */ \ - PASTEMAC(chr,add3s)( beta11c_r, \ - beta11c_i, \ - *beta11_ri ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm3m1_u_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c deleted file mode 100644 index 06274d95c..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_r", m, m, \ - a_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_i", m, m, \ - a_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r", m, n, \ - b_r, 
PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i", m, n, \ - b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a10t_r = a_r + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ - ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ - ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ - ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha10_r, \ - *alpha10_i, \ - *beta01_r, \ - *beta01_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. 
We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ - } \ - } \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r after", m, n, \ - b_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i after", m, n, \ - b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm4m1_l_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c deleted file mode 100644 index 5711dc8ce..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - 
iter - 1; \ - n_behind = iter; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a12t_r = a_r + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ - ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ - ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha12_r, \ - *alpha12_i, \ - *beta21_r, \ - *beta21_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. 
*/ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm4m1_u_ukr_ref ) - diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index ad2bb0b40..9cccce228 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -60,7 +60,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -96,7 +96,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim2( m, m ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -139,7 +139,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -186,7 +186,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. 
*/ \ @@ -281,7 +281,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -319,7 +319,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -359,7 +359,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 9e982032f..0b13b8eb1 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -36,7 +36,7 @@ # Makefile # # Field G. Van Zee -# +# # Makefile for standalone BLIS test drivers. 
# @@ -189,6 +189,7 @@ D3M1 := -DIND=BLIS_3M1 D4MHW := -DIND=BLIS_4MH D4M1B := -DIND=BLIS_4M1B D4M1A := -DIND=BLIS_4M1A +D1M := -DIND=BLIS_1M DNAT := -DIND=BLIS_NAT # Implementation string @@ -199,6 +200,7 @@ STR_3M1 := -DSTR=\"3m1\" STR_4MHW := -DSTR=\"4mhw\" STR_4M1B := -DSTR=\"4m1b\" STR_4M1A := -DSTR=\"4m1a\" +STR_1M := -DSTR=\"1m\" STR_NAT := -DSTR=\"asm\" STR_OBL := -DSTR=\"openblas\" STR_MKL := -DSTR=\"mkl\" @@ -209,13 +211,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=80 \ - -DP_END=2000 \ - -DP_INC=80 +PDEF_ST := -DP_BEGIN=100 \ + -DP_END=1000 \ + -DP_INC=100 -PDEF_MT := -DP_BEGIN=80 \ - -DP_END=4000 \ - -DP_INC=80 +PDEF_MT := -DP_BEGIN=100 \ + -DP_END=2000 \ + -DP_INC=100 @@ -259,6 +261,8 @@ blis-gemm-st: \ test_zgemm_4m1b_blis_st.x \ test_cgemm_4m1a_blis_st.x \ test_zgemm_4m1a_blis_st.x \ + test_cgemm_1m_blis_st.x \ + test_zgemm_1m_blis_st.x \ test_cgemm_asm_blis_st.x \ test_zgemm_asm_blis_st.x @@ -280,6 +284,8 @@ blis-gemm-mt: \ test_zgemm_4m1b_blis_mt.x \ test_cgemm_4m1a_blis_mt.x \ test_zgemm_4m1a_blis_mt.x \ + test_cgemm_1m_blis_mt.x \ + test_zgemm_1m_blis_mt.x \ test_cgemm_asm_blis_mt.x \ test_zgemm_asm_blis_mt.x @@ -411,6 +417,19 @@ test_z%_4m1a_blis_mt.o: test_%.c test_c%_4m1a_blis_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D4M1A) $(STR_4M1A) $(STR_MT) -c $< -o $@ +# blis 1m +test_z%_1m_blis_st.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@ + +test_c%_1m_blis_st.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@ + +test_z%_1m_blis_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ + +test_c%_1m_blis_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ + # blis asm test_d%_asm_blis_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) 
-c $< -o $@ diff --git a/test/3m4m/runme.sh b/test/3m4m/runme.sh index bb65a5db5..794f0ba00 100755 --- a/test/3m4m/runme.sh +++ b/test/3m4m/runme.sh @@ -75,16 +75,16 @@ test_ops_r="${l3_ops}" if [ ${sys} = "blis" ]; then #test_impls="openblas mkl 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" - test_impls="openblas 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" elif [ ${sys} = "stampede" ]; then - test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" #test_impls="openblas mkl asm_blis" elif [ ${sys} = "wahlberg" ]; then - test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" test_impls="openblas acml asm_blis" fi diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index c8e9ec5d5..7b16f584f 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -79,23 +79,19 @@ int main( int argc, char** argv ) k_input = -1; #if 0 - num_t dt_real = bli_datatype_proj_to_real( DT ); + cntx_t cntx; - bli_gemm_cntx_init( &cntx ); + // Initialize a context for the current induced method and datatype. + bli_gemmind_cntx_init( IND, dt, &cntx ); - // Extract the kc blocksize for the requested datatype and its - // real analogue. - dim_t kc = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); - dim_t kc_real = bli_cntx_get_blksz_def_dt( dt_real, BLIS_KC, &cntx ); + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); + +#elif 0 + + k_input = 256; - // Assign the k dimension depending on which implementation is - // being tested. 
Note that the BLIS_NAT case handles the real - // domain cases as well as native complex. - if ( IND == BLIS_NAT ) k_input = kc; - else if ( IND == BLIS_3M1 ) k_input = kc_real / 3; - else if ( IND == BLIS_4M1A ) k_input = kc_real / 2; - else k_input = kc_real; #endif // Choose the char corresponding to the requested datatype. @@ -154,7 +150,7 @@ int main( int argc, char** argv ) bli_obj_set_conjtrans( transb, b ); bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( -(1.0/1.0), 0.0, &beta ); + bli_setsc( (1.0/1.0), 0.0, &beta ); bli_copym( &c, &c_save ); diff --git a/testsuite/input.general b/testsuite/input.general index 0bf9053bd..9dba50df6 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -9,7 +9,7 @@ # 1 # Number of repeats per experiment (best result is reported) -c # Matrix storage scheme(s) to test: +rc # Matrix storage scheme(s) to test: # 'c' = col-major storage; 'g' = general stride storage; # 'r' = row-major storage c # Vector storage scheme(s) to test: @@ -26,7 +26,7 @@ sdcz # Datatype(s) to test: # 's' = single real; 'c' = single complex; # 'd' = double real; 'z' = double complex 100 # Problem size: first to test -400 # Problem size: maximum to test +500 # Problem size: maximum to test 100 # Problem size: increment between experiments # Complex level-3 implementations to test 1 # 3mh ('1' = enable; '0' = disable) @@ -36,6 +36,7 @@ sdcz # Datatype(s) to test: 1 # 4mh ('1' = enable; '0' = disable) 1 # 4m1b ('1' = enable; '0' = disable) 1 # 4m1a ('1' = enable; '0' = disable) +1 # 1m ('1' = enable; '0' = disable) 1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index 6f5515127..41c0b9160 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -168,7 +168,7 @@ void libblis_test_axpy2v_experiment cntx_t cntx; // Initialize a context. 
- bli_axpy2v_cntx_init( &cntx ); + bli_axpy2v_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 706359ca4..8da15c315 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -166,7 +166,7 @@ void libblis_test_axpyf_experiment cntx_t cntx; // Initialize a context. - bli_axpyf_cntx_init( &cntx ); + bli_axpyf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index 36b88cc2f..6c1440e95 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -171,7 +171,7 @@ void libblis_test_dotaxpyv_experiment cntx_t cntx; // Initialize a context. - bli_dotaxpyv_cntx_init( &cntx ); + bli_dotaxpyv_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index dd83dc49e..a7abdba87 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -176,7 +176,7 @@ void libblis_test_dotxaxpyf_experiment cntx_t cntx; // Initialize a context. - bli_dotxaxpyf_cntx_init( &cntx ); + bli_dotxaxpyf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 3a29b41b7..8adec7c1d 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -168,7 +168,7 @@ void libblis_test_dotxf_experiment cntx_t cntx; // Initialize a context. 
- bli_dotxf_cntx_init( &cntx ); + bli_dotxf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index 222dca395..89a8bd7c3 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -259,8 +259,6 @@ void libblis_test_gemm_impl { case BLIS_TEST_SEQ_FRONT_END: bli_gemm( alpha, a, b, beta, c ); - //bli_gemm4m( alpha, a, b, beta, c ); - //bli_gemm3m( alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 514fdf66a..f418ac6e5 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -173,7 +173,7 @@ void libblis_test_gemm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_gemm_cntx_init( &cntx ); + bli_gemm_cntx_init( datatype, &cntx ); // Map the dimension specifier to actual dimensions. k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index afd436d7f..172ff053a 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -198,7 +198,7 @@ void libblis_test_gemmtrsm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_trsm_cntx_init( &cntx ); + bli_trsm_cntx_init( datatype, &cntx ); // Map the dimension specifier to actual dimensions. k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index bd14d13b4..993c134b4 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -427,6 +427,10 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_4M1A ]) ); + // Read whether to enable 1m. 
+ libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_1M ]) ); + // Read whether to native (complex) execution. libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_NAT ]) ); @@ -597,8 +601,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) //char int_type_size_str[8]; gint_t int_type_size; ind_t im; - cntx_t cntx_s; - cntx_t* cntx = &cntx_s; + cntx_t cntx_local; + cntx_t cntx_local_c; + cntx_t cntx_local_z; + cntx_t* cntx = &cntx_local; + cntx_t* cntx_c = &cntx_local_c; + cntx_t* cntx_z = &cntx_local_z; // If bli_info_get_int_type_size() returns 32 or 64, the size is forced. // Otherwise, the size is chosen automatically. We query the result of @@ -721,7 +729,10 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmnat_cntx_init( cntx ); + // Initialize a context for the gemm family, assuming native execution. + // We use BLIS_DOUBLE for the datatype, but the dt argument is actually + // only used when initializing contexts for induced methods. 
+ bli_gemmnat_cntx_init( BLIS_DOUBLE, cntx ); libblis_test_fprintf_c( os, "level-3 blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d %7d %7d\n", @@ -825,42 +836,43 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmind_cntx_init( im, cntx ); + bli_gemmind_cntx_init( im, BLIS_SCOMPLEX, cntx_c ); + bli_gemmind_cntx_init( im, BLIS_DCOMPLEX, cntx_z ); libblis_test_fprintf_c( os, "level-3 blocksizes c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MC, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) ); libblis_test_fprintf_c( os, " kc %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_KC, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_KC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) ); libblis_test_fprintf_c( os, " nc %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) ); libblis_test_fprintf_c( os, " kc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( 
BLIS_SCOMPLEX, BLIS_KC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_KC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) ); libblis_test_fprintf_c( os, " nc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mr %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) ); libblis_test_fprintf_c( os, " nr %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NR, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NR, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mr packdim %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MR, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MR, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) ); libblis_test_fprintf_c( os, " nr packdim %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NR, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NR, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) ); libblis_test_fprintf_c( os, "\n" 
); libblis_test_fprintf_c( os, "micro-kernel types c z\n" ); libblis_test_fprintf_c( os, " gemm %7s %7s\n", @@ -880,14 +892,17 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_info_get_trsm_u_ukr_impl_string( im, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmind_cntx_finalize( im, cntx ); + bli_gemmind_cntx_finalize( im, cntx_c ); + bli_gemmind_cntx_finalize( im, cntx_z ); } bli_ind_disable_all(); // We use hemv's context because we know it is initialized with all of the fields // we will be outputing. - bli_hemv_cntx_init( cntx ); + // We use BLIS_DOUBLE for the datatype, but the dt argument is actually + // only used when initializing contexts for induced methods. + bli_hemv_cntx_init( BLIS_DOUBLE, cntx ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS misc. other info ---\n" ); @@ -955,6 +970,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, " 4mh? %u\n", params->ind_enable[ BLIS_4MH ] ); libblis_test_fprintf_c( os, " 4m1b (4mb)? %u\n", params->ind_enable[ BLIS_4M1B ] ); libblis_test_fprintf_c( os, " 4m1a (4m1)? %u\n", params->ind_enable[ BLIS_4M1A ] ); + libblis_test_fprintf_c( os, " 1m? %u\n", params->ind_enable[ BLIS_1M ] ); libblis_test_fprintf_c( os, " native? %u\n", params->ind_enable[ BLIS_NAT ] ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index bf5f2d6bd..e7ccb4b43 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -178,7 +178,7 @@ void libblis_test_trsm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_trsm_cntx_init( &cntx ); + bli_trsm_cntx_init( datatype, &cntx ); // Fix m and n to MR and NR, respectively. 
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, &cntx ); diff --git a/version b/version index 0c62199f1..566318cf2 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.1 +0.2.1-82 From 1c732d3ddc4ac0861d3b0e0dd15eb7e071615502 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 25 Jan 2017 16:25:46 -0600 Subject: [PATCH 03/23] Added 1m-specific APIs for bp, pb gemm algorithms. Details: - Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the body of bli_gemm_cntl_create() replaced with a call to the former. - Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now, bli_cntl_free() can check if the thread parameter is NULL, and if so, call the latter, and otherwise call the former. - Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in terms of bli_gemm1mxx_cntx_init(), which behaves the same as bli_gemm1m_cntx_init() did before, except that an extra bool parameter (is_pb) is used to support both bp and pb algorithms (including to support the anti-preference field described below). - Added support for "anti-preference" in context. The anti_pref field, when true, will toggle the boolean return value of routines such as bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of causing BLIS to transpose the operation to achieve disagreement (rather than agreement) between the storage of C and the micro-kernel output preference. This disagreement is needed for panel-block implementations, since they induce a transposition of the suboperation immediately before the macro-kernel is called, which changes the apparent storage of C. For now, anti-preference is used only with the pb algorithm for 1m (and not with any other non-1m implementation). 
- Defined new functions, bli_cntx_l3_ukr_eff_prefers_storage_of() bli_cntx_l3_ukr_eff_dislikes_storage_of() bli_cntx_l3_nat_ukr_eff_prefers_storage_of() bli_cntx_l3_nat_ukr_eff_dislikes_storage_of() which are identical to their non-"eff" (effectively) counterparts except that they take the anti-preference field of the context into account. - Explicitly initialize the anti-pref field to FALSE in bli_gks_cntx_set_l3_nat_ukr_prefs(). - Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel in terms of the existing block-panel macro-kernel _ker_var2(). This technique requires inducing transposes on all operands and swapping the A and B. - Changed bli_obj_induce_trans() macro so that pack-related fields are also changed to reflect the induced transposition. - Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily specify the 1m algorithm (block-panel or panel-block). - Renamed the following cntx_t-related macros: bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block() bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel() bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel() and updated all instantiations. Also updated the field names in the cntx_t struct. - Comment updates. 
--- frame/1m/packm/bli_packm_init.c | 4 +- frame/3/bli_l3_cntl.c | 4 +- frame/3/bli_l3_cntx.c | 10 +- frame/3/gemm/bli_gemm_cntl.c | 104 +++++++++- frame/3/gemm/bli_gemm_cntl.h | 14 ++ frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_ker_var1.c | 56 ++++++ frame/3/gemm/bli_gemm_var.h | 1 + frame/base/bli_cntl.c | 52 ++++- frame/base/bli_cntl.h | 13 ++ frame/base/bli_cntx.c | 102 ++++++++-- frame/base/bli_cntx.h | 94 ++++++--- frame/base/bli_gks.c | 3 + frame/include/bli_obj_macro_defs.h | 16 ++ frame/include/bli_type_defs.h | 8 +- frame/ind/cntx/bli_gemmind_cntx.c | 133 ++++++++----- frame/ind/cntx/bli_gemmind_cntx.h | 3 + frame/ind/cntx/bli_trsmind_cntx.c | 24 +-- frame/ind/oapi/bli_l3_1mbppb_oapi.c | 85 ++++++++ frame/ind/oapi/bli_l3_3m4m1m_oapi.c | 8 + frame/ind/oapi/bli_l3_ind_oapi.h | 14 ++ frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c | 130 +++++++----- .../ukernels/gemm/bli_gemm1m_ukr_ref.c.prev | 188 ++++++++++++++++++ .../ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c | 2 +- frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c | 4 +- 25 files changed, 891 insertions(+), 183 deletions(-) create mode 100644 frame/3/gemm/bli_gemm_ker_var1.c create mode 100644 frame/ind/oapi/bli_l3_1mbppb_oapi.c create mode 100644 frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index ccf88f3cb..d828f698d 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -121,11 +121,11 @@ siz_t bli_packm_init if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) { - schema = bli_cntx_get_pack_schema_a( cntx ); + schema = bli_cntx_get_pack_schema_a_block( cntx ); } else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) { - schema = bli_cntx_get_pack_schema_b( cntx ); + schema = bli_cntx_get_pack_schema_b_panel( cntx ); } else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index a8dfee1ba..4fe3fe7f5 100644 --- 
a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -70,8 +70,8 @@ void bli_l3_cntl_create_if else { // If the user provided a control tree, create a copy and use it - // instead (so that it can be used to cache things like pack mem_t - // entries). + // instead (so that threads can use its local tree as a place to + // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( cntl_orig ); } } diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 8b4b01572..161e68160 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -63,9 +63,8 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_gemm_cntx_finalize( cntx_t* cntx ) @@ -106,9 +105,8 @@ void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_trsm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b3494b174..775ca2544 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create opid_t family ) { - void* macro_kernel_p = bli_gemm_ker_var2; + return bli_gemmbp_cntl_create( family ); +} +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var2; // Change the macro-kernel if the operation family is herk or trmm. 
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; @@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create ( - bli_gemm_packa, + bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, BLIS_MR, BLIS_KR, @@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix B. cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create ( - bli_gemm_packb, + bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, BLIS_KR, BLIS_NR, @@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create return gemm_cntl_vl_mm; } +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var1; + + // Change the macro-kernel if the operation family is herk or trmm. + //if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; + + // Create two nodes for the macro-kernel. + cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create + ( + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. + ); + + cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + gemm_cntl_ub_ke + ); + + // Create a node for packing matrix A (which is really the right-hand + // operand "B"). + cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_gemm_packb, // pack the right-hand operand + bli_packm_blk_var1, + BLIS_KR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + gemm_cntl_pb_ub + ); + + // Create a node for partitioning the n dimension by MC. 
+ cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create + ( + BLIS_MC, + bli_gemm_blk_var2, + gemm_cntl_packb + ); + + // Create a node for packing matrix B (which is really the left-hand + // operand "A"). + cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_gemm_packa, // pack the left-hand operand + bli_packm_blk_var1, + BLIS_NR, + BLIS_KR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + gemm_cntl_op_pb + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + ( + BLIS_KC, + bli_gemm_blk_var3, + gemm_cntl_packa + ); + + // Create a node for partitioning the m dimension by NC. + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + ( + BLIS_NC, + bli_gemm_blk_var1, + gemm_cntl_mm_op + ); + + return gemm_cntl_vl_mm; +} + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 5b985327c..6da6cd768 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create opid_t family ); +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ); + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ); + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index d3b11c43d..acceabbe8 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -73,7 +73,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to 
access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, b_local ); diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/bli_gemm_ker_var1.c new file mode 100644 index 000000000..7b485a6b7 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var1.c @@ -0,0 +1,56 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_ker_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Implement _ker_var1() in terms of _ker_var2() by transposing the + // entire suboperation (which also requires swapping A and B). + + bli_obj_induce_trans( *a ); + bli_obj_induce_trans( *b ); + bli_obj_induce_trans( *c ); + + bli_gemm_ker_var2( b, a, c, cntx, cntl, thread ); +} + diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index c66587fda..88412c3d8 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 ) GENPROT( gemm_packa ) GENPROT( gemm_packb ) +GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index 2b45a5de3..cac290da9 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -97,6 +97,16 @@ void bli_cntl_free cntl_t* cntl, thrinfo_t* thread ) +{ + if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread ); + else bli_cntl_free_wo_thrinfo( cntl ); +} + +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ) { // Base case: simply return when asked to free NULL nodes. if ( cntl == NULL ) return; @@ -112,7 +122,7 @@ void bli_cntl_free { // Recursively free all memory associated with the sub-node and its // children. 
- bli_cntl_free( cntl_sub_node, thread_sub_node ); + bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node ); } // Free the current node's params field, if it is non-NULL. @@ -122,8 +132,8 @@ void bli_cntl_free } // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the current thread - // is chief for its group, and only if the mem_t is allocated. + // broker from which it originated, but only if the mem_t entry is + // allocated, and only if the current thread is chief for its group. if ( bli_thread_am_ochief( thread ) ) if ( bli_mem_is_alloc( cntl_pack_mem ) ) { @@ -134,6 +144,42 @@ void bli_cntl_free bli_cntl_obj_free( cntl ); } +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ) +{ + // Base case: simply return when asked to free NULL nodes. + if ( cntl == NULL ) return; + + cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); + void* cntl_params = bli_cntl_params( cntl ); + mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); + + { + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free_wo_thrinfo( cntl_sub_node ); + } + + // Free the current node's params field, if it is non-NULL. + if ( cntl_params != NULL ) + { + bli_free_intl( cntl_params ); + } + + // Release the current node's pack mem_t entry back to the memory + // broker from which it originated, but only if the mem_t entry is + // allocated. + if ( bli_mem_is_alloc( cntl_pack_mem ) ) + { + bli_membrk_release( cntl_pack_mem ); + } + + // Free the current node. 
+ bli_cntl_obj_free( cntl ); +} + // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 7b6000bb9..fd0413f4f 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -75,12 +75,25 @@ void bli_cntl_obj_clear cntl_t* cntl ); +// ----------------------------------------------------------------------------- + void bli_cntl_free ( cntl_t* cntl, thrinfo_t* thread ); +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ); + cntl_t* bli_cntl_copy ( cntl_t* cntl diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index e4299eb49..f8cdf1fc4 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx ) return bli_cntx_method( cntx ); } -pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ) { - return bli_cntx_schema_a( cntx ); + return bli_cntx_schema_a_block( cntx ); } -pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ) { - return bli_cntx_schema_b( cntx ); + return bli_cntx_schema_b_panel( cntx ); +} + +pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ) +{ + return bli_cntx_schema_c_panel( cntx ); +} + +bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ) +{ + return bli_cntx_anti_pref( cntx ); } #endif @@ -705,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method, bli_cntx_set_method( method, cntx ); } -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void 
bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); } -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_c( pack_t schema_c, +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ) +{ + bli_cntx_set_schema_c_panel( schema_c, cntx ); +} + +#if 0 +void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, cntx_t* cntx ) { - bli_cntx_set_schema_c( schema_c, cntx ); + bli_cntx_set_anti_pref( anti_pref, cntx ); } +#endif void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, dim_t m, dim_t n, dim_t k ) @@ -904,6 +922,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. 
+ if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +// ----------------------------------------------------------------------------- + bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) @@ -953,6 +997,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + // ----------------------------------------------------------------------------- void bli_cntx_print( cntx_t* cntx ) diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 9c97c3312..a76cdd329 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -59,6 +59,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + bool_t anti_pref; + dim_t* thrloop; membrk_t* membrk; @@ -113,26 +115,30 @@ typedef struct cntx_s \ ( (cntx)->method ) -#define bli_cntx_schema_a( cntx ) \ +#define bli_cntx_schema_a_block( cntx ) \ \ - ( (cntx)->schema_a ) + ( (cntx)->schema_a_block ) -#define bli_cntx_schema_b( cntx ) \ +#define bli_cntx_schema_b_panel( cntx ) \ \ - ( (cntx)->schema_b ) + ( (cntx)->schema_b_panel ) -#define bli_cntx_schema_c( cntx ) \ +#define bli_cntx_schema_c_panel( cntx ) \ \ - ( (cntx)->schema_c ) + ( (cntx)->schema_c_panel ) -#define bli_cntx_membrk( cntx ) \ +#define bli_cntx_anti_pref( cntx ) \ \ - ( (cntx)->membrk ) + ( (cntx)->anti_pref ) #define bli_cntx_thrloop( cntx ) \ \ ( (cntx)->thrloop ) +#define 
bli_cntx_membrk( cntx ) \ +\ + ( (cntx)->membrk ) + #if 1 #define bli_cntx_jc_way( cntx ) \ \ @@ -211,24 +217,24 @@ typedef struct cntx_s (cntx_p)->method = _method; \ } -#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \ +#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \ { \ - (cntx_p)->schema_a = _schema_a; \ + (cntx_p)->schema_a_block = _schema_a_block; \ } -#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \ +#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \ { \ - (cntx_p)->schema_b = _schema_b; \ + (cntx_p)->schema_b_panel = _schema_b_panel; \ } -#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \ +#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \ { \ - (cntx_p)->schema_c = _schema_c; \ + (cntx_p)->schema_c_panel = _schema_c_panel; \ } -#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \ { \ - (cntx_p)->membrk = _membrk; \ + (cntx_p)->anti_pref = _anti_pref; \ } #define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \ @@ -241,6 +247,11 @@ typedef struct cntx_s (cntx_p)->thrloop[ BLIS_KR ] = 1; \ } +#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +{ \ + (cntx_p)->membrk = _membrk; \ +} + // cntx_t query (complex) #define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \ @@ -323,13 +334,17 @@ typedef struct cntx_s \ bli_cntx_method( cntx ) -#define bli_cntx_get_pack_schema_a( cntx ) \ +#define bli_cntx_get_pack_schema_a_block( cntx ) \ \ - bli_cntx_schema_a( cntx ) + bli_cntx_schema_a_block( cntx ) -#define bli_cntx_get_pack_schema_b( cntx ) \ +#define bli_cntx_get_pack_schema_b_panel( cntx ) \ \ - bli_cntx_schema_b( cntx ) + bli_cntx_schema_b_panel( cntx ) + +#define bli_cntx_get_pack_schema_c_panel( cntx ) \ +\ + bli_cntx_schema_c_panel( cntx ) #define bli_cntx_get_membrk( cntx ) \ \ @@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); // l1vkr_t ker_id, // cntx_t* cntx ); //ind_t bli_cntx_get_ind_method( cntx_t* cntx ); -//pack_t 
bli_cntx_get_pack_schema_a( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ); +//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ); dim_t bli_cntx_get_num_threads( cntx_t* cntx ); dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ); @@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func, cntx_t* cntx ); void bli_cntx_set_ind_method( ind_t method, cntx_t* cntx ); -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_c( pack_t schema_c, - cntx_t* cntx ); +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ); +//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, +// cntx_t* cntx ); void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, @@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); @@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, 
l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); // print function diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 32f99a832..2ada1556e 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -606,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr, mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ]; bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref ); + + // Explicitly set the anti-preference to FALSE. + bli_cntx_set_anti_pref( FALSE, cntx ); } diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 0d5992900..a7a69243e 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -877,6 +877,12 @@ bli_obj_width_stored( obj ) (obj).n_panel = n0; \ } +#define bli_obj_set_panel_dims( m0, n0, obj ) \ +{ \ + bli_obj_set_panel_length( m0, obj ); \ + bli_obj_set_panel_width( n0, obj ); \ +} + #define bli_obj_set_panel_dim( panel_dim, obj ) \ { \ (obj).pd = panel_dim; \ @@ -985,6 +991,7 @@ bli_obj_width_stored( obj ) #define bli_obj_induce_trans( obj ) \ { \ { \ + /* Induce transposition among basic fields. */ \ dim_t m_ = bli_obj_length( obj ); \ dim_t n_ = bli_obj_width( obj ); \ inc_t rs_ = bli_obj_row_stride( obj ); \ @@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj ) \ if ( bli_obj_is_upper_or_lower( obj ) ) \ bli_obj_toggle_uplo( obj ); \ +\ + /* Induce transposition among packed fields. */ \ + dim_t m_padded_ = bli_obj_padded_length( obj ); \ + dim_t n_padded_ = bli_obj_padded_width( obj ); \ + dim_t m_panel_ = bli_obj_panel_length( obj ); \ + dim_t n_panel_ = bli_obj_panel_width( obj ); \ +\ + bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \ + bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \ \ /* Note that this macro DOES NOT touch the transposition bit! 
If the calling code is using this macro to handle an object whose diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c4cfd3514..1a120d5da 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -975,9 +975,11 @@ typedef struct cntx_s opid_t family; ind_t method; - pack_t schema_a; - pack_t schema_b; - pack_t schema_c; + pack_t schema_a_block; + pack_t schema_b_panel; + pack_t schema_c_panel; + + bool_t anti_pref; dim_t thrloop[ BLIS_NUM_LOOPS ]; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index ce40bb105..5b7a70c3c 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -151,9 +151,8 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -200,9 +199,8 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -249,9 +247,8 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - BLIS_PACKED_COL_PANELS_3MS, - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MS, cntx ); } void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -259,15 +256,15 @@ void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. if ( stage == 0 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); } } @@ -311,9 +308,8 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -321,18 +317,18 @@ void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. 
if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RPI, - BLIS_PACKED_COL_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx ); } } @@ -376,9 +372,8 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -425,9 +420,8 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -474,9 +468,8 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -484,23 +477,23 @@ void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 3 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } } @@ -511,6 +504,22 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) +{ + // Default to context for block-panel algorithm. 
+ bli_gemm1mbp_cntx_init( dt, cntx ); +} + +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, FALSE, cntx ); +} + +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, TRUE, cntx ); +} + +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ) { const ind_t method = BLIS_1M; @@ -529,8 +538,24 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) // Initialize the context with packm-related kernels. bli_packm_cntx_init( dt, cntx ); + // Initialize the blocksizes according to the micro-kernel preference as + // well as the algorithm. if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) { + // This branch is used for algorithms 1m_c_bp, 1m_r_pb. + + // Set the pack_t schemas for the c_bp or r_pb algorithms. + if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. bli_gks_cntx_set_blkszs @@ -544,14 +569,23 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) BLIS_KR, BLIS_KR, 1.0, 1.0, cntx ); - - // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { + // This branch is used for algorithms 1m_r_bp, 1m_c_pb. + + // Set the pack_t schemas for the r_bp or c_pb algorithms. 
+ if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. bli_gks_cntx_set_blkszs @@ -565,12 +599,15 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) BLIS_KR, BLIS_KR, 1.0, 1.0, cntx ); - - // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); } + + // Set the anti-preference field to TRUE when executing a panel-block + // algorithm, and FALSE otherwise. This will cause higher-level generic + // code to establish (if needed) disagreement between the storage of C and + // the micro-kernel output preference so that the two will come back into + // agreement in the panel-block macro-kernel (which implemented in terms + // of the block-panel macro-kernel with some induced transpositions). 
+ bli_cntx_set_anti_pref( is_pb, cntx ); } void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_gemmind_cntx.h b/frame/ind/cntx/bli_gemmind_cntx.h index f49744c3f..ea47968b1 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.h +++ b/frame/ind/cntx/bli_gemmind_cntx.h @@ -65,6 +65,9 @@ void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4m1_cntx_finalize( cntx_t* cntx ); void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ); void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm1m_cntx_finalize( cntx_t* cntx ); diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 4cb0bf6ba..a13d0d05a 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -74,9 +74,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI, + BLIS_PACKED_COL_PANELS_3MI, + cntx ); } void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) @@ -123,9 +123,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI, + BLIS_PACKED_COL_PANELS_4MI, + cntx ); } void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) @@ -174,9 +174,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1R, + cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { @@ -195,9 +195,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1E, + cntx ); } } diff --git a/frame/ind/oapi/bli_l3_1mbppb_oapi.c b/frame/ind/oapi/bli_l3_1mbppb_oapi.c new file mode 100644 index 000000000..e91f27ea2 --- /dev/null +++ b/frame/ind/oapi/bli_l3_1mbppb_oapi.c @@ -0,0 +1,85 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// -- gemmbp/gemmpb ------------------------------------------------------------ + +#undef GENFRONT +#define GENFRONT( opname, imeth, alg ) \ +\ +void PASTEMAC2(opname,imeth,alg) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *c ); \ + cntx_t cntx; \ + cntl_t* cntl_p; \ +\ + /* If the objects are in the real domain, execute the native + implementation. */ \ + if ( bli_obj_is_real( *c ) ) \ + { \ + PASTEMAC(opname,nat)( alpha, a, b, beta, c, NULL ); \ + return; \ + } \ +\ + /* Initialize a local 1m context for the current algorithm (bp or pb). */ \ + PASTEMAC3(opname,imeth,alg,_cntx_init)( dt, &cntx ); \ +\ + /* Create a control tree for the current algorithm (bp or pb). */ \ + cntl_p = PASTEMAC2(opname,alg,_cntl_create)( BLIS_GEMM ); \ +\ + /* Invoke the operation's front end using the context and control + tree we just created. */ \ + PASTEMAC(opname,_front)( alpha, a, b, beta, c, &cntx, cntl_p ); \ +\ + /* Free the control tree. Since the implementation will only make + copies of it (and not use it directly) we do not need to supply + a thread object. */ \ + bli_cntl_free( cntl_p, NULL ); \ +\ + /* Finalize the local context. 
*/ \ + PASTEMAC2(opname,imeth,_cntx_finalize)( &cntx ); \ +} + +// gemm +GENFRONT( gemm, 1m, bp ) +GENFRONT( gemm, 1m, pb ) + diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index cb966d71c..36281f543 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -62,6 +62,14 @@ void PASTEMAC(opname,imeth) \ PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \ return; \ } \ +\ + /* A temporary hack to easily specify the 1m algorithm (block-panel or + panel-block). */ \ + if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \ + { \ + bli_gemm1mbp( alpha, a, b, beta, c ); \ + return; \ + } \ \ /* Initialize a local context if the one provided is NULL. */ \ bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 7f8ae194c..f5907d414 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -80,3 +80,17 @@ GENPROT_NO2OP( 3m2 ) GENPROT_NO2OP( 4mh ) GENPROT_NO2OP( 4mb ) + +// +// Generate object-based prototypes for 1m methods that specify an algorithm +// (e.g., block-panel or panel-block). 
+// + +#undef GENPROT +#define GENPROT( imeth, alg ) \ +\ +void PASTEMAC2(gemm,imeth,alg) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); \ + +GENPROT( 1m, bp ) +GENPROT( 1m, pb ) + diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c index f686aa7ac..ff23a36f4 100644 --- a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const bool_t row_pref = !col_pref; \ + /*const bool_t row_pref = !col_pref;*/ \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -77,10 +77,8 @@ void PASTEMAC(ch,varname) \ ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ - const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ -\ - ctype_r beta_use; \ + ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ + ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ \ ctype_r* c_use; \ inc_t rs_c_use; \ @@ -96,75 +94,71 @@ void PASTEMAC(ch,varname) \ if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ +\ + /* Sanity check: These should never occur because storage/preference + agreement is handled at a higher level. 
*/ \ + /* + if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ + */ \ +\ \ /* If beta has a non-zero imaginary component OR if c is stored with - general stride OR if for some reason the storage of c is not the - preferred storage of the micro-kernel, then we compute the - alpha*a*b product into temporary storage and then accumulate that - result into c afterwards. */ \ - if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ - else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should never occur + (though we could handle them if they did occur). */ \ + if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ + /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ \ if ( using_ct ) \ { \ + /* In the atypical cases, we compute the result into temporary + workspace ct and then accumulated it back to c at the end. */ \ +\ /* Set the strides of ct based on the preference of the underlying native real domain gemm micro-kernel. Note that we set the ct strides in units of complex elements. */ \ if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ else { rs_ct = nr; cs_ct = 1; } \ \ - beta_use = *zero_r; \ c_use = ( ctype_r* )ct; \ rs_c_use = rs_ct; \ cs_c_use = cs_ct; \ - } \ - else \ - { \ - /* In a typical case, we use the real part of beta and accumulate - directly into the output matrix c. 
*/ \ - beta_use = beta_r; \ - c_use = ( ctype_r* )c; \ - rs_c_use = rs_c; \ - cs_c_use = cs_c; \ - } \ \ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ \ - /* Convert the strides from being in units of complex elements to - be in units of real elements. Note that we don't need to check for - general storage here because that case corresponds to the scenario - where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ - if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ - else rs_c_use *= 2; \ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ \ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + zero_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ \ - /* The following gemm micro-kernel call implements the 1m method, - which induces a complex matrix multiplication by calling the - real matrix micro-kernel on micro-panels that have been packed - according to the 1e and 1r formats. */ \ -\ - /* c = beta * c + alpha_r * a * b; */ \ - rgemm_ukr \ - ( \ - k2, \ - alpha_r, \ - a_r, \ - b_r, \ - &beta_use, \ - c_use, rs_c_use, cs_c_use, \ - data, \ - cntx \ - ); \ -\ -\ - /* If necessary, accumulate the final result in ct back to c. */ \ - if ( using_ct ) \ - { \ dim_t i, j; \ \ + /* Accumulate the final result in ct back to c. 
*/ \ for ( j = 0; j < nr; ++j ) \ for ( i = 0; i < mr; ++i ) \ { \ @@ -173,6 +167,40 @@ void PASTEMAC(ch,varname) \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ + else \ + { \ + /* In the typical cases, we use the real part of beta and + accumulate directly into the output matrix c. */ \ +\ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + beta_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev new file mode 100644 index 000000000..3760bdd7c --- /dev/null +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev @@ -0,0 +1,188 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + /*const bool_t row_pref = !col_pref;*/ \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r beta_use; \ +\ + ctype_r* c_use; \ + inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* Sanity check: These should never occur because storage/preference + agreement is handled at a higher level. 
*/ \ + /* + if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ + */ \ +\ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should never occur + (though we could handle them if they did occur). */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ + /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements. */ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + beta_use = *zero_r; \ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ + else \ + { \ + /* In a typical case, we use the real part of beta and accumulate + directly into the output matrix c. */ \ + beta_use = beta_r; \ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ + } \ +\ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. 
*/ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + &beta_use, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ +\ + /* If necessary, accumulate the final result in ct back to c. */ \ + if ( using_ct ) \ + { \ + dim_t i, j; \ +\ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) + diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c index 7d746304c..c4ec44b54 100644 --- a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c @@ -78,7 +78,7 @@ void PASTEMAC(ch,varname) \ \ const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ const dim_t k2 = 2 * k; \ \ diff --git a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c index 92da659ca..ab5617795 100644 --- a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c @@ -67,7 +67,7 @@ void PASTEMAC(ch,varname) \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ @@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; 
\ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ From 69b4846ae9adb157c4171b52e159684db2867853 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 21 Feb 2017 15:33:39 -0600 Subject: [PATCH 04/23] Disabled experiment-related 1m code. Details: - Commented out code in frame/ind/oapi/bli_l3_3m4m1m_oapi.c that was specifically inserted to facilitate the benchmarking of 1m block-panel and panel-block algorithms. - Updates to test/3m4m/Makefile, runme.sh script, and test_gemm.c to reflect changes used/needed during benchmarking. --- frame/ind/oapi/bli_l3_3m4m1m_oapi.c | 7 +++++ test/3m4m/Makefile | 43 +++++++++++++++++++------ test/3m4m/runme.sh | 49 ++++++++++++++++++++++------- test/3m4m/test_gemm.c | 18 +++++++++-- 4 files changed, 94 insertions(+), 23 deletions(-) diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 36281f543..b99ebda39 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -65,11 +65,18 @@ void PASTEMAC(opname,imeth) \ \ /* A temporary hack to easily specify the 1m algorithm (block-panel or panel-block). */ \ +/* if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \ { \ bli_gemm1mbp( alpha, a, b, beta, c ); \ return; \ } \ + else if ( PASTEMAC(opname,imeth) == bli_gemm3m1 ) \ + { \ + bli_gemm1mpb( alpha, a, b, beta, c ); \ + return; \ + } \ +*/ \ \ /* Initialize a local context if the one provided is NULL. */ \ bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 0b13b8eb1..433e745a7 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -107,8 +107,9 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a # BLAS library path(s). This is where the BLAS libraries reside. 
HOME_LIB_PATH := $(HOME)/flame/lib #MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64 -MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 -ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 +#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +MKL_LIB_PATH := ${MKLROOT}/lib/intel64 +#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 ACML_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_int64/lib ACMLP_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_mp_int64/lib @@ -168,7 +169,7 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) #-I$(ACML_INC_PATH) LINKER := $(CC) LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64 -LDFLAGS += -lgfortran -lm -lpthread -fopenmp +LDFLAGS += -lgfortran -lm -lrt -lpthread -fopenmp # Datatype @@ -211,13 +212,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=100 \ - -DP_END=1000 \ - -DP_INC=100 - -PDEF_MT := -DP_BEGIN=100 \ +PDEF_ST := -DP_BEGIN=40 \ -DP_END=2000 \ - -DP_INC=100 + -DP_INC=40 + +PDEF_MT := -DP_BEGIN=200 \ + -DP_END=10000 \ + -DP_INC=200 @@ -296,6 +297,8 @@ openblas-gemm-st: \ test_zgemm_openblas_st.x openblas-gemm-mt: \ + test_sgemm_openblas_mt.x \ + test_dgemm_openblas_mt.x \ test_cgemm_openblas_mt.x \ test_zgemm_openblas_mt.x @@ -306,6 +309,8 @@ mkl-gemm-st: \ test_zgemm_mkl_st.x mkl-gemm-mt: \ + test_sgemm_mkl_mt.x \ + test_dgemm_mkl_mt.x \ test_cgemm_mkl_mt.x \ test_zgemm_mkl_mt.x @@ -316,6 +321,8 @@ acml-gemm-st: \ test_zgemm_acml_st.x acml-gemm-mt: \ + test_sgemm_acml_mt.x \ + test_dgemm_acml_mt.x \ test_cgemm_acml_mt.x \ test_zgemm_acml_mt.x @@ -468,6 +475,12 @@ test_z%_openblas_st.o: test_%.c test_c%_openblas_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@ +test_d%_openblas_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + +test_s%_openblas_mt.o: test_%.c + $(CC) 
$(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + test_z%_openblas_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ @@ -487,6 +500,12 @@ test_z%_mkl_st.o: test_%.c test_c%_mkl_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@ +test_d%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ + +test_s%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ + test_z%_mkl_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ @@ -506,6 +525,12 @@ test_z%_acml_st.o: test_%.c test_c%_acml_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ +test_d%_acml_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + +test_s%_acml_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + test_z%_acml_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ diff --git a/test/3m4m/runme.sh b/test/3m4m/runme.sh index 794f0ba00..3f5d89023 100755 --- a/test/3m4m/runme.sh +++ b/test/3m4m/runme.sh @@ -4,17 +4,21 @@ exec_root="test" out_root="output" -sys="blis" +#sys="blis" #sys="stampede" +sys="lonestar" #sys="wahlberg" # Bind threads to processors. 
#export OMP_PROC_BIND=true #export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15" #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" -export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" #export GOMP_CPU_AFFINITY="0 2 4 6 1 3 5 7" #export GOMP_CPU_AFFINITY="0 4 1 5 2 6 3 7" +#export GOMP_CPU_AFFINITY="0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29 32 33 36 37 40 41 44 45" +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" # Modify LD_LIBRARY_PATH. if [ ${sys} = "blis" ]; then @@ -26,6 +30,11 @@ elif [ ${sys} = "stampede" ]; then # A hack to use libiomp5 with gcc. export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64" +elif [ ${sys} = "lonestar" ]; then + + # A hack to use libiomp5 with gcc. + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" + elif [ ${sys} = "wahlberg" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HOME/flame/lib/acml/5.3.1/gfortran64_int64/lib" @@ -49,6 +58,14 @@ elif [ ${sys} = "stampede" ]; then ir_nt=1 # 1st loop nt=16 +elif [ ${sys} = "lonestar" ]; then + + jc_nt=2 # 5th loop + ic_nt=12 # 3rd loop + jr_nt=1 # 2nd loop + ir_nt=1 # 1st loop + nt=24 + elif [ ${sys} = "wahlberg" ]; then jc_nt=1 # 5th loop @@ -59,8 +76,10 @@ elif [ ${sys} = "wahlberg" ]; then fi # Threadedness to test. -threads="st mt" # st mt" -threads_r="st mt" # mt" +#threads="mt" +#threads_r="mt" +threads="st" +threads_r="st" # Datatypes to test. 
dts="z c" @@ -82,6 +101,14 @@ elif [ ${sys} = "stampede" ]; then test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" #test_impls="openblas mkl asm_blis" +elif [ ${sys} = "lonestar" ]; then + + test_impls="asm_blis 4mhw_blis 4m1a_blis 1m_blis 3m1_blis" + #test_impls="1m_blis 3m1_blis" + #test_impls="4m1a_blis" + #test_impls="mkl" + #test_impls="openblas mkl asm_blis" + elif [ ${sys} = "wahlberg" ]; then test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" @@ -90,7 +117,8 @@ fi # Real domain implementations to test. #test_impls_r="openblas mkl asm_blis" -test_impls_r="openblas asm_blis" +test_impls_r="asm_blis" +#test_impls_r="" # First perform real test cases. for th in ${threads_r}; do @@ -112,10 +140,11 @@ for th in ${threads_r}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. #if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" + # export MKL_NUM_THREADS=${nt} #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -124,7 +153,6 @@ for th in ${threads_r}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. @@ -165,10 +193,10 @@ for th in ${threads}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. 
#if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -177,7 +205,6 @@ for th in ${threads}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index 7b16f584f..1f9ea036c 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -49,6 +49,7 @@ int main( int argc, char** argv ) dim_t p; dim_t p_begin, p_end, p_inc; int m_input, n_input, k_input; + ind_t ind; num_t dt; char dt_ch; int r, n_repeats; @@ -70,6 +71,8 @@ int main( int argc, char** argv ) dt = DT; + ind = IND; + p_begin = P_BEGIN; p_end = P_END; p_inc = P_INC; @@ -78,12 +81,21 @@ int main( int argc, char** argv ) n_input = -1; k_input = -1; -#if 0 + + // Supress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 1 cntx_t cntx; + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + // Initialize a context for the current induced method and datatype. - bli_gemmind_cntx_init( IND, dt, &cntx ); + bli_gemmind_cntx_init( ind_mod, dt, &cntx ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); @@ -157,7 +169,7 @@ int main( int argc, char** argv ) #ifdef BLIS bli_ind_disable_all_dt( dt ); - bli_ind_enable_dt( IND, dt ); + bli_ind_enable_dt( ind, dt ); #endif dtime_save = DBL_MAX; From ca3a7924770d6cf203cce4ca9f5482e1d0d4e961 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 12:09:39 -0500 Subject: [PATCH 05/23] README.md update. Details: - Updated bibtex entries for 4th BLIS paper, and adds entries for 5th and 6th BLIS papers. 
--- README.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9bfa84285..1d7b0ce34 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving ``` A fourth paper, submitted to ACM TOMS, also exists, which proposes an -[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS: +[analytical model](http://dl.acm.org/citation.cfm?id=2925987) +([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf)) +for determining blocksize parameters in BLIS: ``` @article{BLIS4, @@ -277,6 +279,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an } ``` +A fifth paper, submitted to ACM TOMS, begins the study of so-called +[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf): + +``` +@article{BLIS5, + author = {Field G. {V}an~{Z}ee and Tyler Smith}, + title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods}, + journal = {ACM Transactions on Mathematical Software}, + year = {2017}, + note = {accepted} +} +``` + +A sixth paper, submitted to ACM TOMS, revisits the topic of the previous +article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf): + +``` +@article{BLIS6, + author = {Field G. {V}an~{Z}ee}, + title = {Implementing high-performance complex matrix multiplication via the 1m method}, + journal = {ACM Transactions on Mathematical Software}, + note = {submitted} +} +``` + + Funding ------- From d5a5e003ea9b24bb6abf12e88862e8eb61ffb03d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 15:48:30 -0500 Subject: [PATCH 06/23] Fixed a trsm1m bug that affected right-side cases.
Details: - Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result was nondeterministic behavior (usually segmentation faults) for certain problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c which explicitly directed the virtual gemm micro-kernel to use temporary space if the storage preference of the [real domain] gemm ukernel did not match the storage of the output matrix C. In the context of gemm, this handling is not needed because agreement between the storage pref and the matrix is guaranteed by a high-level optimization in BLIS. However, this optimization is not applied to trsm because the storage of C is not necessarily the same as the storage of the micro-panels of B--both of which are updated by the micro-kernel during a trsm operation. Thus, the guarantee of storage/preference agreement is not in place for trsm, which means we must handle that case within the virtual gemm micro-kernel. - Comment updates and a minor macro change to bli_trsm*_cntx_init() for 3m1, 4m1a, and 1m. --- frame/ind/cntx/bli_trsmind_cntx.c | 30 +++++++++++--------- frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c | 21 ++++++-------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index a13d0d05a..96f9add60 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -73,10 +73,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) cntx ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + // Set the pack_t schemas for the current induced method. 
+ bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) @@ -122,10 +121,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) cntx ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) @@ -174,9 +172,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); + //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, + // BLIS_PACKED_COL_PANELS_1R, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { @@ -195,9 +195,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); + //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, + // BLIS_PACKED_COL_PANELS_1E, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); } } diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c index ff23a36f4..6279ab762 100644 --- a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - /*const bool_t row_pref = !col_pref;*/ \ + const bool_t row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -94,24 +94,19 @@ void PASTEMAC(ch,varname) \ if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ -\ - /* Sanity check: These should never occur because storage/preference - agreement is handled at a higher level. */ \ - /* - if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ - */ \ -\ \ /* If beta has a non-zero imaginary component OR if c is stored with general stride, then we compute the alpha*a*b product into temporary storage and then accumulate that result into c afterwards. Note that the other two cases concerning disagreement between the storage of C - and the output preference of the micro-kernel, should never occur - (though we could handle them if they did occur). 
*/ \ + and the output preference of the micro-kernel, should ONLY occur in + the context of trsm, whereby this virtual micro-kernel is called + directly from the trsm macro-kernel to update the micro-tile b11 + that exists within the packed row-panel of B. Indeed that is the + reason those cases MUST be explicitly handled. */ \ if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ - /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ + else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ From 940a707ac78de975110e17c95765e65b89aa5e10 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 16:38:42 -0500 Subject: [PATCH 07/23] Version file update (0.2.2) --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index 566318cf2..ee1372d33 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.1-82 +0.2.2 From a4f1d0b8801c114e9ef8be39df01e1b8d27ebcb3 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 16:38:43 -0500 Subject: [PATCH 08/23] CHANGELOG update (0.2.2) --- CHANGELOG | 1179 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 1090 insertions(+), 89 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a361ceac3..c9a04cbde 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,18 +1,706 @@ -commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (HEAD -> master, tag: 0.2.1) +commit 940a707ac78de975110e17c95765e65b89aa5e10 (HEAD -> master, tag: 0.2.2) +Author: Field G. Van Zee +Date: Tue May 2 16:38:42 2017 -0500 + + Version file update (0.2.2) + +commit d5a5e003ea9b24bb6abf12e88862e8eb61ffb03d (origin/master, origin/HEAD, origin/1m, 1m) +Author: Field G. 
Van Zee +Date: Tue May 2 15:48:30 2017 -0500 + + Fixed a trsm1m bug that affected right-side cases. + + Details: + - Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result + was nondeterministic behavior (usually segmentation faults) for certain + problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The + cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c + which explicitly directed the virtual gemm micro-kernel to use temporary + space if the storage preference of the [real domain] gemm ukernel did + not match the storage of the output matrix C. In the context of gemm, + this handling is not needed because agreement between the storage pref + and the matrix is guaranteed by a high-level optimization in BLIS. + However, this optimization is not applied to trsm because the storage + of C is not necessarily the same as the storage of the micro-panels of + B--both of which are updated by the micro-kernel during a trsm + operation. Thus, the guarantee of storage/preference agreement is not + in place for trsm, which means we must handle that case within the + virtual gemm micro-kernel. + - Comment updates and a minor macro change to bli_trsm*_cntx_init() for + 3m1, 4m1a, and 1m. + +commit e80993e71f4d571e9650a8e90ed386e32059eae5 +Merge: a509fbd5 ca3a7924 +Author: Field G. Van Zee +Date: Tue May 2 12:30:28 2017 -0500 + + Merge branch 'master' into 1m + +commit ca3a7924770d6cf203cce4ca9f5482e1d0d4e961 +Author: Field G. Van Zee +Date: Tue May 2 12:09:39 2017 -0500 + + README.md update. + + Details: + - Updated bibtex entries for 4th BLIS paper, and adds entries for 5th + and 6th BLIS papers. + +commit 6e7de6ef84babb273dc5528a9b9d01f0febe394b +Author: Field G. Van Zee +Date: Fri Mar 17 12:10:24 2017 -0500 + + Minor updates to test/3m4m. + + Details: + - Updated initial problem size and increment in Makefile. + - Updated code in test_gemm.c to correctly query kc from context. 
+ +commit f484c6cd4389dc7ae5b972849e12e98ad5bbf9a4 +Author: Field G. Van Zee +Date: Fri Mar 17 12:07:27 2017 -0500 + + Whitespace reformatting to armv8a kernels file. + + Details: + - Updated formatting of function signature/header in + kernels/armv8a/3/bli_gemm_opt_4x4.c. + +commit a509fbd5ac04fafd4e51b43d2f59ca56432dc212 +Merge: 69b4846a 513944e4 +Author: Field G. Van Zee +Date: Tue Feb 21 17:06:16 2017 -0600 + + Merge branch 'master' into 1m + +commit 69b4846ae9adb157c4171b52e159684db2867853 +Author: Field G. Van Zee +Date: Tue Feb 21 15:33:39 2017 -0600 + + Disabled experiment-related 1m code. + + Details: + - Commented out code in frame/ind/oapi/bli_l3_3m4m1m_oapi.c that was + specifically inserted to facilitate the benchmarking of 1m block-panel + and panel-block algorithms. + - Updates to test/3m4m/Makefile, runme.sh script, and test_gemm.c to + reflect changes used/needed during benchmarking. + +commit 513944e4a951d8823b4de161b86ad7a965b4d99b +Merge: 8b462a0e 0e18f68c +Author: Devin Matthews +Date: Mon Feb 20 10:04:33 2017 -0500 + + Merge pull request #118 from devinamatthews/master + + Handle k=0 correctly in KNL dgemm ukernel. + +commit 0e18f68cf12eb9189ba901a20040b1cdae417670 +Author: Devin Matthews +Date: Mon Feb 20 09:03:21 2017 -0600 + + Handle k=0 correctly in KNL dgemm ukernel. + +commit 8b462a0e8c3e9252f0401940849e53cc772256fa +Merge: c362afc5 7d42fc07 +Author: Devin Matthews +Date: Sun Feb 19 23:03:03 2017 -0500 + + Merge pull request #117 from devinamatthews/master + + Cast dim_t and inc_t parameters to 64-bit in KNL microkernels. + +commit 7d42fc0796ef0c010375fd8e59b1240ba41ce4d2 +Author: Devin Matthews +Date: Sun Feb 19 21:10:55 2017 -0500 + + Cast dim_t and inc_t parameters to 64-bit in KNL microkernels. + +commit c362afc525bab4050581d1b0fcea2fe4d582c608 +Author: Field G. Van Zee +Date: Thu Feb 9 11:54:59 2017 -0600 + + Added missing "level-0" BLAS [sd]cabs1_(). 
+ + Details: + - Fixed issue #115 by adding implementations for scabs1_() and dcabs1_() + to the BLAS compatibility layer. Thanks to heroxbd for pointing out + their absence. + +commit 018180c938c32efbeaaf626ba71ec5b780664db1 +Author: Field G. Van Zee +Date: Wed Feb 8 11:20:52 2017 -0600 + + Fixed a minor bug in configure (issue #114). + + Details: + - Fixed a bug in the configure script whereby a non-preferred value for + --enable-threading would cause problems in common.mk vis-a-vis detecting + which threading model was chosen. Thanks to heroxbd for reporting this + issue. + +commit ddf45e71770c55ea4a58ca24ea4913fe5d8beb9b +Merge: a6ab91bc 78e1b16e +Author: Devin Matthews +Date: Fri Jan 27 14:25:40 2017 -0600 + + Merge pull request #113 from devinamatthews/knl_thread_params + + Change default threading parameters for KNL. + +commit 78e1b16e16d589ed31b2e712115ee282097f114d +Author: Devin Matthews +Date: Fri Jan 27 14:22:20 2017 -0600 + + Change default threading parameters for KNL. + +commit 1c732d3ddc4ac0861d3b0e0dd15eb7e071615502 +Author: Field G. Van Zee +Date: Wed Jan 25 16:25:46 2017 -0600 + + Added 1m-specific APIs for bp, pb gemm algorithms. + + Details: + - Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the + body of bli_gemm_cntl_create() replaced with a call to the former. + - Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now, + bli_cntl_free() can check if the thread parameter is NULL, and if so, + call the latter, and otherwise call the former. + - Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in + terms of bli_gemm1mxx_cntx_init(), which behaves the same as + bli_gemm1m_cntx_init() did before, except that an extra bool parameter + (is_pb) is used to support both bp and pb algorithms (including to + support the anti-preference field described below). + - Added support for "anti-preference" in context. 
The anti_pref field, + when true, will toggle the boolean return value of routines such as + bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of + causing BLIS to transpose the operation to achieve disagreement (rather + than agreement) between the storage of C and the micro-kernel output + preference. This disagreement is needed for panel-block implementations, + since they induce a transposition of the suboperation immediately before + the macro-kernel is called, which changes the apparent storage of C. For + now, anti-preference is used only with the pb algorithm for 1m (and not + with any other non-1m implementation). + - Defined new functions, + bli_cntx_l3_ukr_eff_prefers_storage_of() + bli_cntx_l3_ukr_eff_dislikes_storage_of() + bli_cntx_l3_nat_ukr_eff_prefers_storage_of() + bli_cntx_l3_nat_ukr_eff_dislikes_storage_of() + which are identical to their non-"eff" (effectively) counterparts except + that they take the anti-preference field of the context into account. + - Explicitly initialize the anti-pref field to FALSE in + bli_gks_cntx_set_l3_nat_ukr_prefs(). + - Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel + in terms of the existing block-panel macro-kernel _ker_var2(). This + technique requires inducing transposes on all operands and swapping + the A and B. + - Changed bli_obj_induce_trans() macro so that pack-related fields are + also changed to reflect the induced transposition. + - Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily + specify the 1m algorithm (block-panel or panel-block). + - Renamed the following cntx_t-related macros: + bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block() + bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel() + bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel() + and updated all instantiations. Also updated the field names in the + cntx_t struct. + - Comment updates. 
+ +commit a6ab91bc61432490fadf18d596de4589645f37dd +Merge: 145a551d 7f31a630 +Author: Field G. Van Zee +Date: Wed Nov 30 09:26:58 2016 -0600 + + Merge pull request #111 from figual/master + + Fixed missing cntx argument in ARMv8 microkernels. + +commit 7f31a6307b7bd35f913c895947552c3a176f789b +Author: Francisco Igual +Date: Sun Nov 27 14:40:47 2016 +0100 + + Fixed missing cntx argument in ARMv8 microkernels. + +commit 126482a3b609b9ad7026ba348f6c4bf6a29be8a1 +Author: Field G. Van Zee +Date: Fri Nov 25 18:29:49 2016 -0600 + + Implemented the 1m method. + + Details: + - Implemented the 1m method for inducing complex domain matrix + multiplication. 1m support has been added to all level-3 operations, + including trsm, and is now the default induced method when native + complex domain gemm microkernels are omitted from the configuration. + - Updated _cntx_init() operations to take a datatype parameter. This was + needed for the corresponding function for 1m (because 1m requires us + to choose between column-oriented or row-oriented execution, which + requires us to query the context for the storage preference of the + gemm microkernel, which requires knowing the datatype) but I decided + that it made sense for consistency to add the parameter to all other + cntx initialization functions as well, even though those functions + don't use the parameter. + - Updated bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs() to take + a second scalar for each blocksize entry. The semantic meaning of the + two scalars now is that the first will scale the default blocksize + while the second will scale the maximum blocksize. This allows scaling + the two independently, and was needed to support 1m, which requires + scaling for a register blocksize but not the register storage + blocksize (ie: "packdim") analogue. 
+ - Deprecated bli_blksz_reduce_dt_to() and defined two new functions, + bli_blksz_reduce_def_to() and bli_blksz_reduce_max_to(), for reducing + default and maximum blocksizes to some desired blocksize multiple. + These functions are needed in the updated definitions of + bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs(). + - Added support for the 1e and 1r packing schemas to packm, including + 1e/1r packing kernels. + - Added a minor optimization to bli_gemm_ker_var2() that allows, under + certain circumstances (specifically, real domain beta and row- or + column-stored matrix C), the real domain macrokernel and microkernel + to be called directly, rather than using the virtual microkernel + via the complex domain macrokernel, which carries a slight additional + amount of overhead. + - Added 1m support to the testsuite. + - Added 1m support to Makefile and runme.sh in test/3m4m. Also simplified + some code in test_gemm.c driver. + +commit 145a551d524ae5492667a05fc248923d922df850 +Author: Field G. Van Zee +Date: Wed Nov 23 17:59:06 2016 -0600 + + Switched to simpler trsm_r implementation. + + Details: + - Disabled the implementation of trsm_r that allows the right-hand matrix + B to be triangular, and switched to the implementation that simply + transposes the operation (and thus the storage of C) in order to recast + the operation as trsm_l. This avoids the need to use trsm_rl and trsm_ru + macrokernels, which require an awkward swapping of MR and NR. For now, + the support for trsm_r macrokernels, via separate control trees, remains. + - Modified bli_config_macro_defs.h so that BLIS_RELAX_MCNR_NCMR_CONSTRAINTS + is defined by default. This is mostly a safety precaution in case someone + tries to switch back to the previous trsm_r implementation, but also + serves as a convenience on some systems where one does not naturally + choose blocksizes in a way that satisfies MC % NR = 0 and NC % MR = 0.
+ +commit b3e58ee30307cf1e11529f2113acb9abbeda25af +Author: Field G. Van Zee +Date: Wed Nov 23 17:58:26 2016 -0600 + + Reimplemented 4x12 haswell ukernels (real only). + + Details: + - Replaced permutation-based implementations in bli_gemm_asm_d4x12.c, which + defines 4x24 single real and 4x12 double real gemm microkernels, with + broadcast-based implementations. (The previous microkernel file has been + moved to an 'old' subdirectory.) + +commit bdc0a264d2fb5940bfd09298b1de823674a39053 +Author: Field G. Van Zee +Date: Wed Nov 16 14:13:08 2016 -0600 + + Adjusted stride selection of ct in macrokernels. + + Details: + - Updated the changes introduced in 618f433 so that the strides of the + temporary microtile ct used in the macrokernels is determined based + on the storage preference of the microkernel (via the new functions + below), rather than the strides of c. In almost all cases, presently, + this change results in no net effect, as a high-level optimization + in the _front() functions aligns the storage of c to that of the + microkernel's preference. However, I encountered some cases where + this is not always the case in some development code that has yet + to be committed, and therefore I'm generalizing the framework code + in advance. + - Defined two new functions in bli_cntx.c: + bli_cntx_l3_ukr_prefers_rows_dt() + bli_cntx_l3_ukr_prefers_cols_dt() + which return bool_t's based on the current micro-kernel's storage + preferences. For induced methods, the preference of the underlying + real domain microkernel is returned. + - Updated definition of bli_cntx_l3_ukr_dislikes_storage_of(), and + by proxy bli_cntx_l3_ukr_prefers_storage_of(), to be in terms of + the above functions, rather than querying the preferences of the + native microkernel directly (which did the wrong thing for induced + methods). + +commit 031978d2647cf08316858baf29c84ebba9c3133e +Author: Field G. 
Van Zee +Date: Wed Nov 16 14:04:33 2016 -0600 + + Fixed inactive trsm_r blocksize constraint code. + + Details: + - Changed a cpp macro that was meant to prevent using certain trsm_r code + if BLIS_RELAX_MCNR_NCMR_CONSTRAINTS was defined. It was actually coded + incorrectly at first. I've now fixed its location and changed its + consequence to a compile-time #error message. + +commit 6b5a4032d2e3ed29a272c7f738b7e3ed6657e556 +Merge: 3b524a08 a8220e3a +Author: Field G. Van Zee +Date: Thu Nov 10 15:28:24 2016 -0600 + + Merge pull request #109 from devinamatthews/omp_num_threads + + Add automatic loop thread assignment. + +commit a8220e3a86433b5d76789e32ea7ca014a11b6d17 +Author: Devin Matthews +Date: Thu Nov 10 14:19:34 2016 -0600 + + - Fix typo in bli_cntx.c + - Bump BLIS_DEFAULT_NR_THREAD_MAX to 4 + +commit c05b3862f6241486442b313eff0c8bee7b5e1274 +Author: Devin Matthews +Date: Fri Nov 4 15:48:02 2016 -0500 + + Add automatic loop thread assignment. + + - Number of threads is determined by BLIS_NUM_THREADS or OMP_NUM_THREADS, but can be overridden by BLIS_XX_NT as before. + - Threads are assigned to loops (ic, jc, ir, and jr) automatically by weighted partitioning and heuristics, both of which are tunable via bli_kernel.h. + - All level-3 BLAS covered. + +commit 3b524a08e3fb8380e7b8b2ba835312c51a331570 +Author: Field G. Van Zee +Date: Wed Nov 2 17:45:18 2016 -0500 + + Consolidated 3m1/4m1 gemmtrsm, trsm ukernel code. + + Details: + - Consolidated the macros that define the lower and upper versions of the + gemmtrsm microkernels into a single macro that is instantiated twice. + Did this for both 3m1 and 4m1 microkernels. + - Consolidated lower and upper versions of the trsm microkernels for 3m1 + and 4m1 into single files (each). + +commit ead231aca635deb3db270f118454e4222c627f31 +Merge: d25e6f8b 62987f60 +Author: Field G.
Van Zee +Date: Wed Nov 2 13:03:50 2016 -0500 + + Merge pull request #108 from devinamatthews/patch-2 + + Update .travis.yml with additional tests + +commit 62987f60a6a6ff0a75b31d0404f493593ce35ccc +Author: Devin Matthews +Date: Wed Nov 2 11:20:37 2016 -0500 + + Allow KNL to fail + +commit 8f9010542c751ae3cbfe6121cb011d8985c1e00d +Author: Devin Matthews +Date: Wed Nov 2 11:18:32 2016 -0500 + + Fix some problems with OSX builds: + + - Update CPU detection for Intel archs (esp. Skylake) + - Allow clang for the reference config + +commit d25e6f8b63c57f30b8a67dffbf4995977cf9f235 +Author: Field G. Van Zee +Date: Tue Nov 1 14:35:15 2016 -0500 + + Can disable trsm_r-specific blocksize constraints. + + Details: + - Added cpp guards around the constraints in bli_kernel_macro_defs.h + that enforce MC % NR = 0 and NC % MR = 0. These constraints are ONLY + needed when handling right-side trsm by allowing the matrix on the + right (matrix B) to be triangular, because it involves swapping + register, but not cache, blocksizes (packing A by NR and B by MR) + and then swapping the operands to gemmtrsm just before that kernel + is called. It may be useful to disable these constraints if, for + example, the developer wishes to test the configuration with + a different set of cache blocksizes where only MC % MR = 0 and + NC % NR = 0 are enforced. + - In summary, #defining BLIS_RELAX_MCNR_NCMR_CONSTRAINTS will bypass + the enforcement of MC % NR = 0 and NC % MR = 0. + +commit 1a67e3688edb073a9d44c160e7b0798e08796b8a +Author: Devin Matthews +Date: Tue Nov 1 13:53:18 2016 -0500 + + Bogus commit + + Need to trigger another Travis build. + +commit 2cd82d67b372cad1bed50cfd99e524f1f40b4e24 +Author: Devin Matthews +Date: Tue Nov 1 13:25:50 2016 -0500 + + Some fixes for .travis.yml + + - Switch to gcc-5 to support knl + - Don't run tests in parallel -- it is super slow. + - Use clang on OSX since gcc is only a zombie husk. 
+ +commit a3db4e6bdfe745083acf704ab0f51f74ea869538 +Author: Devin Matthews +Date: Tue Nov 1 10:33:18 2016 -0500 + + Update .travis.yml with additional tests + + - Test knl configuration (without running of course). + - Test openmp and pthreads threading for auto configuration with 4 threads. + - Test auto configuration with and without pthreads on OSX. + - Also, run make in parallel. + + I don't know how the `addons:` section works on OSX; hopefully it is just ignored. + +commit 8a11a2174a1a5b9426f13bbc5338dc86ab138cdd +Author: Field G. Van Zee +Date: Mon Oct 31 19:07:55 2016 -0500 + + Updates to non-default haswell microkernels. + + Details: + - Updated s and d microkernels in bli_gemm_asm_d8x6.c to relax alignment + constraints. + - Added missing c and z microkernels, which are based on the corresponding + kernels in the d6x8 set. + - This completes the d8x6 set (which may be used for situations when it + is desirable to have a microkernel with a column preference). + +commit 618f4331eba209803ecab99747872eceb1b5f091 +Author: Field G. Van Zee +Date: Mon Oct 31 14:40:51 2016 -0500 + + Align strides of ct in macrokernels to that of c. + + Details: + - Previously, rs_ct and cs_ct, the strides of the temporary microtile used + primarily in the macrokernels' edge case handling, were unconditionally + set to 1 and MR, respectively. However, Devin Matthews noted that this + ought to be changed so that the strides of ct were in agreement with the + strides of C. (That is, if C was row-stored, then ct should be accessed + as by rows as well.) The implicit assumption is that the strides of C + have already been adjusted, via induced transposition, if the storage + preference of the microkernel is at odds with the storage of C. So, if + the microkernel prefers row storage, the macrokernel's interior cases + would present row-stored (ideal) microkernel subproblems to the + microkernel, but for edge cases, it would still see column-stored + subproblems (not ideal). 
This commit fixes this issue. Thanks to Devin + for his suggestion. + +commit 630391002325a589063aec2ab0a7d89ef2e178c0 +Merge: 956b3edf 216206c1 +Author: Field G. Van Zee +Date: Tue Oct 25 19:34:51 2016 -0500 + + Merge pull request #105 from devinamatthews/knl + + Support for Intel Knight's Landing. + +commit 216206c1d328a865c2192e35a4df6e9aff79a85b +Author: Devin Matthews +Date: Tue Oct 25 13:56:18 2016 -0500 + + Fix up for merge to master. + +commit 11eb7957abbcdf02d5e312898e094260eadb1209 +Merge: cd5b6681 956b3edf +Author: Devin Matthews +Date: Tue Oct 25 13:51:07 2016 -0500 + + Merge branch 'master' into knl + + # Conflicts: + # frame/thread/bli_thread.h + +commit cd5b6681838899283cd94e5427dfda206e7fbabe +Author: Devin Matthews +Date: Tue Oct 25 13:49:27 2016 -0500 + + Don't use %rbp in KNL packing kernels. + +commit 956b3edf8eb09480f31f2e861c1b10f9ecbb2e52 +Merge: b7e41d71 0662a3c1 +Author: Field G. Van Zee +Date: Tue Oct 25 13:02:57 2016 -0500 + + Merge pull request #104 from devinamatthews/misspellings + + Add flexible options for thread model (pthread/posix for pthreads etc.). + +commit 0662a3c1b1f4644a86bf8e5073d1391808c91b4a +Author: Devin Matthews +Date: Tue Oct 25 12:42:44 2016 -0500 + + Add flexible options for thread model (pthread/posix for pthreads etc.). + +commit b7e41d71b07d2af6d22d632c70e0c5f7ce46852c +Merge: 4bd905bd 5117d444 +Author: Field G. Van Zee +Date: Mon Oct 24 16:47:46 2016 -0500 + + Merge pull request #103 from devinamatthews/patch-1 + + Change .align to .p2align in Bulldozer ukernels. + +commit 5117d444f7f3a2bc327f067926eaf2398212edda +Author: Devin Matthews +Date: Mon Oct 24 16:20:47 2016 -0500 + + Change .align to .p2align in Bulldozer ukernels + + Apparently OSX doesn't allow .align directives for >16B, so I've changed these to their .p2align counterparts. + +commit 4bd905bd4597e0ad7bedf31e25e779d3e2dfda29 +Merge: 936d5fdc 7f32dd57 +Author: Field G. 
Van Zee +Date: Fri Oct 21 14:48:44 2016 -0500 + + Merge pull request #93 from ShadenSmith/config_check + + Adds sanity check to configuration choice. + +commit 936d5fdc26c6c4dab199a8d11fde948975cfa1d6 +Author: Field G. Van Zee +Date: Fri Oct 21 14:34:27 2016 -0500 + + Fixed multithreading compilation bug in 970745a. + + Details: + - Moved the definition of the cpp macro BLIS_ENABLE_MULTITHREADING + from bli_thread.h to bli_config_macro_defs.h. Also moved the + sanity check that OpenMP and POSIX threads are not both enabled. + - Thanks to Krzysztof Drewniak for reporting this bug. + +commit 8feb0f85a674e84bec2417486e3bcea584b14c04 +Author: Field G. Van Zee +Date: Wed Oct 19 16:05:41 2016 -0500 + + Removed auto-prototyping of malloc()/free() substitutes. + + Details: + - Removed the header file, bli_malloc_prototypes.h, which automatically + generated prototypes for the functions specified by the following + cpp macros: + BLIS_MALLOC_INTL + BLIS_FREE_INTL + BLIS_MALLOC_POOL + BLIS_FREE_POOL + BLIS_MALLOC_USER + BLIS_FREE_USER + These prototypes were originally provided primarily as a convenience + to those developers who specified their own malloc()/free() substitutes + for one or more of the following. However, we generated these prototypes + regardless, even when the default values (malloc and free) of the + macros above were used. A problem arose under certain circumstances + (e.g., gcc in C++ mode on Linux with glibc) when including blis.h that + stemmed from the "throw" specification which was added to the glibc's + malloc() prototype, resulting in a prototype mismatch. Therefore, going + forward, developers who specify their own custom malloc()/free() + substitutes must also prototype those substitutes via bli_kernel.h. + Thanks to Krzysztof Drewniak for reporting this bug, and Devin Matthews + for researching the nature and potential solutions. + +commit 970745a5fc7c29de3e202988e5eb104fabca4fdc +Author: Field G. 
Van Zee +Date: Wed Oct 19 15:58:03 2016 -0500 + + Reorganized typedefs to avoid compiler warnings. + + Details: + - Relocated membrk_t definition from bli_membrk.h to bli_type_defs.h. + - Moved #include of bli_malloc.h from blis.h to bli_type_defs.h. + - Removed standalone mtx_t and mutex_t typedefs in bli_type_defs.h. + - Moved #include of bli_mutex.h from bli_thread.h to bli_typedefs.h. + - The redundant typedefs of membrk_t and mtx_t caused a warning on some C + compilers. Thanks to Tyler Smith for reporting this issue. + +commit 28b2af8a71133ce68774e153b6e05afb05affba8 +Author: Field G. Van Zee +Date: Thu Oct 13 14:50:08 2016 -0500 + + Added disabled code to print thrinfo_t structures. + + Details: + - Added cpp-guarded code to bli_thrcomm_openmp.c that allows a curious + developer to print the contents of the thrinfo_t structures of each + thread, for verification purposes or just to study the way thread + information and communicators are used in BLIS. + - Enabled some previously-disabled code in bli_l3_thrinfo.c for freeing + an array of thrinfo_t* values that is used in the new, cpp-guarded code + mentioned above. + - Removed some old commented lines from bli_gemm_front.c. + +commit 11eed3f683d09e65f721567b346b0f733bff9a64 +Author: Field G. Van Zee +Date: Thu Oct 13 14:23:23 2016 -0500 + + Fixed a configure -t omp/openmp bug from fd04869. + + Details: + - Forgot to update certain occurrences of "omp" in common.mk during + commit fd04869, which changed the preferred configure option string + for enabling OpenMP from "omp" to "openmp". + +commit 9cda6057eaa16a24ac8785a9fa167df6c9edba44 +Author: Field G. Van Zee +Date: Tue Oct 11 13:21:26 2016 -0500 + + Removed previously renamed/old files. + + Details: + - Removed frame/base/bli_mem.c and frame/include/bli_auxinfo_macro_defs.h, + both of which were renamed/removed in 701b9aa. For some reason, these + files survived when the compose branch was merged back into master.
+ (Clearly, git's merging algorithm is not perfect.) + - Removed frame/base/bli_mem.c.prev (an artifact of the long-ago changed + memory allocator that I was keeping around for no particular reason). + +commit 22377abd84b9e560ffe1c4e4d284eb443ddb7133 +Author: Field G. Van Zee +Date: Mon Oct 10 13:43:56 2016 -0500 + + Fixed bli_gemm() segfault on empty C matrices. + + Details: + - Fixed a bug that would manifest in the form of a segmentation fault + in bli_cntl_free() when calling any level-3 operation on an empty + output matrix (ie: m = n = 0). Specifically, the code previously + assumed that the entire control tree was built prior to it being + freed. However, if the level-3 operation performs an early exit, the + control tree will be incomplete, and this scenario is now handled. + Thanks to Elmar Peise for reporting this bug. + +commit 0b571cd94d9b175331c9453258a6b1389a718ae8 +Author: Field G. Van Zee +Date: Thu Oct 6 14:48:15 2016 -0500 + + Fixed segfault in bli_free_align() for NULL ptrs. + + Details: + - Fixed a bug in bli_free_align() caused by failing to handle NULL pointers + up-front, which led to performing pointer arithmetic on NULL pointers in + order to free the address immediately before the pointer. Thanks to Devin + Matthews for reporting this bug. + +commit 4fb9b4ef2e4cf2626a6e000a41628fb823f16da8 +Author: Field G. Van Zee +Date: Wed Oct 5 14:41:35 2016 -0500 + + CHANGELOG update (0.2.1) + +commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (tag: 0.2.1) Author: Field G. Van Zee Date: Wed Oct 5 14:41:34 2016 -0500 Version file update (0.2.1) -commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 (origin/master) -Merge: 8696987 6f71cd3 +commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 +Merge: 86969873 6f71cd34 Author: Field G. 
Van Zee Date: Wed Oct 5 13:35:01 2016 -0500 Merge branch 'compose' -commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose) -Merge: c0630c4 8d55033 +commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose, compose) +Merge: c0630c40 8d55033c Author: Field G. Van Zee Date: Tue Oct 4 15:53:46 2016 -0500 @@ -92,14 +780,20 @@ Date: Tue Sep 27 14:14:11 2016 -0500 should be considered deprecated. commit 9424af87209e4e435e2e742430945152690170b0 -Merge: efa7341 c0630c4 +Merge: efa7341d c0630c40 Author: Field G. Van Zee Date: Tue Sep 27 12:51:08 2016 -0500 Merge branch 'compose' +commit 7f32dd57c6bd41c0704341752842277dd6a4c8eb +Author: Shaden Smith +Date: Sat Sep 17 11:33:57 2016 -0500 + + Adds sanity check to configuration choice. + commit efa7341df0b0115926aa8a6e8a4ebfb24fdbf11e -Merge: 121c39d e1453f6 +Merge: 121c39d4 e1453f68 Author: Field G. Van Zee Date: Fri Sep 16 11:01:57 2016 -0500 @@ -113,7 +807,7 @@ Date: Fri Sep 16 09:29:28 2016 -0500 Fixes broken URL in README.md -commit c0630c4024b08750043a2942a3e8a037aa6b6259 (compose) +commit c0630c4024b08750043a2942a3e8a037aa6b6259 Author: Field G. Van Zee Date: Mon Sep 12 13:59:02 2016 -0500 @@ -125,7 +819,7 @@ Date: Mon Sep 12 13:59:02 2016 -0500 - Minor changes to frame/thread/bli_thrinfo.h. commit 7b3bf1ffcd7160ccbf6c2518af6d88f6742e4977 -Merge: 3550981 121c39d +Merge: 35509818 121c39d4 Author: Field G. Van Zee Date: Tue Sep 6 15:47:13 2016 -0500 @@ -287,7 +981,7 @@ Date: Fri Aug 26 19:04:45 2016 -0500 implementations can slow down the testsuite considerably. commit 73517f522b69de429dd7f3df60a70c068149ab28 -Merge: c6f5c21 50293da +Merge: c6f5c215 50293da3 Author: Field G. Van Zee Date: Tue Aug 23 13:46:59 2016 -0500 @@ -315,7 +1009,7 @@ Date: Tue Aug 23 13:38:36 2016 -0500 which requires "0" or "1". commit c6f5c215ee793d03ea834469fc2adc53feaffc42 -Merge: d52cb76 16a4c7a +Merge: d52cb767 16a4c7a8 Author: Field G. 
Van Zee Date: Mon Aug 22 17:33:02 2016 -0500 @@ -333,8 +1027,48 @@ Date: Fri Aug 19 11:38:36 2016 -0500 to type mismatch, and in the case of pthreads, a missing function argument. The bugs are fairly recent, introduced in a017062. +commit c8e4ef93953ba2b79fb7e0973c08469c0e28a2cd +Author: Devin Matthews +Date: Wed Aug 3 16:13:03 2016 -0500 + + Add prefetchw to 30x8 kernel. + +commit 4b5a2f3d6e7ffeb5cc2be8448554f5c2083ad68f +Merge: 380736bf 9f52a587 +Author: Devin Matthews +Date: Wed Aug 3 16:09:51 2016 -0500 + + Merge remote-tracking branch 'origin/knl' into knl + + # Conflicts: + # kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c + +commit 380736bfe955efbdd7274c90b6fd635688e83bc4 +Author: Devin Matthews +Date: Wed Aug 3 16:08:28 2016 -0500 + + Add (new) 30x8 KNL kernel and fix non-scatter prefetch bug. + +commit 9f52a587dee855daa73c194e41b6951416544e9a +Author: Devin Matthews +Date: Wed Aug 3 16:03:53 2016 -0500 + + Try prefetchw[t1] instead of regular prefetch for C. + +commit 8945a1512d366bc6a8a85718d12cbf5de6f2898b +Author: Devin Matthews +Date: Wed Aug 3 11:28:24 2016 -0500 + + This version gets ~1550 GFLOPs on KNL with 16x4. + +commit 6ce4c022ebdea00c2b951090e3c2e9e88735b9ce +Author: Devin Matthews +Date: Wed Jul 27 16:26:36 2016 -0500 + + Switch back to 24x8. I could only squeeze 24.5GFLOP out of 8x24, and scalability is not improved. + commit d52cb7671509592a8078729477b40b60380518a2 -Merge: 95abea4 c31b1e7 +Merge: 95abea46 c31b1e7b Author: Field G. Van Zee Date: Wed Jul 27 16:04:55 2016 -0500 @@ -357,8 +1091,87 @@ Date: Wed Jul 27 15:58:07 2016 -0500 - Inserted #include "float.h" into bli_system.h (to gain access to DBL_MAX). - Minor update (vis-a-vis contexts) to driver code in test/3m4m. +commit b8f2b55532849d45d379afbdd05a52ff6100800d +Author: Devin Matthews +Date: Wed Jul 27 15:22:55 2016 -0500 + + Try an 8x24 kernel for the hell of it.
+ +commit 7ede5863ae3567f7c0852efc2d5cd649ca19e0f3 +Author: Devin Matthews +Date: Wed Jul 27 13:41:27 2016 -0600 + + Allocate pack buffer on MCDRAM for KNL. + +commit ad89ed2e829c7b261d8ba0998a3cb83ad576ee04 +Merge: 2c9de740 81e2b05f +Author: Devin Matthews +Date: Wed Jul 27 11:45:40 2016 -0500 + + Merge branch 'knl' of github.com:devinamatthews/blis into knl + +commit 2c9de740edb66c4692c200731763bbd1d3171ccb +Author: Devin Matthews +Date: Wed Jul 27 11:44:54 2016 -0500 + + This version gets ~26GF on one core. + +commit 81e2b05f31bca4e1e1676e7b533d1868d9f9be33 +Author: Devin Matthews +Date: Wed Jul 27 11:39:05 2016 -0500 + + Add optimized packing kernels for KNL. + +commit a7d8ca97b8d835c32d90ff20a565c82733f014a8 +Author: Devin Matthews +Date: Mon Jul 25 15:15:13 2016 -0500 + + All fixed. + +commit 963d0393b023f4134bb0c682923faf9964c0e645 +Author: Devin Matthews +Date: Mon Jul 25 14:40:53 2016 -0500 + + Add 24xk pack kernel. + +commit 117b76739afba481768897d2580f8365d3345417 +Author: Devin Matthews +Date: Mon Jul 25 13:53:07 2016 -0500 + + In the midst of debugging. + +commit 8c0a4fd1d3535d608a9a309a61ffee0a73c3646f +Author: Devin Matthews +Date: Mon Jul 25 13:09:24 2016 -0500 + + Fix some row/column confusion. + +commit c44f9f96930312125b15e64c326ab5ab5cc02633 +Author: Devin Matthews +Date: Mon Jul 25 12:02:24 2016 -0500 + + Simplify displacements -- clang assembler was badly botching EVEX compressed displacements giving false alarms for instruction length. + +commit e0cce177cc1b47ec9f11ac0556241feaa3564df1 +Author: Devin Matthews +Date: Mon Jul 25 10:02:25 2016 -0500 + + Minor fixes for 8x24 KNL kernel. + +commit 65735bbedf75784c48bd11e05b3fdc98fc66b4bc +Author: Devin Matthews +Date: Sun Jul 24 21:50:32 2016 -0500 + + Switch to 24x8 kernel, unrolled by 16. + +commit 45d5dc97177117220bd9dd0abf85aafc185acad1 +Author: Devin Matthews +Date: Sun Jul 24 14:25:26 2016 -0500 + + Add 24x8 "KNC-style" kernel for KNL. 
+ commit 95abea46f86816fddfc9ff0abfa52880801461be -Merge: d0dfe5b a017062 +Merge: d0dfe5b5 a017062f Author: Field G. Van Zee Date: Sat Jul 23 15:38:33 2016 -0500 @@ -396,8 +1209,39 @@ Date: Fri Jul 22 17:02:59 2016 -0500 single-threaded execution. This new API is employed within functions such as bli_membrk_acquire_[mv]() and bli_membrk_release(). +commit 8ff2e069c48c12fd06b9c48c6b3aeb4ea9b0e6e1 +Author: Devin Matthews +Date: Fri Jul 22 16:22:26 2016 -0500 + + Add 4x unrolled variant for KNL microkernel. + +commit 9cb2ed9b0c25f31a22c1c9719b062fa665ad7adf +Author: Devin Matthews +Date: Fri Jul 22 16:10:30 2016 -0500 + + Get rid of one RBX update. + +commit 451bde076f0320d60cd2475cfb048ac4a2b798bb +Author: Devin Matthews +Date: Fri Jul 22 15:43:00 2016 -0500 + + Add some more knobs to twiddle for KNL microkernel. + +commit 8c6e621c099521e7a4d87e007bb8224faa5f33a3 +Author: Devin Matthews +Date: Fri Jul 22 15:05:15 2016 -0500 + + Make knl conform to new kernel dir structure. + +commit ce7214c6618d6f22f4ce2ee452336236916d1f30 +Merge: 119d0399 ce59f811 +Author: Devin Matthews +Date: Fri Jul 22 14:59:53 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + commit ce59f81108ec9aea918a7e77030da8acfdd397ce -Merge: ff41153 707a2b7 +Merge: ff41153f 707a2b7f Author: Field G. Van Zee Date: Fri Jul 22 14:48:14 2016 -0500 @@ -412,7 +1256,7 @@ Date: Fri Jul 22 13:49:44 2016 -0500 Somehow forgot the most important microkernel. commit 47ec045056351ac4f0791c071fa0daaa81699c8c -Merge: 08f1d6b ff41153 +Merge: 08f1d6b6 ff41153f Author: Devin Matthews Date: Fri Jul 22 13:45:23 2016 -0500 @@ -425,7 +1269,7 @@ Date: Fri Jul 22 13:44:37 2016 -0500 Use 64-bit intermediate variable for k for architectures that do 64-bit loads in case dim_t is 32-bit. commit ff41153f4eb7f38ed94bdd9a3fd81fb979f3f401 -Merge: f9214ce e0d2fa0 +Merge: f9214ced e0d2fa0d Author: Field G.
Van Zee Date: Fri Jul 22 13:21:03 2016 -0500 @@ -440,7 +1284,7 @@ Date: Fri Jul 22 12:56:51 2016 -0500 Relax alignment restrictions for haswell sgemm. commit f9214ced97392861f5a0ea72abfcf6f41faf674c -Merge: 413d62a 08666ea +Merge: 413d62ac 08666eaa Author: Field G. Van Zee Date: Fri Jul 22 12:16:39 2016 -0500 @@ -460,8 +1304,26 @@ Date: Fri Jul 22 11:07:34 2016 -0500 Change -openmp to -fopenmp for icc. +commit 119d0399428905053265f3aca1cc8cc1fde3b363 +Author: Devin Matthews +Date: Fri Jul 22 10:23:31 2016 -0500 + + Add 8x24 KNL kernel. + +commit b58cda9eba0c1e175460aae109baf792d29ba5bf +Merge: 318f063d 413d62ac +Author: Devin Matthews +Date: Tue Jul 19 14:09:09 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + + # Conflicts: + # frame/base/bli_threading.h + # frame/include/blis.h + # frame/thread/bli_thread.c + commit d0dfe5b5372cc7558ee9c4104b29f82eecc7ed61 -Merge: 31def12 413d62a +Merge: 31def12e 413d62ac Author: Field G. Van Zee Date: Thu Jul 14 11:01:06 2016 -0500 @@ -559,6 +1421,12 @@ Date: Fri Jun 17 14:08:35 2016 -0500 but possible divide-by-zero. - Updated function signature and prototype formatting in testsuite. +commit 318f063dcbd8b594969e401bc99146d24b01066a +Author: Devin Matthews +Date: Wed Jun 8 17:46:50 2016 -0500 + + Add new KNL microkernel derived from Haswell. + commit 096895c5d538a7f8817603d7cf28c52e99340def Author: Field G. Van Zee Date: Mon Jun 6 13:32:04 2016 -0500 @@ -592,7 +1460,7 @@ Date: Mon Jun 6 13:32:04 2016 -0500 in the wrong order, which was recently fixed. commit 232530e88ff99f37abcae5b6fb5319a9a375a45f -Merge: 4bcabd1 eef37f8 +Merge: 4bcabd1b eef37f8b Author: Tyler Michael Smith Date: Wed Jun 1 15:14:10 2016 -0500 @@ -700,6 +1568,18 @@ Date: Tue May 17 15:20:16 2016 -0500 store the unrolled 30xk kernel in the array for use (on knc, for example). Note: This should have been done a long time ago. 
+commit e3bd5ca64ae7c190ba689396c0de687b829a11fe +Author: Devin Matthews +Date: Thu May 12 20:54:13 2016 -0500 + + Fix SIMD definitions in KNL config, and a couple of fixes to C update. + +commit 4fe02e3d497995d94d34d3fcf5af895084cfc8b9 +Author: Devin Matthews +Date: Thu May 12 20:53:58 2016 -0500 + + Move bli_kernel.h before bli_threading.h in order of inclusion in blis.h. + commit 4bcf1b35abea3f3dfc8f2fe462dcf155cf199e55 Author: Field G. Van Zee Date: Wed May 11 16:09:49 2016 -0500 @@ -727,7 +1607,7 @@ Date: Wed May 11 16:02:30 2016 -0500 #includes an "f2c.h" header. commit a09a2e23eacf5328858c8318bb637c5ff3b71d08 -Merge: 4dcd37e 7c604e1 +Merge: 4dcd37eb 7c604e1c Author: Tyler Michael Smith Date: Wed May 11 10:47:11 2016 -0500 @@ -741,14 +1621,28 @@ Date: Tue May 10 16:28:59 2016 -0500 fixing knc simd align size +commit 619dee0daec3474b4e5a55df90a61aabcae194f2 +Merge: b790b3d9 7c604e1c +Author: Devin Matthews +Date: Tue May 10 12:13:24 2016 -0500 + + Merge branch 'move_simd_defs' into knl + commit 7c604e1cbc1609b6e12d3ee973c08b7af5035be4 Author: Devin Matthews Date: Tue May 10 12:11:55 2016 -0500 Move default SIMD-related definitions to bli_kernel_macro_defs.h. Otherwise, configurations which customize these fail as these are now defined in bli_kernel.h. +commit b790b3d9e1820f3b691676de48c291cae083452d +Merge: 4f8c05c9 a7be2d28 +Author: Devin Matthews +Date: Tue May 10 11:49:47 2016 -0500 + + Merge branch 'master' into knl + commit a7be2d28e8930b154d0da1d6929b54a96e210af6 -Merge: 97b512e 4b1e55e +Merge: 97b512ef 4b1e55ed Author: Field G. Van Zee Date: Tue May 10 11:48:51 2016 -0500 @@ -840,7 +1734,7 @@ Date: Wed Apr 27 14:13:46 2016 -0500 bdbda6e, to tabs. commit 4ea419c72c789825e1f93a1eee88219bbf873930 -Merge: f1e9be2 bdbda6e +Merge: f1e9be2a bdbda6e6 Author: Field G. Van Zee Date: Tue Apr 26 12:50:45 2016 -0500 @@ -870,7 +1764,7 @@ Date: Fri Apr 22 15:34:02 2016 -0500 in my local working copy for longer than I can remember. 
commit aa0bceec277938328dabeb744680623f24fb0b61 -Merge: 4136553 e2784b4 +Merge: 4136553f e2784b4c Author: Field G. Van Zee Date: Fri Apr 22 12:01:31 2016 -0500 @@ -890,8 +1784,14 @@ Date: Fri Apr 22 11:53:53 2016 -0500 - Changed the definition of bli_cntx_obj_clear() so that the clearing occurs via a single call to memset(). +commit 4f8c05c9e2ef4cbb82b35a3ebf1f0a0ac665830e +Author: Devin Matthews +Date: Thu Apr 21 10:00:59 2016 -0500 + + Rearrange KNL dgemm kernel again to streamline usage of ymm register. sgemm and dgemm now both working with Intel SDE. + commit e2784b4c921f706e756df3e146e20a4cb63f53e3 -Merge: dd0ab1d a9b6c3a +Merge: dd0ab1d9 a9b6c3ab Author: Field G. Van Zee Date: Wed Apr 20 18:34:09 2016 -0500 @@ -900,7 +1800,7 @@ Date: Wed Apr 20 18:34:09 2016 -0500 Change CBLAS integer type to f77_int commit a9b6c3abda6222a8b240361643932e83cf726c4f -Merge: e4c54c8 dd0ab1d +Merge: e4c54c81 dd0ab1d9 Author: Devin Matthews Date: Wed Apr 20 16:00:10 2016 -0500 @@ -927,8 +1827,14 @@ Date: Wed Apr 20 14:38:23 2016 -0500 added equivalent cpp query macros to bli_cntx.h. - Added 'bli_config.h' to .gitignore. +commit 7193230f7d35edbd1d2f77842a613971f1603463 +Author: Devin Matthews +Date: Wed Apr 20 09:37:30 2016 -0500 + + Work around missing VPMULLQ on KNL. + commit a30ccbc4c6a6e6460e78af6b5c530ee0d06f98fb -Merge: eb2f18e 0e1a982 +Merge: eb2f18e4 0e1a9821 Author: Field G. Van Zee Date: Tue Apr 19 15:04:33 2016 -0500 @@ -936,6 +1842,12 @@ Date: Tue Apr 19 15:04:33 2016 -0500 Add configure options and generate bli_config.h automatically. +commit bd44cf13e886069bc66c10ac0db178be96629a0d +Author: Devin Matthews +Date: Tue Apr 19 13:43:04 2016 -0500 + + Fix copy-paste errors in KNL kernels. + commit eb2f18e4844d985715df20798f50f9cc12e3b5ad Author: Field G. Van Zee Date: Tue Apr 19 12:50:32 2016 -0500 @@ -956,18 +1868,56 @@ Date: Tue Apr 19 11:44:37 2016 -0500 Lastly, support for OMP in clang has been added (closes #56). 
+commit a11eec05928ddc5c43fa5dbcd35f2edd24ff35a1 +Author: Devin Matthews +Date: Mon Apr 18 13:13:36 2016 -0500 + + Add sgemm ukernels for KNL. vpmullq is not implemented on KNL -- needs workaround. + commit ff84469a4575f1ef8a0010046fde52240a312cae Author: Field G. Van Zee Date: Mon Apr 18 12:29:09 2016 -0500 Applied various compilation fixes to bgq kernels. +commit c38e0dab05b2dc36672eab96e1248fb7fb2d785b +Merge: bd5e2296 cbcd0b73 +Author: Devin Matthews +Date: Mon Apr 18 10:21:35 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + +commit bd5e2296e98e042c31f1e8ece2c1ca8e4bdc2d4c +Merge: 4745def0 49f85177 +Author: Devin Matthews +Date: Mon Apr 18 10:15:22 2016 -0500 + + Merge remote-tracking branch 'origin/knl' into knl + +commit 4745def0c87377ae83ad73ac514d7de08a96b2ac +Author: Devin Matthews +Date: Mon Apr 18 10:15:05 2016 -0500 + + Add 64-bit offset vector so we can use vgatherqpd. + +commit 49f85177f886f38889b60503a4e12fa7f04be1fd +Author: Devin Matthews +Date: Mon Apr 18 10:14:11 2016 -0500 + + KNL ukernel compiles with gcc. + commit cbcd0b739dc54bd14fbb46aeda267c26725cd70f Author: Tyler Michael Smith Date: Mon Apr 18 03:12:57 2016 -0500 Changing ifdef for OSX pthread barriers +commit 58b2c3cf040134d1be913c585a3c6905629116c0 +Author: Devin Matthews +Date: Sat Apr 16 16:12:24 2016 -0500 + + Rewrite of KNL kernel in GNU extended asm syntax. + commit dd62080cea78f3a23616200d6640e52c102b2bb9 Author: Field G. Van Zee Date: Fri Apr 15 11:15:41 2016 -0500 @@ -984,7 +1934,7 @@ Date: Fri Apr 15 11:15:41 2016 -0500 website. commit d5a915dd8d7a6ead42a68772e4420eb3647e6f1a -Merge: 4320b72 4169467 +Merge: 4320b725 41694675 Author: Field G. Van Zee Date: Thu Apr 14 12:56:36 2016 -0500 @@ -1182,8 +2132,34 @@ Date: Mon Apr 11 17:21:28 2016 -0500 that this does not preclude supporting mixed types via the object APIs, where it produces absolutely zero API code bloat. 
+commit dd856c2cb75a2221a503a73dde27790c34b91570 +Author: Devin Matthews +Date: Mon Apr 11 10:39:18 2016 -0500 + + Translated MIC kernel to KNL and cleaned up a bit. Only real change is lack of swizzle modifiers for FMA instructions (used bcast from memory instead). + +commit 7f27431d3fffdda99c282ec412731d0a90cb32a7 +Author: Devin Matthews +Date: Fri Apr 8 10:04:39 2016 -0500 + + Copy mic kernel to knl for transliteration. + +commit f8f02f0334ac020021e15a415bcd33aeea01deb4 +Merge: 32c92d94 d1f8e5d9 +Author: Devin Matthews +Date: Wed Apr 6 11:37:05 2016 -0500 + + Merge branch 'master' into const_correctness + +commit 32c92d945c55708da0eb63be1771f8c5430e3910 +Merge: 62914ccb 20af937b +Author: Devin Matthews +Date: Wed Apr 6 11:36:02 2016 -0500 + + Merge branch 'master' into const_correctness + commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 -Merge: 20af937 c11d28e +Merge: 20af937b c11d28ee Author: Field G. Van Zee Date: Tue Apr 5 12:21:27 2016 -0500 @@ -1198,7 +2174,7 @@ Date: Sat Apr 2 21:15:48 2016 +0200 cgemm µkernel for bulldozer : bug correction for k%4 != 0 commit 20af937b57f82bb3acb09418d5c0206e1b24f2c7 -Merge: 36c3abb fc61a11 +Merge: 36c3abb0 fc61a114 Author: Field G. Van Zee Date: Thu Mar 31 14:37:30 2016 -0500 @@ -1219,7 +2195,7 @@ Date: Thu Mar 31 10:45:48 2016 -0500 Adjust paths in common.mk to support building from testsuite dir. commit 36c3abb05fecb02d4a9ab13b2b69d133adf34583 -Merge: 64b41fa 917ce75 +Merge: 64b41fa5 917ce754 Author: Field G. Van Zee Date: Thu Mar 31 10:26:17 2016 -0500 @@ -1245,8 +2221,15 @@ Date: Wed Mar 30 22:03:09 2016 +0200 cgemm & zgemm micro-kernels for FMA4 instruction set (bulldozer configuration), based on x86_64/avx micro-kernel +commit 62914ccbcdb3c594f065dcfa65bd7e7b95c79283 +Merge: bbf704bf 64b41fa5 +Author: Devin Matthews +Date: Tue Mar 29 15:24:25 2016 -0500 + + Merge branch 'master' into const_correctness + commit 64b41fa554dff44b2f9ad48901b67c63836407a8 -Merge: 1b09e34 0171ad5 +Merge: 1b09e343 0171ad58 Author: Field G. 
Van Zee Date: Tue Mar 29 15:19:41 2016 -0500 @@ -1267,7 +2250,7 @@ Date: Mon Mar 28 13:55:06 2016 -0500 Add icc and clang support for Intel architectures, fixes #47. 2bd036f fixes #49 BTW. commit 3090fff64cc87ff2519a09f38e6b8699cf3cba11 -Merge: 8624e36 4ca5d5b +Merge: 8624e365 4ca5d5b1 Author: Field G. Van Zee Date: Mon Mar 28 12:36:25 2016 -0500 @@ -1276,14 +2259,14 @@ Date: Mon Mar 28 12:36:25 2016 -0500 sgemm micro-kernel for FMA4 instruction set commit e6e566426ac3ded7ef87cd8ff9be98accfdc4acc -Merge: 469429e 8624e36 +Merge: 469429ec 8624e365 Author: Devin Matthews Date: Sat Mar 26 14:10:15 2016 -0500 Merge branch 'master' into more_config_opts commit 8624e36543160739d954c4dbcc5a5594458f3a12 -Merge: a315833 2bd036f +Merge: a315833f 2bd036f1 Author: Field G. Van Zee Date: Sat Mar 26 13:56:28 2016 -0500 @@ -1310,7 +2293,7 @@ Date: Fri Mar 25 17:22:58 2016 -0500 Add threading option to configure. commit ad43eab4c7899d56d8d7caa6e2d92bc0581ea5a5 -Merge: 9452bdb 2bd036f +Merge: 9452bdb3 2bd036f1 Author: Devin Matthews Date: Fri Mar 25 15:00:02 2016 -0500 @@ -1328,8 +2311,14 @@ Date: Fri Mar 25 12:16:49 2016 -0500 Fix configuration issue where instruction set flags are not specified for debug builds. +commit bbf704bf7501411964a63a68f1af541f612cf92d +Author: Devin Matthews +Date: Fri Mar 25 09:55:35 2016 -0500 + + Add missing const to bli_read_nway_from_env. + commit a315833f067944fb0bc14cf60f0c7dcb5dc897b6 -Merge: 1d1a426 af92773 +Merge: 1d1a426d af92773f Author: Field G. Van Zee Date: Thu Mar 24 12:30:21 2016 -0500 @@ -1343,8 +2332,20 @@ Date: Wed Mar 23 22:07:02 2016 +0100 Updated and improved ARMv8 micro-kernels. +commit a4d7729776d17d9bdf2341eacd70b9770b9ba8d2 +Author: Devin Matthews +Date: Mon Mar 21 09:55:21 2016 -0500 + + Set default value for debug_type variable. + +commit 0e2447fa55d8c5fa2b1fc4150073512495c5f9eb +Author: Devin Matthews +Date: Thu Mar 17 16:32:05 2016 -0500 + + Add const correctness to auxinfo_t struct (microkernels need update theoretically). 
+ commit 1d1a426d18ec03754021456862a1f4d1dfec1fbf -Merge: 5a978ff d226dfa +Merge: 5a978fff d226dfa0 Author: Field G. Van Zee Date: Mon Mar 7 15:17:53 2016 -0600 @@ -1364,7 +2365,7 @@ Date: Sat Mar 5 16:18:14 2016 -0600 4) Add make V=[0,1] option to control build verbosity. commit 5a978fffdb8f09a81c89541d541d4a6830cd70a4 -Merge: adb2b4e 63e2642 +Merge: adb2b4e0 63e26423 Author: Field G. Van Zee Date: Fri Mar 4 17:26:58 2016 -0600 @@ -1409,7 +2410,7 @@ Date: Mon Feb 29 21:53:12 2016 +0100 symbolic link for bulldozer configuration to kernels commit 2dc5c0ae038ed175fab85751803ada05734d1ba1 -Merge: f2809fc 3d0fae8 +Merge: f2809fc5 3d0fae81 Author: Field G. Van Zee Date: Mon Feb 29 12:22:51 2016 -0600 @@ -1418,7 +2419,7 @@ Date: Mon Feb 29 12:22:51 2016 -0600 Add symlink from config/bulldozer/kernels to kernels/x86_64/bulldozer commit f2809fc5f74466c755da6a5b4632853e634060b5 -Merge: f86b94f 8624a33 +Merge: f86b94f2 8624a33c Author: Field G. Van Zee Date: Sat Feb 27 13:06:03 2016 -0600 @@ -1542,7 +2543,7 @@ Date: Tue Nov 3 10:30:08 2015 -0600 smart enough to perform this optimization automatically. commit 0694b722f7e4df00efb32639095a2aca80e67f52 -Merge: 3e116f0 33557ec +Merge: 3e116f0a 33557ecc Author: Field G. Van Zee Date: Mon Nov 2 17:24:25 2015 -0600 @@ -1621,7 +2622,7 @@ Date: Fri Oct 30 18:25:04 2015 -0500 micro-kernels, and trsm_ll macro-kernel. commit 46294d80e5a79c598e200e1c8ec2a642ff839971 -Merge: d3159c5 a0a7b85 +Merge: d3159c57 a0a7b85a Author: Field G. Van Zee Date: Tue Oct 27 12:41:23 2015 -0500 @@ -1636,7 +2637,7 @@ Date: Tue Oct 27 08:59:15 2015 +0000 Fixed incomplete code in the double precision ARMv8 microkernel. commit d3159c5740c9ee7f8c0b661003aab6f00646ad6f -Merge: b489152 7e03e45 +Merge: b489152e 7e03e45b Author: Field G. Van Zee Date: Wed Oct 21 14:54:00 2015 -0500 @@ -1649,7 +2650,7 @@ Date: Wed Oct 21 14:53:17 2015 -0500 Use vzeroall in haswell micro-kernels. 
commit 7e03e45bfe6c27c4fdbf06b1caa7f49e9a5fef49 -Merge: 77ddb0b 4f88c29 +Merge: 77ddb0b1 4f88c29f Author: Field G. Van Zee Date: Wed Oct 14 13:26:07 2015 -0500 @@ -1664,7 +2665,7 @@ Date: Wed Oct 14 12:57:50 2015 -0500 Detect Intel Broadwell (using Haswell config). commit 4b0ac1a9984a93f7ad4369b10fca63991107d9f5 -Merge: fe3e355 77ddb0b +Merge: fe3e355c 77ddb0b1 Author: Zhang Xianyi Date: Wed Oct 14 12:51:05 2015 -0500 @@ -1771,7 +2772,7 @@ Date: Thu Sep 24 12:14:03 2015 -0500 bli_obj_row_off(), bli_obj_col_off(). commit fe3e355c9c5a6f65b8736b009e2d501b62a83ea1 -Merge: efa641e 4dd9dd3 +Merge: efa641e3 4dd9dd3e Author: Zhang Xianyi Date: Fri Aug 21 14:38:36 2015 -0500 @@ -1817,7 +2818,7 @@ Date: Wed Jul 29 13:31:09 2015 -0500 Version file update (0.1.8) commit ef0fbbbdb6148b96938733fce72cb4ed7dad685e -Merge: fdfe14f d4b8913 +Merge: fdfe14f1 d4b89136 Author: Field G. Van Zee Date: Thu Jul 9 13:54:54 2015 -0500 @@ -2085,7 +3086,7 @@ Date: Fri Apr 3 16:44:32 2015 -0500 - Added ACML support to test/3m4m driver Makefile and runme.sh script. commit a32f7c49ca4ea869d2a6c66818780f4321743d67 -Merge: 349e075 4bfd1ce +Merge: 349e075a 4bfd1ce8 Author: Field G. Van Zee Date: Fri Apr 3 08:28:11 2015 -0500 @@ -2279,7 +3280,7 @@ Date: Fri Feb 20 15:24:27 2015 -0600 return blocksizes from one of the induced methods' blocksize objects. commit 411e637ee7d1083a84f58f08938d51e63d7c3c9a -Merge: c2569b8 fc0b771 +Merge: c2569b88 fc0b7712 Author: Tyler Michael Smith Date: Fri Feb 20 20:39:25 2015 -0600 @@ -2345,14 +3346,14 @@ Date: Thu Feb 19 14:27:09 2015 -0600 the sandybridge configuration. commit 493087d730f01d5169434f461644e5633f48a42f -Merge: 650d2a6 2502129 +Merge: 650d2a6f 25021299 Author: Field G. Van Zee Date: Wed Feb 18 09:45:51 2015 -0600 Merge branch 'master' of github.com:flame/blis commit 25021299b670775df8ca9c87910c63d7e74ed946 -Merge: fe2b8d3 f05a576 +Merge: fe2b8d39 f05a5763 Author: Field G. 
Van Zee Date: Wed Feb 11 20:03:21 2015 -0600 @@ -2487,7 +3488,7 @@ Date: Tue Dec 16 11:27:50 2014 -0600 Added 4m_1b to test/3m4m test driver and script. commit 785d480805fc0d6f4251b5499933515740b6b2a7 -Merge: 9456f33 4156c08 +Merge: 9456f330 4156c088 Author: Field G. Van Zee Date: Fri Dec 12 14:34:19 2014 -0600 @@ -2539,7 +3540,7 @@ Date: Tue Dec 9 16:03:14 2014 -0600 leading us to this bug. commit 689f60a578b461119e9ea90c74f642b9eb79addb -Merge: bef24e6 483e4d6 +Merge: bef24e67 483e4d6a Author: Field G. Van Zee Date: Sun Dec 7 14:03:30 2014 -0600 @@ -2565,7 +3566,7 @@ Date: Wed Nov 26 18:00:56 2014 -0600 Barriers were inserted to fix this. commit 76bde44411f0e34266bab9d666a54ef22be97320 -Merge: e56e614 f3d729e +Merge: e56e6143 f3d729e5 Author: Field G. Van Zee Date: Wed Nov 26 17:25:24 2014 -0600 @@ -2610,7 +3611,7 @@ Date: Fri Nov 21 12:28:08 2014 -0600 - Updated comments on alignment of a1 and b1 to match wiki. commit 994429c6881b2ade92d9d7949bcaebfbf2cc65eb -Merge: 58796ab 694029d +Merge: 58796abd 694029d9 Author: Field G. Van Zee Date: Thu Nov 20 13:55:35 2014 -0600 @@ -2857,7 +3858,7 @@ Date: Fri Oct 10 10:01:45 2014 -0500 - Updated sandybridge configuration accordingly. commit 23ce7ee542a12ca40b4b6090ad2558d180e16d37 -Merge: 99fd9a3 7a8ad47 +Merge: 99fd9a39 7a8ad47f Author: Field G. Van Zee Date: Thu Oct 9 16:41:22 2014 -0500 @@ -2918,7 +3919,7 @@ Date: Mon Sep 29 14:56:36 2014 -0500 Fixed bug when packing anywhere besides in blk_var_1 for gemm. commit 614a4afc9272adb47e5a8b83b39d56c2804d95d6 -Merge: b541b66 4a7df04 +Merge: b541b667 4a7df04e Author: Tyler Smith Date: Fri Sep 26 10:49:57 2014 -0500 @@ -3008,7 +4009,7 @@ Date: Wed Sep 17 11:10:07 2014 -0500 implementations. Thanks to Devin Matthews for reporting this bug. commit 870761eb902e4866090d1d3446a345df3d6d4599 -Merge: e9899be a2b59a3 +Merge: e9899be0 a2b59a37 Author: Field G. 
Van Zee Date: Tue Sep 16 18:20:49 2014 -0500 @@ -3304,7 +4305,7 @@ Date: Thu Aug 28 11:55:12 2014 -0500 we now pass in the pack schema itself. commit a0ff6066e06075ab5f92b19247b39b92ed15f1bf -Merge: c4c99c4 d40b32b +Merge: c4c99c48 d40b32bc Author: Field G. Van Zee Date: Sun Aug 24 15:56:21 2014 -0500 @@ -3325,7 +4326,7 @@ Date: Sun Aug 24 15:52:22 2014 -0500 level-2 or level-3 operation. commit d40b32bc24ffbae24123e054307b3138969bb095 -Merge: 9331f79 6c25c37 +Merge: 9331f794 6c25c379 Author: Field G. Van Zee Date: Sun Aug 24 13:46:36 2014 -0500 @@ -3343,7 +4344,7 @@ Date: Sun Aug 24 13:44:10 2014 -0500 ukernels in commit 4cc2b46. commit 9331f79443223fe267676ee54c439e1ed320380c -Merge: 7fc48a7 670b639 +Merge: 7fc48a7d 670b6392 Author: Field G. Van Zee Date: Sun Aug 24 10:54:21 2014 -0500 @@ -3427,7 +4428,7 @@ Date: Thu Aug 21 18:25:48 2014 -0500 those blocksizes at runtime. commit b541b667cabfa6d41b50ad1e49209651ee6812cc -Merge: 699a815 dd61307 +Merge: 699a8151 dd61307f Author: Tyler Smith Date: Wed Aug 20 14:44:51 2014 -0500 @@ -3654,7 +4655,7 @@ Date: Mon Aug 4 15:49:59 2014 -0500 - Updated blis.h to include necessary CBLAS-related headers. commit caab62dac0fb0bd0d674118f409c81680db94d29 -Merge: 383631b db97ce9 +Merge: 383631b5 db97ce97 Author: Field G. Van Zee Date: Sun Aug 3 14:36:18 2014 -0500 @@ -3779,7 +4780,7 @@ Date: Sun Jul 27 18:20:12 2014 -0500 Version file update (0.1.4) commit acff74041bf02c7b9fdfa24b507bca782a4c5fce -Merge: cdb9413 47b243e +Merge: cdb9413e 47b243ef Author: Tyler Smith Date: Wed Jul 23 15:07:30 2014 -0500 @@ -3807,7 +4808,7 @@ Date: Wed Jul 23 13:41:13 2014 -0500 - Comment update. commit 3e7b0db5b0e24f5fd66c60bacabc019885ddbec5 -Merge: 2f8a357 ed3e33d +Merge: 2f8a357d ed3e33d5 Author: Tyler Smith Date: Wed Jul 23 13:40:44 2014 -0500 @@ -3853,7 +4854,7 @@ Date: Tue Jul 22 14:36:02 2014 -0500 matrix real-valued. commit 8965a965931318619ceaebd7c32edccf3022d0c7 -Merge: 1785efb 5b73e80 +Merge: 1785efb5 5b73e80b Author: Field G. 
Van Zee Date: Tue Jul 22 14:34:32 2014 -0500 @@ -3870,7 +4871,7 @@ Date: Tue Jul 22 14:33:01 2014 -0500 - Changed setd front-end call of scald_check() to setd_check(). commit 5b73e80b71c054c1945a06aff044ef629bc1a9a0 -Merge: a41e68e 20690fe +Merge: a41e68e0 20690fe3 Author: Field G. Van Zee Date: Fri Jul 18 12:21:20 2014 -0500 @@ -3942,7 +4943,7 @@ Date: Mon Jul 14 16:05:03 2014 -0500 2012). commit fcec68cda3f6e90ae055e7304e6674c1c5c8d010 -Merge: 94c0df7 4a20ed1 +Merge: 94c0df79 4a20ed1a Author: Field G. Van Zee Date: Mon Jul 14 11:35:34 2014 -0500 @@ -3977,7 +4978,7 @@ Date: Sun Jul 13 22:50:56 2014 -0700 Emscripten port commit 4a20ed1a3f5e9e5232df30aa0e568e6c00c56ce1 -Merge: 6a515e9 8ccdfae +Merge: 6a515e98 8ccdfaef Author: Field G. Van Zee Date: Sun Jul 13 17:45:01 2014 -0500 @@ -4076,7 +5077,7 @@ Date: Tue Jul 8 10:25:27 2014 -0500 - Added *.so files to '.gitignore'. commit 6c65e9a58fe55990ebb99ec3986443e18af35338 -Merge: cb12e45 daca500 +Merge: cb12e456 daca500d Author: Field G. Van Zee Date: Tue Jul 8 10:13:49 2014 -0500 @@ -4095,7 +5096,7 @@ Date: Tue Jul 8 10:07:46 2014 -0500 uninitialized. Thanks to Tony Kelman for isolating this bug. commit daca500db5e2448ba0da8047b75eb0f88d9f40e3 -Merge: ab3bc91 4702350 +Merge: ab3bc915 47023502 Author: Tyler Smith Date: Thu Jul 3 12:52:52 2014 -0500 @@ -4200,7 +5201,7 @@ Date: Mon Jun 23 10:42:29 2014 -0500 Removed 'version' from .gitignore file. commit b40dcefc5ee31f67aa3990e2e9d2ef8ed1386a25 -Merge: 7101a8e b693b0c +Merge: 7101a8ee b693b0cd Author: Field G. Van Zee Date: Mon Jun 23 10:39:05 2014 -0500 @@ -4215,7 +5216,7 @@ Date: Sun Jun 22 13:44:25 2014 -0700 [SC]AXPY kernels for PNaCl commit 7101a8eec0327d6c3a7eb36eb4b0fd45c1c6d162 -Merge: ad48dca 020a831 +Merge: ad48dca2 020a831b Author: Field G. 
Van Zee Date: Thu Jun 19 21:46:50 2014 -0500 @@ -4278,7 +5279,7 @@ Date: Sun Jun 15 06:27:37 2014 -0400 SGEMM and DGEMM kernels for PNaCl commit ad48dca22913a363899f0bef45553898718eebb1 -Merge: ee2b679 7118f87 +Merge: ee2b6792 7118f87e Author: Field G. Van Zee Date: Sat Jun 14 15:10:13 2014 -0500 @@ -4327,7 +5328,7 @@ Date: Wed May 21 11:34:42 2014 -0500 reporting this bug. commit 77a2d8dac8b242d7a202c9aabda3927ab68cf987 -Merge: 8c5d607 21fb089 +Merge: 8c5d6071 21fb0893 Author: Field G. Van Zee Date: Tue May 20 09:53:19 2014 -0500 @@ -4395,7 +5396,7 @@ Date: Wed Apr 30 12:28:00 2014 -0500 Replaced register blocksize hack with querying the register blocksize for determining parallelism granularity commit f4fdfe8fc573553eb36795b79cdf681270dab71b -Merge: 31bb065 8c5d607 +Merge: 31bb065b 8c5d6071 Author: Tyler Smith Date: Wed Apr 30 11:46:35 2014 -0500 @@ -4435,7 +5436,7 @@ Date: Mon Apr 28 16:48:25 2014 -0500 to Jack Poulson for reporting this bug. commit 31bb065ba40ae0c5a614e743b8025abca012b99e -Merge: 20e2443 7c61959 +Merge: 20e24430 7c619599 Author: Tyler Smith Date: Wed Apr 23 12:30:19 2014 -0500 @@ -4535,7 +5536,7 @@ Date: Fri Apr 4 10:22:48 2014 -0500 Also made herk IC and JC loops do weighted partitioning commit 2b6848b2397d6d84ca4e5f792fc51ad05e351a36 -Merge: 4e3eb39 21a0efb +Merge: 4e3eb39a 21a0efb3 Author: Tyler Smith Date: Fri Apr 4 09:54:54 2014 -0500 @@ -4654,7 +5655,7 @@ Date: Mon Mar 24 15:21:42 2014 -0500 a_next and b_next point to the current micropanels in trmm commit 23d9eab354fbc88165889832955e126772bf8488 -Merge: 5d5dc2e fd3e32a +Merge: 5d5dc2ee fd3e32a5 Author: Tyler Smith Date: Thu Mar 20 16:54:35 2014 -0500 @@ -4796,7 +5797,7 @@ Date: Mon Mar 10 15:47:28 2014 -0500 Added single threaded thread info data structures specifically for gemm and packm commit 0e8677761175189583ca7d855e24b2bbdd2dada8 -Merge: 2e727a0 b3bff63 +Merge: 2e727a02 b3bff631 Author: Tyler Smith Date: Mon Mar 10 15:16:21 2014 -0500 @@ -4829,14 +5830,14 @@ Date: Mon Mar 3 
14:31:44 2014 -0600 are currently implemented in terms of isinf() and isnan() from math.h. commit b3bff631eadf98b15cb422fb4a8e2f855c23e8a7 -Merge: 2c158fb e8757b0 +Merge: 2c158fb8 e8757b03 Author: Tyler Smith Date: Thu Feb 27 16:53:24 2014 -0600 Merge https://github.com/flame/blis commit 2c158fb885c27f7b599dc1e85b57edd684f19223 -Merge: e4738c4 c2b2ab6 +Merge: e4738c48 c2b2ab62 Author: Tyler Smith Date: Thu Feb 27 16:46:23 2014 -0600 @@ -4896,7 +5897,7 @@ Date: Thu Feb 27 14:09:19 2014 -0600 Fixed bug in thread trees commit ac5a2de1d17ffd460b00fee9757898525a09abae -Merge: 01b125e bd3c7ec +Merge: 01b125e8 bd3c7ecf Author: Tyler Smith Date: Thu Feb 27 11:59:33 2014 -0600 @@ -4973,14 +5974,14 @@ Date: Tue Feb 25 13:34:56 2014 -0600 only the real gemm micro-kernel. commit 15b51e990f1d21333b5f7af97c211756247336e5 -Merge: 6363a9f fc04b5e +Merge: 6363a9f6 fc04b5eb Author: Field G. Van Zee Date: Fri Feb 21 09:04:32 2014 -0600 Merge branch 'master' of github.com:fgvanzee/blis commit fc04b5eb69868c341ce03f5ef1f02de4b8c121b0 -Merge: b29e1c2 d1813c9 +Merge: b29e1c2b d1813c9d Author: Field G. Van Zee Date: Fri Feb 21 09:04:13 2014 -0600 @@ -5023,7 +6024,7 @@ Date: Wed Feb 19 17:00:52 2014 -0600 - Various other minor changes to facilitate 4m/3m methods. commit b29e1c2b278c177e104c84ba462820ee8296df6c -Merge: ee60377 bd3c7ec +Merge: ee60377e bd3c7ecf Author: Field G. Van Zee Date: Fri Feb 14 14:11:54 2014 -0600 @@ -5676,7 +6677,7 @@ Date: Tue Dec 3 16:08:30 2013 -0600 beta are applied to the attached scalars. commit 992de486d6f23e69a623abd15ae77d7881d13871 -Merge: 9552e6e fd4ac63 +Merge: 9552e6ee fd4ac636 Author: Field G. Van Zee Date: Mon Dec 2 13:58:46 2013 -0600 @@ -5742,7 +6743,7 @@ Date: Mon Nov 18 18:11:07 2013 -0600 that already existed in kernels/x86_64/core2-sse3/3. commit 85e7e02ea3a9190b6fcff5d46b00d41c79cb1242 -Merge: 67761e2 7072005 +Merge: 67761e22 70720054 Author: Field G. 
Van Zee Date: Mon Nov 18 12:02:00 2013 -0600 @@ -6513,7 +7514,7 @@ Date: Thu Aug 1 11:24:23 2013 -0500 dimension of the gemm macro-kernel. commit f8980edf9c318453bb1962ac4939c06bf11e6d5e -Merge: 67a8b94 6e7e452 +Merge: 67a8b949 6e7e4523 Author: Field G. Van Zee Date: Fri Jul 26 11:14:27 2013 -0500 From 43007f7b65ec7926cbbfc39965ff733fa251c15f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 16:48:43 -0500 Subject: [PATCH 09/23] Fixed stray parentheses in README citations. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d7b0ce34..c40005221 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ for determining blocksize parameters in BLIS: ``` A fifth paper, submitted to ACM TOMS, begins the study of so-called -[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf)): +[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf): ``` @article{BLIS5, @@ -293,7 +293,7 @@ A fifth paper, submitted to ACM TOMS, begins the study of so-called ``` A sixth paper, submitted to ACM TOMS, revisits the topic of the previous -article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf)): +article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf): ``` @article{BLIS6, From 0df3541f54b7fe0c604ab2ec47ba814f12391798 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 2 May 2017 19:25:21 -0700 Subject: [PATCH 10/23] allow KNL build without hbwmalloc.h (i.e. emulated) we want to be able to run BLIS KNL binaries on non-KNL machines via SDE. although it is possible to install hbwmalloc implementation on such systems, it is easier not to, since obviously the performance of SDE execution is not representative so there is no reason to emulate HBW allocation. 
--- config/knl/bli_kernel.h | 11 +++++++++++ config/knl/make_defs.mk | 13 +++++++++++-- configure | 3 +++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/config/knl/bli_kernel.h b/config/knl/bli_kernel.h index e32954973..46b1cb4f4 100644 --- a/config/knl/bli_kernel.h +++ b/config/knl/bli_kernel.h @@ -43,11 +43,22 @@ #define BLIS_SIMD_SIZE 64 #define BLIS_SIMD_NUM_REGISTERS 32 +#ifdef BLIS_NO_HBWMALLOC + +#include + +#define BLIS_MALLOC_POOL malloc +#define BLIS_FREE_POOL free + +#else + #include #define BLIS_MALLOC_POOL hbw_malloc #define BLIS_FREE_POOL hbw_free +#endif + //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 6a750223d..449aeb0bb 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -65,6 +65,10 @@ else COPTFLAGS := -O3 endif +ifeq ($(DEBUG_TYPE),sde) +CPPROCFLAGS += -DBLIS_NO_HBWMALLOC +endif + CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) @@ -95,12 +99,17 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -ifeq ($(CC_VENDOR),icc) + +ifneq ($(DEBUG_TYPE),sde) LDFLAGS := -lmemkind else -LDFLAGS := -lmemkind -lm +LDFLAGS := endif +ifneq ($(CC_VENDOR),icc) +LDFLAGS += -lm +endif + # end of ifndef MAKE_DEFS_MK_INCLUDED conditional block diff --git a/configure b/configure index 2358575f6..de7d1e96e 100755 --- a/configure +++ b/configure @@ -458,6 +458,9 @@ main() if [ -n "${debug_flag}" ]; then if [ "x${debug_type}" = "xopt" ]; then echo "${script_name}: enabling debug symbols with optimizations." + elif [ "x${debug_type}" = "xsde" ]; then + debug_type='sde' + echo "${script_name}: enabling SDE processor emulation." else debug_type='noopt' echo "${script_name}: enabling debug symbols; optimizations disabled." From dd58c9545c877c3f7553eaebca7b5e9720a66f5d Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Wed, 3 May 2017 15:04:51 -0500 Subject: [PATCH 11/23] Disable complex 3m/4m in testsuite by default. Details: - Disabled testsuite tests of all level-3 implementations based on 3m and 4m. This will improve testing runtime on Travis CI as well as for anyone manually running the testsuite using default test parameters. Thanks to Devin Matthews for suggesting this change. --- testsuite/input.general | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/testsuite/input.general b/testsuite/input.general index 9dba50df6..b7fbd6b58 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -29,13 +29,13 @@ sdcz # Datatype(s) to test: 500 # Problem size: maximum to test 100 # Problem size: increment between experiments # Complex level-3 implementations to test -1 # 3mh ('1' = enable; '0' = disable) -1 # 3m3 ('1' = enable; '0' = disable) -1 # 3m2 ('1' = enable; '0' = disable) -1 # 3m1 ('1' = enable; '0' = disable) -1 # 4mh ('1' = enable; '0' = disable) -1 # 4m1b ('1' = enable; '0' = disable) -1 # 4m1a ('1' = enable; '0' = disable) +0 # 3mh ('1' = enable; '0' = disable) +0 # 3m3 ('1' = enable; '0' = disable) +0 # 3m2 ('1' = enable; '0' = disable) +0 # 3m1 ('1' = enable; '0' = disable) +0 # 4mh ('1' = enable; '0' = disable) +0 # 4m1b ('1' = enable; '0' = disable) +0 # 4m1a ('1' = enable; '0' = disable) 1 # 1m ('1' = enable; '0' = disable) 1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: From fdc66f12d40754ff46179804bff592fddafbca02 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 4 May 2017 10:35:22 -0500 Subject: [PATCH 12/23] Setting any one of BLIS_NT_[IJ][CR] overrides BLIS_NUM_THEADS. Missing BLIS_NT_XX's are defaulted to 1. Fixes #123. 
--- frame/base/bli_cntx.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index f8cdf1fc4..673987bfd 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -784,12 +784,20 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, } } - jc = bli_env_read_nway( "BLIS_JC_NT", jc ); - //pc = bli_env_read_nway( "BLIS_KC_NT", 1 ); - pc = 1; - ic = bli_env_read_nway( "BLIS_IC_NT", ic ); - jr = bli_env_read_nway( "BLIS_JR_NT", jr ); - ir = bli_env_read_nway( "BLIS_IR_NT", ir ); + pc = 1; + + dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 ); + dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 ); + dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 ); + dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 ); + + if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1) + { + jc = (jc_env == -1 ? 1 : jc_env); + ic = (ic_env == -1 ? 1 : ic_env); + jr = (jr_env == -1 ? 1 : jr_env); + ir = (ir_env == -1 ? 1 : ir_env); + } #else From cf39d3ef3b29b8058c39fb4638c1a734fe64aaed Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 5 May 2017 15:06:56 -0500 Subject: [PATCH 13/23] Fixed a bug in norm1v, norm1m. Details: - Fixed a bug that manifested as improperly-computed 1-norm for vectors and matrices. This is one of the few operations in BLIS that does not have its own test module within the testsuite, hence why it went undetected for so long. The bad 1-norms were being used to normalize matrices in the testsuite after initialization, which led to some matrices containing a combination of "large" and "small" values. This tended to push the residuals computed after each test away from zero. In some cases, they were off *just* enough to the testsuite to label it a "failure". Many thanks to Jeff Hammond for reporting this bug. 
(Wonky details: the bug was due to improperly-defined level-0 scalar macros for abval2, an operation that computes the absolute square, or complex magnitude/modulus. Certain complex domain instances of abval2 were being incorrectly defined in terms of real-only solutions, leading to bad results. This level-0 operation forms the basis of norm1v/norm1m. absq2 was also affected, but almost nothing uses this operation.) --- frame/include/level0/bli_absq2s.h | 32 +++++++++++++++--------------- frame/include/level0/bli_abval2s.h | 32 +++++++++++++++--------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/frame/include/level0/bli_absq2s.h b/frame/include/level0/bli_absq2s.h index b6d7766df..9dcdad06f 100644 --- a/frame/include/level0/bli_absq2s.h +++ b/frame/include/level0/bli_absq2s.h @@ -41,27 +41,27 @@ // - The first char encodes the type of x. // - The second char encodes the type of a. -#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) -#define bli_dsabsq2s( x, a ) bli_sabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_csabsq2s( x, a ) bli_sabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_zsabsq2s( x, a ) bli_sabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) +#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) +#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) +#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } +#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_sdabsq2s( x, a ) bli_dabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_cdabsq2s( x, a ) 
bli_dabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_zdabsq2s( x, a ) bli_dabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) +#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) +#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) +#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } +#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX -#define bli_scabsq2s( x, a ) bli_cabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabsq2s( x, a ) bli_cabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcabsq2s( x, a ) bli_cabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) +#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_szabsq2s( x, a ) bli_zabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabsq2s( x, a ) bli_zabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabsq2s( x, a ) bli_zabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_dzabsq2s( x, a ) 
bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX diff --git a/frame/include/level0/bli_abval2s.h b/frame/include/level0/bli_abval2s.h index 7e0556940..6e0480790 100644 --- a/frame/include/level0/bli_abval2s.h +++ b/frame/include/level0/bli_abval2s.h @@ -43,25 +43,25 @@ #ifndef BLIS_ENABLE_C99_COMPLEX -#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) -#define bli_dsabval2s( x, a ) bli_sabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_csabval2s( x, a ) bli_sabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_zsabval2s( x, a ) bli_sabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) +#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) +#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) +#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } +#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_sdabval2s( x, a ) bli_dabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_cdabval2s( x, a ) bli_dabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_zdabval2s( x, a ) bli_dabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) +#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) +#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), 
bli_dimag(x), bli_dreal(a), 0.0 ) +#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } +#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } -#define bli_scabval2s( x, a ) bli_cabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabval2s( x, a ) bli_cabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcabval2s( x, a ) bli_cabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) +#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_szabval2s( x, a ) bli_zabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabval2s( x, a ) bli_zabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabval2s( x, a ) bli_zabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX From 
5fa4e9439c04f35f89dd7d26ff742cb2dadc3180 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 21:50:49 -0400 Subject: [PATCH 14/23] A bunch of shebang fixes from unportable /bin/bash to portable /usr/bin/env bash --- build/auto-detect/auto-detect.sh | 2 +- build/bump-version.sh | 2 +- build/check-test.sh | 4 ++-- build/gen-make-frags/gen-make-frag.sh | 2 +- build/mirror-tree.sh | 2 +- build/templates/license.sh | 2 +- build/update-version-file.sh | 2 +- config/armv7a/make_defs.mk | 2 +- config/armv8a/make_defs.mk | 2 +- config/bgq/make_defs.mk | 2 +- config/bulldozer/make_defs.mk | 2 +- config/carrizo/make_defs.mk | 2 +- config/cortex-a15/make_defs.mk | 2 +- config/cortex-a9/make_defs.mk | 2 +- config/dunnington/make_defs.mk | 2 +- config/emscripten/make_defs.mk | 2 +- config/haswell/make_defs.mk | 2 +- config/knl/make_defs.mk | 2 +- config/loongson3a/make_defs.mk | 2 +- config/mic/make_defs.mk | 2 +- config/piledriver/make_defs.mk | 2 +- config/pnacl/make_defs.mk | 2 +- config/power7/make_defs.mk | 2 +- config/reference/make_defs.mk | 2 +- config/sandybridge/make_defs.mk | 2 +- config/template/make_defs.mk | 2 +- configure | 2 +- version | 2 +- 28 files changed, 29 insertions(+), 29 deletions(-) diff --git a/build/auto-detect/auto-detect.sh b/build/auto-detect/auto-detect.sh index 9300e3b8b..345fc2f3a 100755 --- a/build/auto-detect/auto-detect.sh +++ b/build/auto-detect/auto-detect.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/bump-version.sh b/build/bump-version.sh index 35da91b97..6df894152 100755 --- a/build/bump-version.sh +++ b/build/bump-version.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/check-test.sh b/build/check-test.sh index 6277ada45..fa7b4779a 100755 --- a/build/check-test.sh +++ b/build/check-test.sh @@ -1,4 +1,4 @@ 
-#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -47,4 +47,4 @@ if [ $? -eq 0 ]; then else echo "Test Pass" exit 0 -fi \ No newline at end of file +fi diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh index e24af3005..77e6dd5c4 100755 --- a/build/gen-make-frags/gen-make-frag.sh +++ b/build/gen-make-frags/gen-make-frag.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/mirror-tree.sh b/build/mirror-tree.sh index bac7ad9a6..3aae9ce35 100755 --- a/build/mirror-tree.sh +++ b/build/mirror-tree.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/templates/license.sh b/build/templates/license.sh index a9fc4b9fb..06da737b6 100644 --- a/build/templates/license.sh +++ b/build/templates/license.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/update-version-file.sh b/build/update-version-file.sh index afa829e4a..38e8d2088 100755 --- a/build/update-version-file.sh +++ b/build/update-version-file.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 2b4125f3a..af114c379 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 3dc88e913..be2e32667 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for 
developing high-performance BLAS-like diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 0f405102b..dfc96dc6c 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 90d14d56b..097f33702 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index fd6b84cb0..121b6c5e0 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 52ab7a7c9..d38f60304 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 52ab7a7c9..d38f60304 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index f8faa3b5b..4234a4657 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 45b210ab6..63f4733cf 100644 --- 
a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 4c144846d..5e2d32641 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 449aeb0bb..0db039eb7 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 2c7e9c58c..21f6b084a 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 339112570..43d4a27ad 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index db46bd124..6d29705bc 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index 9e2a3b4c5..e080b5c51 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff 
--git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index da4e5bff1..c2cb7b1ca 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 4e856534d..02076e95c 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index 0a779b188..cd916739d 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 98f3222e0..c538a4c4d 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/configure b/configure index de7d1e96e..7aabc5b78 100755 --- a/configure +++ b/configure @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/version b/version index ee1372d33..7a8771f94 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.2 +0.2.1-115 From f5131e1e49167f948bddd714bb1af1761829c212 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:03:23 -0400 Subject: [PATCH 15/23] Indeed once can compile for carrizo also using clang. 
--- config/carrizo/make_defs.mk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 121b6c5e0..ef6435498 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -47,9 +47,12 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),clang) +else $(error gcc is required for this configuration.) endif +endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L From 91f897073ec0df3330ede449c4d6af8158266ae3 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:06:59 -0400 Subject: [PATCH 16/23] Correct error message. --- config/carrizo/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index ef6435498..76e74d67a 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -50,7 +50,7 @@ endif ifeq ($(CC_VENDOR),gcc) ifeq ($(CC_VENDOR),clang) else -$(error gcc is required for this configuration.) +$(error gcc or clang are required for this configuration.) endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). From 7541d46e2ba8659bb2e36b444edef112fefa1345 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:12:12 -0400 Subject: [PATCH 17/23] Mark bulldozer compilable w/ clang. --- config/bulldozer/make_defs.mk | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 097f33702..b6fd06801 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -47,8 +47,11 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) 
+ifeq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),clang) +else +$(error gcc or clang are required for this configuration.) +endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). From a75b05c23dc786a1fdc45dc1627a5ce2299f1a7b Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:23:27 -0400 Subject: [PATCH 18/23] Mark piledriver compilable w/ clang. --- config/piledriver/make_defs.mk | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 6d29705bc..81b3ca2f1 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -47,8 +47,11 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) +ifeq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),clang) +else +$(error gcc or clang are required for this configuration.) +endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). From 0579dfea0bcfbb90ebc073fcf78b92a5cf7238e1 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:58:07 -0400 Subject: [PATCH 19/23] Restore version. --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index 7a8771f94..ee1372d33 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.1-115 +0.2.2 From 169fb05f225c2f060265bcaa872f7f80dc638b70 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 23:11:22 -0400 Subject: [PATCH 20/23] Fix if/else structure. Thanks to TravisCI. 
--- config/bulldozer/make_defs.mk | 1 + config/carrizo/make_defs.mk | 1 + config/piledriver/make_defs.mk | 1 + 3 files changed, 3 insertions(+) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index b6fd06801..c6050504f 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -48,6 +48,7 @@ CC := gcc CC_VENDOR := gcc endif ifeq ($(CC_VENDOR),gcc) +else ifeq ($(CC_VENDOR),clang) else $(error gcc or clang are required for this configuration.) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 76e74d67a..3e84b2011 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -48,6 +48,7 @@ CC := gcc CC_VENDOR := gcc endif ifeq ($(CC_VENDOR),gcc) +else ifeq ($(CC_VENDOR),clang) else $(error gcc or clang are required for this configuration.) diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 81b3ca2f1..2160c0262 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -48,6 +48,7 @@ CC := gcc CC_VENDOR := gcc endif ifeq ($(CC_VENDOR),gcc) +else ifeq ($(CC_VENDOR),clang) else $(error gcc or clang are required for this configuration.) From 555ddc30d4c7e44f3f335e436c98606f56e1598b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 17 May 2017 12:27:14 -0500 Subject: [PATCH 21/23] Remove shebangs from makefiles. 
--- Makefile | 1 - build/config.mk.in | 1 - build/gen-make-frags/fragment.mk | 1 - common.mk | 1 - config/armv7a/make_defs.mk | 1 - config/armv8a/make_defs.mk | 1 - config/bgq/make_defs.mk | 1 - config/bulldozer/make_defs.mk | 1 - config/carrizo/make_defs.mk | 1 - config/cortex-a15/make_defs.mk | 1 - config/cortex-a9/make_defs.mk | 1 - config/dunnington/make_defs.mk | 1 - config/emscripten/make_defs.mk | 1 - config/haswell/make_defs.mk | 1 - config/knl/make_defs.mk | 1 - config/loongson3a/make_defs.mk | 1 - config/mic/make_defs.mk | 1 - config/piledriver/make_defs.mk | 1 - config/pnacl/make_defs.mk | 1 - config/power7/make_defs.mk | 1 - config/reference/make_defs.mk | 1 - config/sandybridge/make_defs.mk | 1 - config/template/make_defs.mk | 1 - mpi_test/Makefile | 1 - test/Makefile | 1 - testsuite/Makefile | 1 - 26 files changed, 26 deletions(-) diff --git a/Makefile b/Makefile index 1a4868eaa..0ad4d5c78 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/config.mk.in b/build/config.mk.in index 9d92f7fb4..fb4be778d 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/gen-make-frags/fragment.mk b/build/gen-make-frags/fragment.mk index 08773302b..17394f40b 100644 --- a/build/gen-make-frags/fragment.mk +++ b/build/gen-make-frags/fragment.mk @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/common.mk b/common.mk index 683d0b0e9..6f496c5da 100644 --- a/common.mk +++ b/common.mk @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index af114c379..82e5c8c79 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -1,4 +1,3 
@@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index be2e32667..a5a9b577e 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index dfc96dc6c..879124ffa 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index c6050504f..b6b47e13f 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 3e84b2011..63edee464 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index d38f60304..94f3a6e0c 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index d38f60304..94f3a6e0c 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 4234a4657..13c8c0ead 100644 --- a/config/dunnington/make_defs.mk +++ 
b/config/dunnington/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 63f4733cf..bc99caee8 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 5e2d32641..185bd918c 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 0db039eb7..1e35adda7 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 21f6b084a..9e44684f2 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 43d4a27ad..311936979 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 2160c0262..45c6393b1 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index e080b5c51..5375117b3 100644 --- 
a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index c2cb7b1ca..9d51e9db4 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 02076e95c..580d42d39 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index cd916739d..c0eed3b24 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index c538a4c4d..8bd574d3b 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/mpi_test/Makefile b/mpi_test/Makefile index 1bb965b4a..321b166d6 100644 --- a/mpi_test/Makefile +++ b/mpi_test/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/test/Makefile b/test/Makefile index 92b8c7df9..21bbf746d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/testsuite/Makefile b/testsuite/Makefile index 6a1954d8c..4ebece103 100644 --- a/testsuite/Makefile +++ b/testsuite/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # 
An object-based framework for developing high-performance BLAS-like From ec5c0c0448275280dca0991f6f33afeb73650450 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 17 May 2017 12:29:44 -0500 Subject: [PATCH 22/23] Change to /bin/sh. All scripts checked with Debian's checkbashisms. Also check for clang first in auto-detect.sh. --- build/auto-detect/auto-detect.sh | 14 ++++++-------- build/bump-version.sh | 2 +- build/check-test.sh | 2 +- build/gen-make-frags/gen-make-frag.sh | 6 +----- build/mirror-tree.sh | 2 +- build/templates/license.sh | 2 +- build/update-version-file.sh | 2 +- 7 files changed, 12 insertions(+), 18 deletions(-) diff --git a/build/auto-detect/auto-detect.sh b/build/auto-detect/auto-detect.sh index 345fc2f3a..5185fd8af 100755 --- a/build/auto-detect/auto-detect.sh +++ b/build/auto-detect/auto-detect.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -41,7 +41,11 @@ main() { - CC=gcc + if [ clang -v > /dev/null 2>&1 ]; then + CC=clang + else + CC=gcc + fi CPUID_SRC=cpuid_x86.c CPUID_BIN=blis_cpu_detect ARCH=reference @@ -59,12 +63,6 @@ main() # of the distribution and the directory in which we are building. cur_dirpath="." 
- - OSNAME=`uname` - if [ $OSNAME = "Darwin" ]; then - CC=clang - fi - # # Detect architecture by predefined macros # diff --git a/build/bump-version.sh b/build/bump-version.sh index 6df894152..53cbe1825 100755 --- a/build/bump-version.sh +++ b/build/bump-version.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/check-test.sh b/build/check-test.sh index fa7b4779a..6fb082a4c 100755 --- a/build/check-test.sh +++ b/build/check-test.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh index 77e6dd5c4..19fdc5bd0 100755 --- a/build/gen-make-frags/gen-make-frag.sh +++ b/build/gen-make-frags/gen-make-frag.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -374,10 +374,6 @@ gen_mkfiles() read_mkfile_config() { - local index lname - declare -i count - - # Read the file describing file suffixes. 
src_file_suffixes=$(cat "${suffix_file}") diff --git a/build/mirror-tree.sh b/build/mirror-tree.sh index 3aae9ce35..813091fcf 100755 --- a/build/mirror-tree.sh +++ b/build/mirror-tree.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/templates/license.sh b/build/templates/license.sh index 06da737b6..6105c1f04 100644 --- a/build/templates/license.sh +++ b/build/templates/license.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/update-version-file.sh b/build/update-version-file.sh index 38e8d2088..23373022d 100755 --- a/build/update-version-file.sh +++ b/build/update-version-file.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like From 6e04f9df01d79c1b0e673943ca0d5d0a6095eb2e Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 17 May 2017 13:03:52 -0500 Subject: [PATCH 23/23] Restored deleted lines from makefile fragments. 
--- Makefile | 1 + build/config.mk.in | 1 + build/gen-make-frags/fragment.mk | 1 + common.mk | 1 + config/armv7a/make_defs.mk | 1 + config/armv8a/make_defs.mk | 1 + config/bgq/make_defs.mk | 1 + config/bulldozer/make_defs.mk | 1 + config/carrizo/make_defs.mk | 1 + config/cortex-a15/make_defs.mk | 1 + config/cortex-a9/make_defs.mk | 1 + config/dunnington/make_defs.mk | 1 + config/emscripten/make_defs.mk | 1 + config/haswell/make_defs.mk | 1 + config/knl/make_defs.mk | 1 + config/loongson3a/make_defs.mk | 1 + config/mic/make_defs.mk | 1 + config/piledriver/make_defs.mk | 1 + config/pnacl/make_defs.mk | 1 + config/power7/make_defs.mk | 1 + config/reference/make_defs.mk | 1 + config/sandybridge/make_defs.mk | 1 + config/template/make_defs.mk | 1 + mpi_test/Makefile | 1 + test/Makefile | 1 + testsuite/Makefile | 1 + 26 files changed, 26 insertions(+) diff --git a/Makefile b/Makefile index 0ad4d5c78..d74eba889 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/build/config.mk.in b/build/config.mk.in index fb4be778d..e7a3f3235 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/build/gen-make-frags/fragment.mk b/build/gen-make-frags/fragment.mk index 17394f40b..2a1eb6907 100644 --- a/build/gen-make-frags/fragment.mk +++ b/build/gen-make-frags/fragment.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/common.mk b/common.mk index 6f496c5da..08731d9aa 100644 --- a/common.mk +++ b/common.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 82e5c8c79..9d1b51d0a 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index a5a9b577e..6d09af5cc 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 879124ffa..57c9899a0 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index b6b47e13f..0546a474f 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 63edee464..f52d1dd67 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 94f3a6e0c..053e11cbb 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 94f3a6e0c..053e11cbb 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 13c8c0ead..8d07f2177 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index bc99caee8..4353d65cf 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 185bd918c..8c739607a 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 1e35adda7..104abafe2 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 9e44684f2..8fd9fb65a 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 311936979..8e7738b44 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 45c6393b1..b5c3f159c 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index 5375117b3..c6f629ef8 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index 9d51e9db4..765344f79 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 580d42d39..f75b9ec55 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index c0eed3b24..d91df8b68 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 8bd574d3b..d98452553 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/mpi_test/Makefile b/mpi_test/Makefile index 321b166d6..2d2df10b7 100644 --- a/mpi_test/Makefile +++ b/mpi_test/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/test/Makefile b/test/Makefile index 21bbf746d..1472ae4b5 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/testsuite/Makefile b/testsuite/Makefile index 4ebece103..acbdd7bf3 100644 --- a/testsuite/Makefile +++ b/testsuite/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries.