From c2c91e09b4893cb81314774557f728a95080f81e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 25 Oct 2016 21:15:26 -0700 Subject: [PATCH 01/64] never use libm with Intel compilers Intel compilers include a highly optimized math library (libimf) that should be used instead of GNU libm. yes, this change is for ALL targets, including those that are not supported by the Intel compiler. there is no harm in doing this, and it is future-proof in the event that the Intel compilers support other architectures. --- config/armv7a/make_defs.mk | 2 ++ config/armv8a/make_defs.mk | 2 ++ config/bulldozer/make_defs.mk | 2 ++ config/carrizo/make_defs.mk | 2 ++ config/cortex-a15/make_defs.mk | 2 ++ config/cortex-a9/make_defs.mk | 2 ++ config/dunnington/make_defs.mk | 2 ++ config/haswell/make_defs.mk | 2 ++ config/knl/make_defs.mk | 6 +++++- config/loongson3a/make_defs.mk | 2 ++ config/mic/make_defs.mk | 4 ++++ config/piledriver/make_defs.mk | 2 ++ config/pnacl/make_defs.mk | 2 ++ config/power7/make_defs.mk | 2 ++ config/reference/make_defs.mk | 2 ++ config/sandybridge/make_defs.mk | 2 ++ config/template/make_defs.mk | 2 ++ 17 files changed, 39 insertions(+), 1 deletion(-) diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 40b6c179a..2b4125f3a 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 654a9ff92..3dc88e913 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 78f47d908..90d14d56b 100644 --- a/config/bulldozer/make_defs.mk +++ 
b/config/bulldozer/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index aaecb2d2c..fd6b84cb0 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index ec5360da4..52ab7a7c9 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index ec5360da4..52ab7a7c9 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index fed36506b..f8faa3b5b 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 1640a40b9..4c144846d 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 
e0385e6d5..6a750223d 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -95,7 +95,11 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -lmemkind +ifeq ($(CC_VENDOR),icc) +LDFLAGS := -lmemkind +else +LDFLAGS := -lmemkind -lm +endif diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index bb1248d37..2c7e9c58c 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 21af9e2e2..339112570 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -77,7 +77,11 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifeq ($(CC_VENDOR),icc) +LDFLAGS := -mmic +else LDFLAGS := -mmic -lm +endif diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index e241789dd..db46bd124 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index e957cf429..9e2a3b4c5 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -63,7 +63,9 @@ ARFLAGS := rcs # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif # --- Determine the finalizer and related flags --- FINALIZER := pnacl-finalize diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index d03857a44..da4e5bff1 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the 
linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 736e5ee4d..f2f86ba07 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index 082a73f92..0a779b188 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 37de32882..98f3222e0 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif From 126482a3b609b9ad7026ba348f6c4bf6a29be8a1 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 25 Nov 2016 18:29:49 -0600 Subject: [PATCH 02/64] Implemented the 1m method. Details: - Implemented the 1m method for inducing complex domain matrix multiplication. 1m support has been added to all level-3 operations, including trsm, and is now the default induced method when native complex domain gemm microkernels are omitted from the configuration. - Updated _cntx_init() operations to take a datatype parameter. 
This was needed for the corresponding function for 1m (because 1m requires us to choose between column-oriented and row-oriented execution, which requires us to query the context for the storage preference of the gemm microkernel, which requires knowing the datatype), but I decided that it made sense for consistency to add the parameter to all other cntx initialization functions as well, even though those functions don't use the parameter. - Updated bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs() to take a second scalar for each blocksize entry. The semantic meaning of the two scalars now is that the first will scale the default blocksize while the second will scale the maximum blocksize. This allows scaling the two independently, and was needed to support 1m, which requires scaling for a register blocksize but not the register storage blocksize (i.e., "packdim") analogue. - Deprecated bli_blksz_reduce_dt_to() and defined two new functions, bli_blksz_reduce_def_to() and bli_blksz_reduce_max_to(), for reducing default and maximum blocksizes to some desired blocksize multiple. These functions are needed in the updated definitions of bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs(). - Added support for the 1e and 1r packing schemas to packm, including 1e/1r packing kernels. - Added a minor optimization to bli_gemm_ker_var2() that allows, under certain circumstances (specifically, real domain beta and row- or column-stored matrix C), the real domain macrokernel and microkernel to be called directly, rather than using the virtual microkernel via the complex domain macrokernel, which carries a slight additional amount of overhead. - Added 1m support to the testsuite. - Added 1m support to Makefile and runme.sh in test/3m4m. Also simplified some code in the test_gemm.c driver.
--- config/haswell/bli_kernel.h | 44 +- frame/1/bli_l1v_cntx.c | 22 +- frame/1/bli_l1v_cntx.h | 2 +- frame/1/bli_l1v_tapi.c | 20 +- frame/1d/bli_l1d_cntx.c | 4 +- frame/1d/bli_l1d_cntx.h | 2 +- frame/1d/bli_l1d_tapi.c | 10 +- frame/1f/bli_l1f_cntx.c | 20 +- frame/1f/bli_l1f_cntx.h | 2 +- frame/1f/bli_l1f_tapi.c | 10 +- frame/1m/bli_l1m_cntx.c | 10 +- frame/1m/bli_l1m_cntx.h | 2 +- frame/1m/bli_l1m_ft.h | 1 + frame/1m/bli_l1m_tapi.c | 25 +- frame/1m/packm/bli_packm.h | 2 + frame/1m/packm/bli_packm_blk_var1.c | 6 + frame/1m/packm/bli_packm_cntx.c | 2 +- frame/1m/packm/bli_packm_cntx.h | 2 +- frame/1m/packm/bli_packm_cxk_1er.c | 489 +++++++ frame/1m/packm/bli_packm_cxk_1er.h | 55 + frame/1m/packm/bli_packm_struc_cxk_1er.c | 610 ++++++++ frame/1m/packm/bli_packm_struc_cxk_1er.h | 117 ++ .../1m/packm/ukernels/bli_packm_cxk_1e_ref.c | 1099 +++++++++++++++ .../1m/packm/ukernels/bli_packm_cxk_1e_ref.h | 62 + .../1m/packm/ukernels/bli_packm_cxk_1r_ref.c | 1254 +++++++++++++++++ .../1m/packm/ukernels/bli_packm_cxk_1r_ref.h | 61 + frame/2/bli_l2_cntx.c | 50 +- frame/2/bli_l2_cntx.h | 2 +- frame/2/bli_l2_tapi.c | 39 +- frame/3/bli_l3_cntx.c | 8 +- frame/3/bli_l3_cntx.h | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 20 + frame/base/bli_blksz.c | 61 + frame/base/bli_blksz.h | 25 + frame/base/bli_cntx.c | 147 +- frame/base/bli_cntx.h | 8 +- frame/base/bli_gks.c | 173 ++- frame/base/bli_memsys.c | 6 +- frame/include/bli_param_macro_defs.h | 13 + frame/include/bli_scalar_macro_defs.h | 37 + frame/include/bli_type_defs.h | 17 +- frame/include/level0/1e/bli_copy1es.h | 53 + frame/include/level0/1e/bli_copyj1es.h | 53 + frame/include/level0/1e/bli_invert1es.h | 53 + frame/include/level0/1e/bli_scal1es.h | 53 + frame/include/level0/1e/bli_scal21es.h | 65 + frame/include/level0/1e/bli_scal2j1es.h | 65 + .../level0/1m/bli_invert1ms_mxn_diag.h | 126 ++ frame/include/level0/1m/bli_scal1ms_mxn.h | 124 ++ .../include/level0/1m/bli_scal21ms_mxn_diag.h | 126 ++ 
.../include/level0/1m/bli_scal21ms_mxn_uplo.h | 296 ++++ frame/include/level0/1m/bli_set1ms_mxn.h | 164 +++ frame/include/level0/1m/bli_set1ms_mxn_diag.h | 130 ++ frame/include/level0/1m/bli_set1ms_mxn_uplo.h | 198 +++ .../include/level0/1m/bli_seti01ms_mxn_diag.h | 114 ++ frame/include/level0/1r/bli_copy1rs.h | 51 + frame/include/level0/1r/bli_copyj1rs.h | 51 + frame/include/level0/1r/bli_invert1rs.h | 43 + frame/include/level0/1r/bli_scal1rs.h | 61 + frame/include/level0/1r/bli_scal21rs.h | 61 + frame/include/level0/1r/bli_scal2j1rs.h | 61 + frame/ind/bli_ind.c | 7 +- frame/ind/bli_ind.h | 3 + frame/ind/bli_l3_ind.c | 4 + frame/ind/cntx/bli_gemmind_cntx.c | 290 ++-- frame/ind/cntx/bli_gemmind_cntx.h | 45 +- frame/ind/cntx/bli_trsmind_cntx.c | 127 +- frame/ind/cntx/bli_trsmind_cntx.h | 26 +- frame/ind/include/bli_kernel_1m_macro_defs.h | 107 ++ frame/ind/include/bli_kernel_ind_macro_defs.h | 2 + .../include/bli_kernel_ind_pre_macro_defs.h | 29 + frame/ind/include/bli_packm_1er_macro_defs.h | 241 ++++ frame/ind/include/bli_packm_3mis_macro_defs.h | 3 - frame/ind/include/bli_packm_4mi_macro_defs.h | 3 - .../include/bli_packm_ind_pre_macro_defs.h | 97 ++ frame/ind/include/bli_packm_rih_macro_defs.h | 3 - frame/ind/misc/bli_l3_ind_opt.h | 78 + ...li_l3_3m4m_oapi.c => bli_l3_3m4m1m_oapi.c} | 39 +- frame/ind/oapi/bli_l3_ind_oapi.h | 1 + frame/ind/oapi/bli_l3_nat_oapi.c | 15 +- frame/ind/tapi/bli_l3_ind_tapi.c | 10 + frame/ind/tapi/bli_l3_ind_tapi.h | 10 + frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c | 179 +++ frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h | 1 + .../ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c | 244 ++++ .../ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h | 3 + frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c | 448 ++++++ .../ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h | 3 + .../trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c | 240 ---- .../trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c | 222 --- .../trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c | 215 --- .../trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c | 203 
--- .../ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c | 160 --- .../ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c | 160 --- .../ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c | 169 --- .../ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c | 152 -- frame/util/bli_util_tapi.c | 14 +- test/3m4m/Makefile | 33 +- test/3m4m/runme.sh | 6 +- test/3m4m/test_gemm.c | 24 +- testsuite/input.general | 5 +- testsuite/src/test_axpy2v.c | 2 +- testsuite/src/test_axpyf.c | 2 +- testsuite/src/test_dotaxpyv.c | 2 +- testsuite/src/test_dotxaxpyf.c | 2 +- testsuite/src/test_dotxf.c | 2 +- testsuite/src/test_gemm.c | 2 - testsuite/src/test_gemm_ukr.c | 2 +- testsuite/src/test_gemmtrsm_ukr.c | 2 +- testsuite/src/test_libblis.c | 68 +- testsuite/src/test_trsm_ukr.c | 2 +- version | 2 +- 112 files changed, 8158 insertions(+), 2040 deletions(-) create mode 100644 frame/1m/packm/bli_packm_cxk_1er.c create mode 100644 frame/1m/packm/bli_packm_cxk_1er.h create mode 100644 frame/1m/packm/bli_packm_struc_cxk_1er.c create mode 100644 frame/1m/packm/bli_packm_struc_cxk_1er.h create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c create mode 100644 frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h create mode 100644 frame/include/level0/1e/bli_copy1es.h create mode 100644 frame/include/level0/1e/bli_copyj1es.h create mode 100644 frame/include/level0/1e/bli_invert1es.h create mode 100644 frame/include/level0/1e/bli_scal1es.h create mode 100644 frame/include/level0/1e/bli_scal21es.h create mode 100644 frame/include/level0/1e/bli_scal2j1es.h create mode 100644 frame/include/level0/1m/bli_invert1ms_mxn_diag.h create mode 100644 frame/include/level0/1m/bli_scal1ms_mxn.h create mode 100644 frame/include/level0/1m/bli_scal21ms_mxn_diag.h create mode 100644 frame/include/level0/1m/bli_scal21ms_mxn_uplo.h create mode 100644 frame/include/level0/1m/bli_set1ms_mxn.h create mode 100644 
frame/include/level0/1m/bli_set1ms_mxn_diag.h create mode 100644 frame/include/level0/1m/bli_set1ms_mxn_uplo.h create mode 100644 frame/include/level0/1m/bli_seti01ms_mxn_diag.h create mode 100644 frame/include/level0/1r/bli_copy1rs.h create mode 100644 frame/include/level0/1r/bli_copyj1rs.h create mode 100644 frame/include/level0/1r/bli_invert1rs.h create mode 100644 frame/include/level0/1r/bli_scal1rs.h create mode 100644 frame/include/level0/1r/bli_scal21rs.h create mode 100644 frame/include/level0/1r/bli_scal2j1rs.h create mode 100644 frame/ind/include/bli_kernel_1m_macro_defs.h create mode 100644 frame/ind/include/bli_packm_1er_macro_defs.h create mode 100644 frame/ind/misc/bli_l3_ind_opt.h rename frame/ind/oapi/{bli_l3_3m4m_oapi.c => bli_l3_3m4m1m_oapi.c} (92%) create mode 100644 frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c create mode 100644 frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c create mode 100644 frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c delete mode 100644 frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 6eee7c483..ce18dc266 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -51,17 +51,6 @@ // -- sgemm micro-kernel -- -#if 1 -#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 -#define BLIS_DEFAULT_MC_S 144 -#define BLIS_DEFAULT_KC_S 256 -#define BLIS_DEFAULT_NC_S 4080 -#define BLIS_DEFAULT_MR_S 6 -#define BLIS_DEFAULT_NR_S 16 - -#define 
BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS -#endif - #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 @@ -73,6 +62,17 @@ #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif +#if 1 +#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 +#define BLIS_DEFAULT_MC_S 144 +#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_NC_S 4080 +#define BLIS_DEFAULT_MR_S 6 +#define BLIS_DEFAULT_NR_S 16 + +#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 @@ -84,17 +84,6 @@ // -- dgemm micro-kernel -- -#if 1 -#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 -#define BLIS_DEFAULT_MC_D 72 -#define BLIS_DEFAULT_KC_D 256 -#define BLIS_DEFAULT_NC_D 4080 -#define BLIS_DEFAULT_MR_D 6 -#define BLIS_DEFAULT_NR_D 8 - -#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS -#endif - #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 @@ -106,6 +95,17 @@ #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif +#if 1 +#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 +#define BLIS_DEFAULT_MC_D 72 +#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_NC_D 4080 +#define BLIS_DEFAULT_MR_D 6 +#define BLIS_DEFAULT_NR_D 8 + +#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index bdbb0063f..149c20320 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -41,7 +41,7 @@ #undef GENFRONT #define GENFRONT( opname, kertype ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ @@ -68,15 +68,15 @@ GENFRONT( swapv, BLIS_SWAPV_KER ) #undef GENFRONT #define GENFRONT( opname, kertype, dep1, dep2, dep3, dep4 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ 
bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(dep1,_cntx_init)( cntx ); \ - PASTEMAC(dep2,_cntx_init)( cntx ); \ - PASTEMAC(dep3,_cntx_init)( cntx ); \ - PASTEMAC(dep4,_cntx_init)( cntx ); \ + PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep2,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep3,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep4,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -93,12 +93,12 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -116,13 +116,13 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv ) #undef GENFRONT #define GENFRONT( opname, kertype, dep1, dep2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(dep1,_cntx_init)( cntx ); \ - PASTEMAC(dep2,_cntx_init)( cntx ); \ + PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ diff --git a/frame/1/bli_l1v_cntx.h b/frame/1/bli_l1v_cntx.h index 95cd4a131..85756363b 100644 --- a/frame/1/bli_l1v_cntx.h +++ b/frame/1/bli_l1v_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addv ) diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 74a548eea..6abf002f5 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -53,7 +53,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -88,7 +88,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -123,7 +123,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -160,7 +160,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -198,7 +198,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, 
kerid, cntx_p ); \ \ @@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -274,7 +274,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -306,7 +306,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -340,7 +340,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -373,7 +373,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ diff --git a/frame/1d/bli_l1d_cntx.c b/frame/1d/bli_l1d_cntx.c index d285995b1..443dc20f7 100644 --- a/frame/1d/bli_l1d_cntx.c +++ b/frame/1d/bli_l1d_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ diff --git a/frame/1d/bli_l1d_cntx.h b/frame/1d/bli_l1d_cntx.h index 50db79738..e5ab92f51 100644 --- a/frame/1d/bli_l1d_cntx.h +++ b/frame/1d/bli_l1d_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addd ) diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index 5ef92603a..c8a67a138 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -90,7 +90,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -166,7 +166,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -222,7 +222,7 @@ void PASTEMAC(ch,opname) \ x1 = x + offx; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -276,7 +276,7 @@ void PASTEMAC(ch,opname) \ x1 = x + offx; \ \ /* Initialize a local context if the given context is NULL. 
*/ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -349,7 +349,7 @@ void PASTEMAC(ch,opname) \ incx = 2*incx; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(chr,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx_p ); \ diff --git a/frame/1f/bli_l1f_cntx.c b/frame/1f/bli_l1f_cntx.c index 379cbce7d..58ca4a07c 100644 --- a/frame/1f/bli_l1f_cntx.c +++ b/frame/1f/bli_l1f_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -63,13 +63,13 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ @@ -86,12 +86,12 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -114,13 +114,13 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ diff --git a/frame/1f/bli_l1f_cntx.h b/frame/1f/bli_l1f_cntx.h index 86b3af25f..bea56ca40 100644 --- a/frame/1f/bli_l1f_cntx.h +++ b/frame/1f/bli_l1f_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( axpy2v ) diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index a7efd91f8..8c77a2465 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -99,7 +99,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -142,7 +142,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -190,7 +190,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = 
bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ diff --git a/frame/1m/bli_l1m_cntx.c b/frame/1m/bli_l1m_cntx.c index 8569416fd..7eb3dcd4c 100644 --- a/frame/1m/bli_l1m_cntx.c +++ b/frame/1m/bli_l1m_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ @@ -64,13 +64,13 @@ GENFRONT( subm, subv ) #undef GENFRONT #define GENFRONT( opname, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ diff --git a/frame/1m/bli_l1m_cntx.h b/frame/1m/bli_l1m_cntx.h index 46524fa0b..79e0524e8 100644 --- a/frame/1m/bli_l1m_cntx.h +++ b/frame/1m/bli_l1m_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addm ) diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index 4361c9fac..2790bd006 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -91,6 +91,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ ); INSERT_GENTDEF( packm_cxk_ker ) +INSERT_GENTDEF( packm_cxk_1er_ker ) // packm_3mis_ker diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index c4dc5f9a8..13da24e59 
100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -54,12 +54,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -118,12 +119,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -187,7 +189,8 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ @@ -195,7 +198,7 @@ void PASTEMAC(ch,opname) \ if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -256,12 +259,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. 
*/ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If alpha is zero, then we set the output matrix to zero. This seemingly minor optimization is important because it will clear @@ -344,12 +348,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 7a44ecb9f..991487dfd 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -48,9 +48,11 @@ #include "bli_packm_struc_cxk_4mi.h" #include "bli_packm_struc_cxk_3mis.h" #include "bli_packm_struc_cxk_rih.h" +#include "bli_packm_struc_cxk_1er.h" #include "bli_packm_cxk.h" #include "bli_packm_cxk_4mi.h" #include "bli_packm_cxk_3mis.h" #include "bli_packm_cxk_rih.h" +#include "bli_packm_cxk_1er.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 4ce7b1504..055d30f1f 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -90,6 +90,12 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = // 0111 row/col panels: real+imaginary only { { NULL, bli_cpackm_struc_cxk_rih, NULL, bli_zpackm_struc_cxk_rih, } }, +// 1000 row/col panels: 1m-expanded (1e) + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, +// 1001 row/col panels: 1m-reordered (1r) + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, }; diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 4f570400a..2f4e0b030 100644 --- 
a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -39,7 +39,7 @@ // Define context initialization functions. // -void bli_packm_cntx_init( cntx_t* cntx ) +void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) { bli_cntx_obj_create( cntx ); diff --git a/frame/1m/packm/bli_packm_cntx.h b/frame/1m/packm/bli_packm_cntx.h index 1ab4df826..2210a777b 100644 --- a/frame/1m/packm/bli_packm_cntx.h +++ b/frame/1m/packm/bli_packm_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( packm ) diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/packm/bli_packm_cxk_1er.c new file mode 100644 index 000000000..352ae8353 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_1er.c @@ -0,0 +1,489 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_cxk_1er_ker_vft + +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 32 +/* Kernel lookup tables, indexed as [micro-panel width][datatype]. The packm_cxk_1er front-end in this file reads ftypes_e for 1e-packed schemas and ftypes_r otherwise; a NULL entry, or a width >= FUNCPTR_ARRAY_LENGTH, makes it fall back to its inlined loops. */ +static FUNCPTR_T ftypes_e[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_1E_KERNEL, + NULL, BLIS_ZPACKM_2XK_1E_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_1E_KERNEL, + NULL, BLIS_ZPACKM_4XK_1E_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_1E_KERNEL, + NULL, BLIS_ZPACKM_6XK_1E_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_1E_KERNEL, + NULL, BLIS_ZPACKM_8XK_1E_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_1E_KERNEL, + NULL, BLIS_ZPACKM_10XK_1E_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /*
micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_1E_KERNEL, + NULL, BLIS_ZPACKM_12XK_1E_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_1E_KERNEL, + NULL, BLIS_ZPACKM_14XK_1E_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_1E_KERNEL, + NULL, BLIS_ZPACKM_16XK_1E_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 18 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 19 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 20 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 21 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 22 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 23 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 24 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 25 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 26 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 27 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 28 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 29 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 30 */ + { + NULL, BLIS_CPACKM_30XK_1E_KERNEL, + NULL, BLIS_ZPACKM_30XK_1E_KERNEL, + }, + /* micro-panel width = 31 */ + { + NULL, NULL, NULL, NULL, + }, +}; + +static FUNCPTR_T ftypes_r[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_1R_KERNEL, + NULL, BLIS_ZPACKM_2XK_1R_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, BLIS_CPACKM_3XK_1R_KERNEL, + NULL, BLIS_ZPACKM_3XK_1R_KERNEL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_1R_KERNEL, +
NULL, BLIS_ZPACKM_4XK_1R_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_1R_KERNEL, + NULL, BLIS_ZPACKM_6XK_1R_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_1R_KERNEL, + NULL, BLIS_ZPACKM_8XK_1R_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_1R_KERNEL, + NULL, BLIS_ZPACKM_10XK_1R_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_1R_KERNEL, + NULL, BLIS_ZPACKM_12XK_1R_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_1R_KERNEL, + NULL, BLIS_ZPACKM_14XK_1R_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_1R_KERNEL, + NULL, BLIS_ZPACKM_16XK_1R_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 18 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 19 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 20 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 21 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 22 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 23 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 24 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 25 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 26 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 27 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 28 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 29 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 30 */ + { + NULL,
BLIS_CPACKM_30XK_1R_KERNEL, + NULL, BLIS_ZPACKM_30XK_1R_KERNEL, + }, + /* micro-panel width = 31 */ + { + NULL, NULL, NULL, NULL, + }, +}; + + + +/* packm_cxk_1er: front-end that packs one panel_dim x panel_len micro-panel of a into p in the 1e or 1r format selected by schema, passing kappa through as the scalar. It calls the table kernel registered for this width and datatype when there is one; otherwise it runs the inlined loops (scal21es or scal2j1es for 1e, scal2ris or scal2jris for 1r), choosing the "j" form when conja requests conjugation. cntx is not referenced in this body. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + num_t dt; \ + FUNCPTR_T f; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the micro-panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) \ + { \ + if ( bli_is_1e_packed( schema ) ) f = ftypes_e[panel_dim][dt]; \ + else /*( bli_is_1r_packed( schema ) )*/ f = ftypes_r[panel_dim][dt]; \ + } \ + else f = NULL; \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. */ \ + if ( f != NULL ) \ + { \ + f \ + ( \ + conja, \ + panel_len, \ + kappa, \ + a, inca, lda, \ + p, ldp \ + ); \ + } \ + else \ + { \ + dim_t i, j; \ +\ + if ( bli_is_1e_packed( schema ) ) \ + { \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict a_ri = ( ctype* )a; \ + ctype* restrict p_ri = ( ctype* )p; \ + ctype* restrict p_ir = ( ctype* )p + ldp/2; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage.
*/ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \ + ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal2j1es)( *kappa_cast, \ + *alpha11_ri, \ + *pi11_ri, \ + *pi11_ir ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \ + ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal21es)( *kappa_cast, \ + *alpha11_ri, \ + *pi11_ri, \ + *pi11_ir ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + 1; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + ctype_r* restrict p_i = ( ctype_r* )p + ldp; \ + const dim_t inca2 = 2*inca; \ + const dim_t lda2 = 2*lda; \ + const dim_t ldp2 = 2*ldp; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage.
*/ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \ +\ + PASTEMAC(ch,scal2jris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \ +\ + PASTEMAC(ch,scal2ris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_cxk_1er.h b/frame/1m/packm/bli_packm_cxk_1er.h new file mode 100644 index 000000000..bd87216d0 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_1er.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_packm_cxk_1e_ref.h" +#include "bli_packm_cxk_1r_ref.h" + +/* Prototype of the packm_cxk_1er packing front-end: conja, schema, the micro-panel dimensions panel_dim and panel_len, the scalar kappa, the source a with strides inca and lda, the destination p with leading dimension ldp, and a context pointer. */ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c new file mode 100644 index 000000000..6ed34808f --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -0,0 +1,610 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +/* packm_struc_cxk_1er: packs one micro-panel of c into p for a 1e/1r schema. It derives panel_dim, panel_len and the incc, ldc, ldp strides from whether schema is column- or row-packed, then dispatches on strucc: general panels go directly to the kername front-end, Hermitian/symmetric panels to packm_herm_cxk_1er, and everything else to packm_tri_cxk_1er. Afterwards it fills the padding up to m_panel_max x n_panel_max with zeros and, for triangular panels padded in both dimensions, writes ones on the diagonal of the padded corner. is_p is not referenced in this body. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* Determine the dimensions and relative strides of the micro-panel + based on its pack schema. */ \ + if ( bli_is_col_packed( schema ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_row_packed( schema ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices.
*/ \ + PASTEMAC(ch,packm_herm_cxk_1er) \ + ( \ + strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + schema, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp, \ + cntx \ + ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk_1er) \ + ( \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp, \ + cntx \ + ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases.
*/ \ + { \ + if ( m_panel != m_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t offm = m_panel; \ + dim_t offn = 0; \ + dim_t m_edge = m_panel_max - m_panel; \ + dim_t n_edge = n_panel_max; \ +\ + PASTEMAC(ch,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t offm = 0; \ + dim_t offn = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - n_panel; \ +\ + PASTEMAC(ch,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this micro-panel is an edge case in both panel dimension and + length, then it must be a bottom-right corner case, which + typically only happens for micro-panels being packed for trsm. + (It also happens for trmm if kr > 1.) Here, we set the part of + the diagonal that extends into the zero-padded region to + identity. This prevents NaNs and Infs from creeping into the + computation. If this code does execute for trmm, it is okay, + because those 1.0's that extend into the bottom-right region + end up getting multiplied by the 0.0's in the zero-padded region + of the other matrix.
*/ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t offm = m_panel; \ + dim_t offn = n_panel; \ + dim_t m_edge = m_panel_max - m_panel; \ + dim_t n_edge = n_panel_max - n_panel; \ +\ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + one, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ + } \ + } \ +\ +\ +/* + if ( bli_is_1r_packed( schema ) ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ + \ + if ( bli_is_1e_packed( schema ) ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +} + +INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) + + + +/* packm_herm_cxk_1er: helper that packs a micro-panel of a Hermitian or symmetric matrix. A panel that does not intersect the diagonal is packed with a single kername call; when bli_is_unstored_subpart_n reports it unstored, c is first offset by diagoffc, incc and ldc are swapped, and conjc is toggled for Hermitian structure. A panel that does intersect the diagonal is packed as two kername calls (p10 and p12), after which the stored triangle of c11 is written into p11 and, for Hermitian structure, the diagonal is rewritten from the real part of c11. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + doff_t diagoffc_abs; \ + dim_t j; \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + row_stored = bli_is_col_packed( schema ); \ + col_stored = bli_is_row_packed( schema ); \ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect.
*/ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype* restrict c10; \ + ctype* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then the constraint that + cache blocksizes be whole multiples of the register + blocksizes was somehow violated.
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp, \ + cntx \ + ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp, \ + cntx \ + ); \ +\ + /* Pack the stored triangle of c11 to p11.
*/ \ + { \ + dim_t j = diagoffc_abs; \ + ctype* restrict c11 = c + (j )*ldc; \ + ctype* restrict p11 = p + (j )*ldp; \ +\ + PASTEMAC(ch,scal21ms_mxn_uplo) \ + ( \ + schema, \ + uploc, \ + conjc, \ + panel_dim, \ + kappa, \ + c11, rs_c, cs_c, \ + p11, rs_p, cs_p, ldp \ + ); \ +\ + /* If we are packing a micro-panel with Hermitian structure, + we must take special care of the diagonal. Now, if kappa + were guaranteed to be unit, all we would need to do is + explicitly zero out the imaginary part of the diagonal of + p11, in case the diagonal of the source matrix contained + garbage (non-zero) imaginary values. HOWEVER, since kappa + can be non-unit, things become a little more complicated. + In general, we must re-apply the kappa scalar to ONLY the + real part of the diagonal of the source matrix and save + the result to the diagonal of p11. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + ctype_r* restrict c11_r = ( ctype_r* )c11; \ + const dim_t rs_c2 = 2*rs_c; \ + const dim_t cs_c2 = 2*cs_c; \ +\ + PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \ + ( \ + schema, \ + panel_dim, \ + panel_dim, \ + kappa, \ + c11_r, rs_c2, cs_c2, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er ) + + + +/* packm_tri_cxk_1er: helper that packs a micro-panel of a triangular matrix. It packs the whole panel through kername and then edits the diagonal block p11 (p advanced by bli_abs( diagoffp ) leading dimensions) in place: the diagonal is set to kappa when diagc is unit, inverted when invdiag is TRUE, and the triangle on the side opposite uploc is zero-filled. */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + doff_t diagoffp_abs = bli_abs( diagoffp ); \ + ctype* p11 = p + (diagoffp_abs )*ldp; \ +\ +\ + /* Pack the panel.
*/ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ +\ +\ + /* Tweak the panel according to its triangular structure */ \ + { \ + /* If the diagonal of c is implicitly unit, explicitly set the + diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + 0, \ + 0, \ + panel_dim, \ + panel_dim, \ + kappa, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ +\ +\ + /* If requested, invert the diagonal of the packed panel. */ \ + if ( invdiag == TRUE ) \ + { \ + PASTEMAC(ch,invert1ms_mxn_diag) \ + ( \ + schema, \ + 0, \ + 0, \ + panel_dim, \ + panel_dim, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ +\ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel.*/ \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + uplo_t uplop = uploc; \ + doff_t diagoffp11_0 = 0; \ + dim_t p11_0_dim = panel_dim - 1; \ +\ + bli_toggle_uplo( uplop ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp11_0 ); \ +\ + /* Note that this macro works a little differently than the setm + operation. Here, we pass in the dimensions of only p11, rather + than the whole micro-panel, and furthermore we pass in the + "shrunken" dimensions of p11, corresponding to the toggling + and shrinking of the diagonal above. The macro will do the + right thing, incrementing the pointer to p11 by the appropriate + leading dimension (cs_p or rs_p), and setting only the lower + or upper triangle to zero.
*/ \ + PASTEMAC(ch,set1ms_mxn_uplo) \ + ( \ + schema, \ + diagoffp11_0, \ + uplop, \ + p11_0_dim, \ + p11_0_dim, \ + zero, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h new file mode 100644 index 000000000..b0b1d0a2f --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +/* Prototype template for packm_struc_cxk_1er; instantiated per type by the INSERT_GENTPROTCO_BASIC line that follows. Unlike the two prototypes after it, this one takes is_p and has no panel_dim, panel_len, incc, ldc or ldp parameters. */ \ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_struc_cxk_1er ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +/* Prototype template for packm_herm_cxk_1er. Same parameter list as the packm_tri_cxk_1er prototype below, minus diagc and invdiag. */ \ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_1er ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +/* Prototype template for packm_tri_cxk_1er. NOTE(review): the definition of packm_tri_cxk_1er names the second parameter diagoffp, not diagoffc; the compiler does not care, but the two should be aligned. */ \ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ +
dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_1er ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c new file mode 100644 index 000000000..9f2acdce8 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c @@ -0,0 +1,1099 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Reference 1e packing kernel for a 2-row micro-panel. Each of the n iterations reads 2 elements of a spaced inca apart and passes each one, along with the destinations *(pi1_ri + i) and *(pi1_ir + i), to a 1es macro; a then advances by lda and both destinations by ldp. pi1_ir starts ldp/2 elements past p. The scal2 variants also receive kappa and are used unless PASTEMAC(ch,eq1) holds for kappa (presumably kappa == 1); the j variants are used when bli_is_conj( conja ) holds. */ \ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +
1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_2xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Same scheme as packm_2xk_1e_ref, unrolled for 4 rows per column. */ \ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ +
pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_4xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Same scheme as packm_2xk_1e_ref, unrolled for 6 rows per column. */ \ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1),
*(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_6xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Same scheme as packm_2xk_1e_ref, unrolled for 8 rows per column. */ \ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ +
PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2),
*(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_8xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Same scheme as packm_2xk_1e_ref, unrolled for 10 rows per column. */ \ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri
+ 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7),
*(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_10xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Same scheme as packm_2xk_1e_ref, unrolled for 12 rows per column. */ \ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if (
PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)(
*(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)(
*kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_12xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +/* Same scheme as packm_2xk_1e_ref, unrolled for 14 rows per column. */ \ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)(
*(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ +
PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri +
2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_14xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri
+ 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + 
PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + 
PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, 
*(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_16xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri 
+13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( 
*(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir 
+= ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, 
*(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), 
*(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri 
+29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_30xk_1e_ref ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h new file mode 100644 index 000000000..beebdafdc --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Redefine the level-1m packm kernel API names as their 1e reference counterparts to induce prototypes (presumably via the template included below). + +#undef packm_2xk_ker_name +#define packm_2xk_ker_name packm_2xk_1e_ref +// 1e format should probably never have an odd-numbered register blocking, so the 3xk mapping below is left commented out. +//#undef packm_3xk_ker_name +//#define packm_3xk_ker_name packm_3xk_1e_ref +#undef packm_4xk_ker_name +#define packm_4xk_ker_name packm_4xk_1e_ref +#undef packm_6xk_ker_name +#define packm_6xk_ker_name packm_6xk_1e_ref +#undef packm_8xk_ker_name +#define packm_8xk_ker_name packm_8xk_1e_ref +#undef packm_10xk_ker_name +#define packm_10xk_ker_name packm_10xk_1e_ref +#undef packm_12xk_ker_name +#define packm_12xk_ker_name packm_12xk_1e_ref +#undef packm_14xk_ker_name +#define packm_14xk_ker_name packm_14xk_1e_ref +#undef packm_16xk_ker_name +#define packm_16xk_ker_name packm_16xk_1e_ref +#undef packm_30xk_ker_name +#define packm_30xk_ker_name packm_30xk_1e_ref + +// Include the level-1m kernel API template (after the kernel names above have been redefined). + +#include "bli_l1m_ker.h" + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c new file mode 100644 index 000000000..6e30ca5bc --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c @@ -0,0 +1,1254 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_2xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += 
lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_3xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ 
+ for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_4xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += 
ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); 
\ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_6xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_8xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + 
const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 
2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 
8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_10xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t 
inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r 
+ 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_12xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 
5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), 
*(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + 
PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_14xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), 
*(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i 
+ 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, 
*kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, 
*kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_16xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), 
*(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + 
PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +27*inca2), *(alpha1_i 
+27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), 
*(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +28*inca2), *(alpha1_i 
+28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i 
+13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), 
*(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_30xk_1r_ref ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h new file mode 100644 index 000000000..a6e3f0aef --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Redefine level-1m kernel API names to induce prototypes. + +#undef packm_2xk_ker_name +#define packm_2xk_ker_name packm_2xk_1r_ref +#undef packm_3xk_ker_name +#define packm_3xk_ker_name packm_3xk_1r_ref +#undef packm_4xk_ker_name +#define packm_4xk_ker_name packm_4xk_1r_ref +#undef packm_6xk_ker_name +#define packm_6xk_ker_name packm_6xk_1r_ref +#undef packm_8xk_ker_name +#define packm_8xk_ker_name packm_8xk_1r_ref +#undef packm_10xk_ker_name +#define packm_10xk_ker_name packm_10xk_1r_ref +#undef packm_12xk_ker_name +#define packm_12xk_ker_name packm_12xk_1r_ref +#undef packm_14xk_ker_name +#define packm_14xk_ker_name packm_14xk_1r_ref +#undef packm_16xk_ker_name +#define packm_16xk_ker_name packm_16xk_1r_ref +#undef packm_30xk_ker_name +#define packm_30xk_ker_name packm_30xk_1r_ref + +// Include the level-1m kernel API template. + +#include "bli_l1m_ker.h" + diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index 841217365..fdfe27a85 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -41,7 +41,7 @@ #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -50,20 +50,20 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ operation. 
*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \ - bli_axpyf_cntx_init( cntx ); \ - bli_dotxf_cntx_init( cntx ); \ + bli_axpyf_cntx_init( dt, cntx ); \ + bli_dotxf_cntx_init( dt, cntx ); \ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ - bli_dotxv_cntx_init( cntx ); \ - bli_scalv_cntx_init( cntx ); \ - bli_setv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ + bli_dotxv_cntx_init( dt, cntx ); \ + bli_scalv_cntx_init( dt, cntx ); \ + bli_setv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ @@ -88,7 +88,7 @@ GENFRONT( trsv ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -96,10 +96,10 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ /* Initialize the context with kernels employed by the current operation. */ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. 
*/ \ @@ -122,7 +122,7 @@ GENFRONT( syr ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -133,22 +133,22 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXAXPYF_KER, cntx );*/ \ - bli_dotaxpyv_cntx_init( cntx ); \ - bli_axpyf_cntx_init( cntx ); \ - bli_dotxf_cntx_init( cntx ); \ - bli_dotxaxpyf_cntx_init( cntx ); \ + bli_dotaxpyv_cntx_init( dt, cntx ); \ + bli_axpyf_cntx_init( dt, cntx ); \ + bli_dotxf_cntx_init( dt, cntx ); \ + bli_dotxaxpyf_cntx_init( dt, cntx ); \ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ - bli_dotxv_cntx_init( cntx ); \ - bli_scalv_cntx_init( cntx ); \ - bli_setv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ + bli_dotxv_cntx_init( dt, cntx ); \ + bli_scalv_cntx_init( dt, cntx ); \ + bli_setv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ @@ -173,7 +173,7 @@ GENFRONT( symv ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -182,11 +182,11 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ operation. 
*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPY2V_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ - bli_axpy2v_cntx_init( cntx ); \ - bli_axpyv_cntx_init( cntx ); \ + bli_axpy2v_cntx_init( dt, cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ diff --git a/frame/2/bli_l2_cntx.h b/frame/2/bli_l2_cntx.h index 8b6566f55..a3bafa0c8 100644 --- a/frame/2/bli_l2_cntx.h +++ b/frame/2/bli_l2_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( gemv ) diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c index 24558fd9d..f2681d7d8 100644 --- a/frame/2/bli_l2_tapi.c +++ b/frame/2/bli_l2_tapi.c @@ -55,8 +55,9 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ - dim_t m_y, n_x; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ + dim_t m_y, n_x; \ \ /* Determine the dimensions of y and x. */ \ bli_set_dims_with_trans( transa, m, n, m_y, n_x ); \ @@ -65,7 +66,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( m_y ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. */ \ @@ -135,13 +136,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x or y has zero elements, or if alpha is zero, return early. 
*/ \ if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -188,10 +190,11 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. */ \ @@ -261,8 +264,9 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ - ctype alpha_local; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ + ctype alpha_local; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \ @@ -273,7 +277,7 @@ void PASTEMAC(ch,opname) \ PASTEMAC2(chr,ch,copys)( *alpha, alpha_local ); \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -324,13 +328,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. 
*/ \ PASTECH2(ch,ftname,_ft) f; \ @@ -383,13 +388,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -444,10 +450,11 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, return early. */ \ if ( bli_zero_dim1( m ) ) return; \ diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 634e4c1ab..8b4b01572 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -38,7 +38,7 @@ // Define context initialization functions. // -void bli_gemm_cntx_init( cntx_t* cntx ) +void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. bli_cntx_obj_clear( cntx ); @@ -49,7 +49,7 @@ void bli_gemm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), given the execution method. 
@@ -74,7 +74,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsm_cntx_init( cntx_t* cntx ) +void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. bli_cntx_obj_clear( cntx ); @@ -92,7 +92,7 @@ void bli_trsm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), given the execution method. diff --git a/frame/3/bli_l3_cntx.h b/frame/3/bli_l3_cntx.h index 21b756656..223fa5e25 100644 --- a/frame/3/bli_l3_cntx.h +++ b/frame/3/bli_l3_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( gemm ) diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 8af29594d..c27a0b67c 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -109,6 +109,26 @@ void bli_gemm_ker_var2 buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. +#if 1 + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } +#endif + // Index into the type combination array to extract the correct // function pointer. 
f = ftypes[dt_exec]; diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 833dadb42..0f8e38688 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -85,6 +85,7 @@ void bli_blksz_obj_free // ----------------------------------------------------------------------------- +#if 0 void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, @@ -116,6 +117,66 @@ void bli_blksz_reduce_dt_to bli_blksz_set_def( blksz_def, dt_bs, blksz ); bli_blksz_set_max( blksz_max, dt_bs, blksz ); } +#endif + +// ----------------------------------------------------------------------------- + +void bli_blksz_reduce_def_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) +{ + dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); + + dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); + + // If the blocksize multiple is zero, we do nothing. + if ( bmult_val == 0 ) return; + + // Round the default and maximum blocksize values down to their + // respective nearest multiples of bmult_val. (Notice that we + // ignore the "max" entry in the bmult object since that would + // correspond to the packing dimension, which plays no role + // as a blocksize multiple.) + blksz_def = ( blksz_def / bmult_val ) * bmult_val; + + // Make sure the new blocksize values are at least the blocksize + // multiple. + if ( blksz_def == 0 ) blksz_def = bmult_val; + + // Store the new blocksizes back to the object. + bli_blksz_set_def( blksz_def, dt_bs, blksz ); +} + +// ----------------------------------------------------------------------------- + +void bli_blksz_reduce_max_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) +{ + dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); + + dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); + + // If the blocksize multiple is zero, we do nothing. + if ( bmult_val == 0 ) return; + + // Round the blocksize values down to its nearest multiple of + // of bmult_val. 
(Notice that we ignore the "max" entry in the + // bmult object since that would correspond to the packing + // dimension, which plays no role as a blocksize multiple.) + blksz_max = ( blksz_max / bmult_val ) * bmult_val; + + // Make sure the new blocksize value is at least the blocksize + // multiple. + if ( blksz_max == 0 ) blksz_max = bmult_val; + + // Store the new blocksize back to the object. + bli_blksz_set_max( blksz_max, dt_bs, blksz ); +} // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index daffb3772..cfe2023e1 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -89,11 +89,23 @@ (b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \ } +#define bli_blksz_scale_def( num, den, dt, b ) \ +{ \ + (b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \ +} + +#define bli_blksz_scale_max( num, den, dt, b ) \ +{ \ + (b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \ +} + +#if 0 #define bli_blksz_scale_dt_by( num, den, dt, b ) \ { \ (b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \ (b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \ } +#endif // ----------------------------------------------------------------------------- @@ -121,12 +133,25 @@ void bli_blksz_obj_free // ----------------------------------------------------------------------------- +#if 0 void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); +#endif +void bli_blksz_reduce_def_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); + +void bli_blksz_reduce_max_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index bd9972332..e4299eb49 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -386,27 +386,27 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... 
) { /* Example prototypes: - void - bli_cntx_set_blkszs( + void bli_cntx_set_blkszs + ( + ind_t method = BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, + bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, + bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, + ... + cntx_t* cntx + ); - ind_t method = BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, - ... - cntx_t* cntx ); - - void - bli_cntx_set_blkszs( - - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t scalr0, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t scalr1, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t scalr2, - ... - cntx_t* cntx ); + void bli_cntx_set_blkszs + ( + ind_t method != BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0, + bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1, + bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2, + ... + cntx_t* cntx + ); */ va_list args; dim_t i; @@ -414,7 +414,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bszid_t* bszids; blksz_t** blkszs; bszid_t* bmults; - dim_t* scalrs; + double* dsclrs; + double* msclrs; cntx_t* cntx; @@ -426,7 +427,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) ); bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); - scalrs = bli_malloc_intl( n_bs * sizeof( dim_t ) ); + dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -444,9 +446,9 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... 
) // - the address of the blksz_t object, and // - the bszid_t of the multiple we need to associate with // the blksz_t object. - const bszid_t bs_id = va_arg( args, bszid_t ); - blksz_t* blksz = va_arg( args, blksz_t* ); - const bszid_t bm_id = va_arg( args, bszid_t ); + bszid_t bs_id = va_arg( args, bszid_t ); + blksz_t* blksz = va_arg( args, blksz_t* ); + bszid_t bm_id = va_arg( args, bszid_t ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; @@ -464,18 +466,21 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the address of the blksz_t object, and // - the bszid_t of the multiple we need to associate with // the blksz_t object. - // - the scalar we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes. - const bszid_t bs_id = va_arg( args, bszid_t ); - blksz_t* blksz = va_arg( args, blksz_t* ); - const bszid_t bm_id = va_arg( args, bszid_t ); - const dim_t scalr = va_arg( args, dim_t ); + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = va_arg( args, bszid_t ); + blksz_t* blksz = va_arg( args, blksz_t* ); + bszid_t bm_id = va_arg( args, bszid_t ); + double dsclr = va_arg( args, double ); + double msclr = va_arg( args, double ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; blkszs[ i ] = blksz; bmults[ i ] = bm_id; - scalrs[ i ] = scalr; + dsclrs[ i ] = dsclr; + msclrs[ i ] = msclr; } } @@ -510,12 +515,12 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blksz_t* pointer, blocksize // multiple id, and blocksize scalar. 
- const bszid_t bs_id = bszids[ i ]; - const bszid_t bm_id = bmults[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; - blksz_t* blksz = blkszs[ i ]; + blksz_t* blksz = blkszs[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Copy the blksz_t object contents into the appropriate // location within the context's blksz_t array. Do the same @@ -534,14 +539,15 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blksz_t pointer, blocksize // multiple id, and blocksize scalar. - const bszid_t bs_id = bszids[ i ]; - const bszid_t bm_id = bmults[ i ]; - const dim_t scalr = scalrs[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; + double dsclr = dsclrs[ i ]; + double msclr = msclrs[ i ]; - blksz_t* blksz = blkszs[ i ]; - blksz_t* bmult = blkszs[ i ]; + blksz_t* blksz = blkszs[ i ]; + blksz_t* bmult = blkszs[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Copy the real domain values of the source blksz_t object into // the context, duplicating into the complex domain fields. @@ -550,20 +556,50 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - // The next steps apply only to cache blocksizes, and not register - // blocksizes (ie: they only apply to blocksizes for which the - // blocksize multiple id is different than the blocksize id) and - // only when the scalar provided is non-unit. - if ( bs_id != bm_id && scalr != 1 ) + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Scale the complex domain values in the blocksize object. 
- bli_blksz_scale_dt_by( 1, scalr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_dt_by( 1, scalr, BLIS_DCOMPLEX, cntx_blksz ); + // Scale the complex domain default blocksize values in the + // blocksize object. + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - // Finally, round the newly-scaled blocksizes down to their - // respective multiples. - bli_blksz_reduce_dt_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_reduce_dt_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } + } + + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the complex domain maximum blocksize values in the + // blocksize object. + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); + + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). 
+ bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } } // Copy the blocksize multiple id into the context. @@ -575,7 +611,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_free_intl( blkszs ); bli_free_intl( bszids ); bli_free_intl( bmults ); - bli_free_intl( scalrs ); + bli_free_intl( dsclrs ); + bli_free_intl( msclrs ); } #endif diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 38bff6720..9c97c3312 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -488,13 +488,13 @@ void bli_cntx_print( cntx_t* cntx ); // pointer is NULL. When initializing, the context address that should // be used (local or external) is assigned to cntx_p. -#define bli_cntx_init_local_if( opname, cntx, cntx_p ) \ +#define bli_cntx_init_local_if( opname, dt, cntx, cntx_p ) \ \ cntx_t _cntx_l; \ \ if ( bli_is_null( cntx ) ) \ { \ - PASTEMAC(opname,_cntx_init)( &_cntx_l ); \ + PASTEMAC(opname,_cntx_init)( dt, &_cntx_l ); \ cntx_p = &_cntx_l; \ } \ else \ @@ -510,13 +510,13 @@ void bli_cntx_print( cntx_t* cntx ); } -#define bli_cntx_init_local_if2( opname, suf, cntx, cntx_p ) \ +#define bli_cntx_init_local_if2( opname, suf, dt, cntx, cntx_p ) \ \ cntx_t _cntx_l; \ \ if ( bli_is_null( cntx ) ) \ { \ - PASTEMAC2(opname,suf,_cntx_init)( &_cntx_l ); \ + PASTEMAC2(opname,suf,_cntx_init)( dt, &_cntx_l ); \ cntx_p = &_cntx_l; \ } \ else \ diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 7f3f897d5..32f99a832 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -94,48 +94,47 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { /* Example prototypes: - void - bli_gks_cntx_set_blkszs( + void bli_gks_cntx_set_blkszs + ( + ind_t method = BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, bszid_t bm0_id, + bszid_t bs1_id, bszid_t bm1_id, + bszid_t bs2_id, bszid_t bm2_id, + ... 
+ cntx_t* cntx + ); - ind_t method = BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, bszid_t bm0_id, - bszid_t bs1_id, bszid_t bm1_id, - bszid_t bs2_id, bszid_t bm2_id, - ... - cntx_t* cntx ); - - void - bli_gks_cntx_set_blkszs( - - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, bszid_t bm0_id, dim_t scalr0, - bszid_t bs1_id, bszid_t bm1_id, dim_t scalr1, - bszid_t bs2_id, bszid_t bm2_id, dim_t scalr2, - ... - cntx_t* cntx ); + void bli_gks_cntx_set_blkszs + ( + ind_t method != BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0, + bszid_t bs1_id, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1, + bszid_t bs2_id, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2, + ... + cntx_t* cntx + ); */ va_list args; dim_t i; bszid_t* bszids; bszid_t* bmults; - double* scalrs; + double* dsclrs; + double* msclrs; cntx_t* cntx; blksz_t* cntx_blkszs; bszid_t* cntx_bmults; - bszid_t bs_id; - bszid_t bm_id; - double scalr; // Allocate some temporary local arrays. bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); - scalrs = bli_malloc_intl( n_bs * sizeof( double ) ); + dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -152,8 +151,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the bszid_t of the blocksize we're about to process, // - the bszid_t of the multiple we need to associate with // the blksz_t object. - bs_id = va_arg( args, bszid_t ); - bm_id = va_arg( args, bszid_t ); + bszid_t bs_id = va_arg( args, bszid_t ); + bszid_t bm_id = va_arg( args, bszid_t ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; @@ -169,16 +168,19 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the bszid_t of the blocksize we're about to process, // - the bszid_t of the multiple we need to associate with // the blksz_t object. 
- // - the scalar we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes. - bs_id = va_arg( args, bszid_t ); - bm_id = va_arg( args, bszid_t ); - scalr = va_arg( args, double ); + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = va_arg( args, bszid_t ); + bszid_t bm_id = va_arg( args, bszid_t ); + double dsclr = va_arg( args, double ); + double msclr = va_arg( args, double ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; bmults[ i ] = bm_id; - scalrs[ i ] = scalr; + dsclrs[ i ] = dsclr; + msclrs[ i ] = msclr; } } @@ -210,10 +212,10 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) for ( i = 0; i < n_bs; ++i ) { // Read the current blocksize id, blocksize multiple id. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Query the blocksizes (blksz_t) associated with bs_id and save // them directly into the appropriate location in the context's @@ -231,41 +233,75 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blocksize multiple id, // and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - double scalr = scalrs[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; + double dsclr = dsclrs[ i ]; + double msclr = msclrs[ i ]; - blksz_t blksz; - blksz_t bmult; + blksz_t blksz_l; + blksz_t bmult_l; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* blksz = &blksz_l; + blksz_t* bmult = &bmult_l; + + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Query the blocksizes (blksz_t) associated with bs_id and bm_id // and use them to populate a pair of local blksz_t objects. 
- bli_gks_get_blksz( bs_id, &blksz ); - bli_gks_get_blksz( bm_id, &bmult ); + bli_gks_get_blksz( bs_id, blksz ); + bli_gks_get_blksz( bm_id, bmult ); // Copy the real domain values of the source blksz_t object into // the context, duplicating into the complex domain fields. - bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_FLOAT, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DOUBLE, cntx_blksz ); - bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz ); + bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - // The next steps apply only to cache blocksizes, and not register - // blocksizes (ie: they only apply to blocksizes for which the - // blocksize multiple id is different than the blocksize id) and - // only when the scalar provided is non-unit. - if ( bs_id != bm_id && scalr != 1.0 ) + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Scale the complex domain values in the blocksize object. - bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_DCOMPLEX, cntx_blksz ); + // Scale the complex domain default blocksize values in the + // blocksize object. + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - // Finally, round the newly-scaled blocksizes down to their - // respective multiples. 
- bli_blksz_reduce_dt_to( BLIS_FLOAT, &bmult, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_reduce_dt_to( BLIS_DOUBLE, &bmult, BLIS_DCOMPLEX, cntx_blksz ); + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } + } + + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the complex domain maximum blocksize values in the + // blocksize object. + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); + + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } } // Copy the blocksize multiple id into the context. @@ -276,7 +312,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // Free the temporary local arrays. 
bli_free_intl( bszids ); bli_free_intl( bmults ); - bli_free_intl( scalrs ); + bli_free_intl( dsclrs ); + bli_free_intl( msclrs ); } @@ -337,6 +374,18 @@ static func_t bli_gks_l3_ind_ukrs[BLIS_NUM_IND_METHODS] /* trsm_l */ { { NULL, BLIS_CTRSM4M1_L_UKERNEL, NULL, BLIS_ZTRSM4M1_L_UKERNEL, } }, /* trsm_u */ { { NULL, BLIS_CTRSM4M1_U_UKERNEL, NULL, BLIS_ZTRSM4M1_U_UKERNEL, } }, }, +/* 1m */ { +/* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM1M_UKERNEL, + BLIS_DGEMM_UKERNEL, BLIS_ZGEMM1M_UKERNEL, } }, +/* gemmtrsm_l */ { { NULL, BLIS_CGEMMTRSM1M_L_UKERNEL, + NULL, BLIS_ZGEMMTRSM1M_L_UKERNEL, } }, +/* gemmtrsm_u */ { { NULL, BLIS_CGEMMTRSM1M_U_UKERNEL, + NULL, BLIS_ZGEMMTRSM1M_U_UKERNEL, } }, +/* trsm_l */ { { NULL, BLIS_CTRSM1M_L_UKERNEL, + NULL, BLIS_ZTRSM1M_L_UKERNEL, } }, +/* trsm_u */ { { NULL, BLIS_CTRSM1M_U_UKERNEL, + NULL, BLIS_ZTRSM1M_U_UKERNEL, } }, + }, /* nat */ { /* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM_UKERNEL, BLIS_DGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL, } }, @@ -565,6 +614,8 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr, // -- packm structure-aware kernel structure ----------------------------------- // +// IF ENABLED: NEEDS UPDATING FOR 1M. + static func_t bli_gks_packm_struc_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { /* float (0) scomplex (1) double (2) dcomplex (3) */ diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index e66aafa63..06cbae587 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -61,8 +61,10 @@ void bli_memsys_init( void ) if ( bli_memsys_is_init == TRUE ) return; // Create and initialize a context for gemm so we have something - // to pass into bli_membrk_init_pools(). - bli_gemm_cntx_init( &cntx ); + // to pass into bli_membrk_init_pools(). We use BLIS_DOUBLE for + // the datatype, but the dt argument is actually only used when + // initializing contexts for induced methods. 
+ bli_gemm_cntx_init( BLIS_DOUBLE, &cntx ); #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (mem)" ) diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 50ddd5d1f..f0a208886 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -654,6 +654,19 @@ bli_is_io_packed( schema ) || \ bli_is_rpi_packed( schema ) ) +#define bli_is_1r_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ) + +#define bli_is_1e_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ) + +#define bli_is_1m_packed( schema ) \ +\ + ( bli_is_1r_packed( schema ) || \ + bli_is_1e_packed( schema ) ) + #define bli_is_nat_packed( schema ) \ \ ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ) diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 1069a40b4..de8dbf370 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -225,6 +225,43 @@ #include "bli_scal2jrpis.h" +// -- 1m-specific scalar macros -- + +#include "bli_invert1ms_mxn_diag.h" + +#include "bli_scal1ms_mxn.h" + +#include "bli_scal21ms_mxn_diag.h" +#include "bli_scal21ms_mxn_uplo.h" + +#include "bli_set1ms_mxn.h" +#include "bli_set1ms_mxn_diag.h" +#include "bli_set1ms_mxn_uplo.h" +#include "bli_seti01ms_mxn_diag.h" + +// 1e +#include "bli_copy1es.h" +#include "bli_copyj1es.h" + +#include "bli_invert1es.h" + +#include "bli_scal1es.h" + +#include "bli_scal21es.h" +#include "bli_scal2j1es.h" + +// 1r +#include "bli_copy1rs.h" +#include "bli_copyj1rs.h" + +#include "bli_invert1rs.h" + +#include "bli_scal1rs.h" + +#include "bli_scal21rs.h" +#include "bli_scal2j1rs.h" + + // -- Miscellaneous macros -- diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index d3548031c..c4cfd3514 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -224,6 +224,10 @@ typedef dcomplex f77_dcomplex; - 1 0110 
11: packed imag-only column panels - 1 0111 10: packed real+imag row panels - 1 0111 11: packed real+imag column panels + - 1 1000 10: packed by 1m expanded row panels + - 1 1000 11: packed by 1m expanded column panels + - 1 1001 10: packed by 1m reordered row panels + - 1 1001 11: packed by 1m reordered column panels 23 Packed panel order if upper-stored - 0 == forward order if upper - 1 == reverse order if upper @@ -329,6 +333,8 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) @@ -348,6 +354,10 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 @@ -469,13 +479,17 @@ typedef enum BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO, 
BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI, BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI, + BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, + BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R, } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the // schema pair for "4ms" (4m separated), because its bit value has // been reserved, even though we don't use it. -#define BLIS_NUM_PACK_SCHEMA_TYPES 8 +#define BLIS_NUM_PACK_SCHEMA_TYPES 10 // -- Pack order type -- @@ -575,6 +589,7 @@ typedef enum BLIS_4MH, BLIS_4M1B, BLIS_4M1A, + BLIS_1M, BLIS_NAT, } ind_t; diff --git a/frame/include/level0/1e/bli_copy1es.h b/frame/include/level0/1e/bli_copy1es.h new file mode 100644 index 000000000..22eec1565 --- /dev/null +++ b/frame/include/level0/1e/bli_copy1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPY1ES_H +#define BLIS_COPY1ES_H + +// copy1es + +#define bli_ccopy1es( a, bri, bir ) \ +{ \ + bli_ccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ +} + +#define bli_zcopy1es( a, bri, bir ) \ +{ \ + bli_zcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_copyj1es.h b/frame/include/level0/1e/bli_copyj1es.h new file mode 100644 index 000000000..14c401354 --- /dev/null +++ b/frame/include/level0/1e/bli_copyj1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYJ1ES_H +#define BLIS_COPYJ1ES_H + +// copyj1es: same shape as copy1es but with the sign of imag(a) flipped; the first bli_?copyris() call passes ( real(a), -imag(a) ) followed by the components of bri, the second passes ( imag(a), real(a) ) followed by the components of bir; bli_?copyris is defined elsewhere and presumably assigns its first two arguments to its last two (TODO confirm) + +#define bli_ccopyj1es( a, bri, bir ) \ +{ \ + bli_ccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ +} + +#define bli_zcopyj1es( a, bri, bir ) \ +{ \ + bli_zcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_invert1es.h b/frame/include/level0/1e/bli_invert1es.h new file mode 100644 index 000000000..2fe5c3f24 --- /dev/null +++ b/frame/include/level0/1e/bli_invert1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1ES_H +#define BLIS_INVERT1ES_H + +// invert1es: calls bli_?invertris() on the components of bri, then bli_?copyris() with ( real(bri), -imag(bri) ) as leading arguments and ( imag(bir), real(bir) ) as trailing arguments; note the trailing pair is deliberately in imag-then-real order, unlike the other 1e macros; both helpers are defined elsewhere, presumably an in-place inversion and a component-wise assignment (TODO confirm) + +#define bli_cinvert1es( bri, bir ) \ +{ \ + bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ +} + +#define bli_zinvert1es( bri, bir ) \ +{ \ + bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal1es.h b/frame/include/level0/1e/bli_scal1es.h new file mode 100644 index 000000000..46ee20a0d --- /dev/null +++ b/frame/include/level0/1e/bli_scal1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1ES_H +#define BLIS_SCAL1ES_H + +// scal1es: calls bli_?scalris() with the components of a and of yri, then bli_?copyris() with ( -imag(yri), real(yri) ) followed by the components of yir, so yir is derived from yri as it stands after the first call; both helpers are defined elsewhere, presumably an in-place scale and a component-wise assignment (TODO confirm) + +#define bli_cscal1es( a, yri, yir ) \ +{ \ + bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ + bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal1es( a, yri, yir ) \ +{ \ + bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal21es.h b/frame/include/level0/1e/bli_scal21es.h new file mode 100644 index 000000000..7e0a752bc --- /dev/null +++ b/frame/include/level0/1e/bli_scal21es.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL21ES_H +#define BLIS_SCAL21ES_H + +// scal21es: expands to two bli_?scal2ris() calls that share the components of a; the first passes ( real(x), imag(x) ) followed by the components of yri, the second passes ( -imag(x), real(x) ) followed by the components of yir; the sc and dz variants read a through bli_sreal/bli_simag and bli_dreal/bli_dimag respectively; bli_?scal2ris is defined elsewhere, presumably a scale-and-store on separated real/imag parts (TODO confirm) + +#define bli_cscal21es( a, x, yri, yir ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal21es( a, x, yri, yir ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#define bli_scscal21es( a, x, yri, yir ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_dzscal21es( a, x, yri, yir ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal2j1es.h b/frame/include/level0/1e/bli_scal2j1es.h new file mode 100644 index 000000000..b10004f61 --- /dev/null +++ b/frame/include/level0/1e/bli_scal2j1es.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL2J1ES_H +#define BLIS_SCAL2J1ES_H + +// scal2j1es: same shape as scal21es but with the sign of imag(x) flipped; the first bli_?scal2ris() call passes ( real(x), -imag(x) ) followed by the components of yri, the second passes ( imag(x), real(x) ) followed by the components of yir; both calls share the components of a, read through the real-domain accessors in the sc and dz variants; bli_?scal2ris is defined elsewhere (semantics presumed, TODO confirm) + +#define bli_cscal2j1es( a, x, yri, yir ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal2j1es( a, x, yri, yir ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#define bli_scscal2j1es( a, x, yri, yir ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_dzscal2j1es( a, x, yri, yir ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1m/bli_invert1ms_mxn_diag.h b/frame/include/level0/1m/bli_invert1ms_mxn_diag.h new file mode 100644 index 000000000..7abf891ef --- /dev/null +++ b/frame/include/level0/1m/bli_invert1ms_mxn_diag.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1MS_MXN_DIAG_H +#define BLIS_INVERT1MS_MXN_DIAG_H + +// invert1ms_mxn_diag: for i in [0, min(m,n)) apply bli_?invert1es (when bli_is_1e_packed( schema )) or bli_?invert1rs (any other schema) to the element of y at row offm+i, column offn+i; the 1e branch indexes y in complex units and takes its second operand ld_y/2 elements past the first; the other branch casts y to the real type, doubles cs_y when rs_y == 1 and rs_y otherwise, and takes its second operand ld_y real elements past the first; the macro declares the locals min_m_n and i, and expands its arguments textually + +#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + scomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately: the 1e branch addresses y in dcomplex units, every other schema in double units. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal1ms_mxn.h b/frame/include/level0/1m/bli_scal1ms_mxn.h new file mode 100644 index 000000000..a0a9c595f --- /dev/null +++ b/frame/include/level0/1m/bli_scal1ms_mxn.h @@ -0,0 +1,124 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1MS_MXN_H +#define BLIS_SCAL1MS_MXN_H + +// scal1ms_mxn: apply bli_?scal1es (when bli_is_1e_packed( schema )) or bli_?scal1rs (any other schema) with the scalar *(a) to every element of the m x n matrix y, iterating i over rows inside j over columns; the 1e branch indexes y in complex units and takes its second operand ld_y/2 elements past the first; the other branch casts y to the real type, doubles cs_y when rs_y == 1 and rs_y otherwise, and takes its second operand ld_y real elements past the first; the macro declares the locals i and j + +#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_ri = y; \ + scomplex* restrict y_ir = y + ld_y/2; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_cscal1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_r = y_cast; \ + float* restrict y_i = y_cast + ld_y; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_cscal1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_ri = y; \ + dcomplex* restrict y_ir = y + ld_y/2; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_zscal1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, + which steps in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_r = y_cast; \ + double* restrict y_i = y_cast + ld_y; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_zscal1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_diag.h b/frame/include/level0/1m/bli_scal21ms_mxn_diag.h new file mode 100644 index 000000000..a8975f731 --- /dev/null +++ b/frame/include/level0/1m/bli_scal21ms_mxn_diag.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL21MS_MXN_DIAG_H +#define BLIS_SCAL21MS_MXN_DIAG_H + +// scal21ms_mxn_diag: for i in [0, min(m,n)) invoke bli_scscal21es / bli_dzscal21es (when bli_is_1e_packed( schema )) or bli_scscal21rs / bli_dzscal21rs (any other schema) on diagonal element i of x and of y; note that the element of x is passed as the first argument and *(a) as the second; y is addressed without any offset, in complex units with the second operand ld_y/2 elements on for 1e, and in real units (non-unit stride doubled) with the second operand ld_y elements on otherwise + +#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y; \ + scomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal21es( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast; \ + float* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal21rs( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately; in both branches the element of x is passed ahead of *(a). */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y; \ + dcomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal21es( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast; \ + double* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal21rs( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h b/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h new file mode 100644 index 000000000..ccd5d4ef8 --- /dev/null +++ b/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h @@ -0,0 +1,296 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL21MS_MXN_UPLO_H +#define BLIS_SCAL21MS_MXN_UPLO_H + +// scal21ms_mxn_uplo: over an m x m region (only m is taken), visit the elements with i >= j when bli_is_lower( uplo ) and those with i <= j otherwise, diagonal included in both cases; for each one invoke the scal2j variant (bli_?scal2j1es / bli_?scal2j1rs) when bli_is_conj( conjx ) and the scal21 variant otherwise, passing *(a), the element of x at (i,j), and the two operands within y; y is addressed in complex units with the second operand ld_y/2 elements on when bli_is_1e_packed( schema ), and in real units (cs_y doubled when rs_y == 1, rs_y doubled otherwise) with the second operand ld_y elements on for any other schema + +#define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_ri = y; \ + scomplex* restrict y_ir = y + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_r = y_cast; \ + float* restrict y_i = y_cast + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + } \ +} + +#define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_ri = y; \ + dcomplex* restrict y_ir = y + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_r = y_cast; \ + double* restrict y_i = y_cast + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn.h b/frame/include/level0/1m/bli_set1ms_mxn.h new file mode 100644 index 000000000..9f701c919 --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn.h @@ -0,0 +1,164 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_H +#define BLIS_SET1MS_MXN_H + +// set1ms_mxn + +#define bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + inc_t offm_local = offm; \ + inc_t offn_local = offn; \ + dim_t m_local = m; \ + dim_t n_local = n; \ + inc_t rs_y1 = rs_y; \ + inc_t cs_y1 = cs_y; \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ + dim_t i, j; \ +\ + /* Optimization: The loops walk through y with unit stride if y is + column-stored. If y is row-stored, swap the dimensions and strides + to preserve unit stride movement. */ \ + if ( cs_y == 1 ) \ + { \ + bli_swap_incs( offm_local, offn_local ); \ + bli_swap_dims( m_local, n_local ); \ + bli_swap_incs( rs_y1, cs_y1 ); \ + bli_swap_incs( rs_y2, cs_y2 ); \ + } \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1; \ + scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1 + ld_y/2; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_off_ri + i*rs_y1 + j*cs_y1), \ + *(y_off_ir + i*rs_y1 + j*cs_y1) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2 + ld_y; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + j*cs_y2), \ + *(y_off_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#define bli_zset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + inc_t offm_local = offm; \ + inc_t offn_local = offn; \ + dim_t m_local = m; \ + dim_t n_local = n; \ + inc_t rs_y1 = rs_y; \ + inc_t cs_y1 = cs_y; \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ + dim_t i, j; \ +\ + /* Optimization: The loops walk through y with unit stride if y is + column-stored. If y is row-stored, swap the dimensions and strides + to preserve unit stride movement. */ \ + if ( cs_y == 1 ) \ + { \ + bli_swap_incs( offm_local, offn_local ); \ + bli_swap_dims( m_local, n_local ); \ + bli_swap_incs( rs_y1, cs_y1 ); \ + bli_swap_incs( rs_y2, cs_y2 ); \ + } \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1; \ + dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1 + ld_y/2; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_off_ri + i*rs_y1 + j*cs_y1), \ + *(y_off_ir + i*rs_y1 + j*cs_y1) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2 + ld_y; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + j*cs_y2), \ + *(y_off_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn_diag.h b/frame/include/level0/1m/bli_set1ms_mxn_diag.h new file mode 100644 index 000000000..63262dd18 --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn_diag.h @@ -0,0 +1,130 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_DIAG_H +#define BLIS_SET1MS_MXN_DIAG_H + +// set1ms_mxn_diag + +#define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + scomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn_uplo.h b/frame/include/level0/1m/bli_set1ms_mxn_uplo.h new file mode 100644 index 000000000..e89f9a34d --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn_uplo.h @@ -0,0 +1,198 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_UPLO_H +#define BLIS_SET1MS_MXN_UPLO_H + +// set1ms_mxn_uplo + +#define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + doff_t diagoff_abs = bli_abs( diagoff ); \ + inc_t offdiag_inc; \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + /* Set the off-diagonal increment. 
*/ \ + if ( diagoff > 0 ) offdiag_inc = cs_y; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \ +\ + scomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ + scomplex* restrict y_ri = y0; \ + scomplex* restrict y_ir = y0 + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + /* Set the off-diagonal increment. */ \ + if ( diagoff > 0 ) offdiag_inc = cs_y2; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \ +\ + float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ + float* restrict y_r = y0; \ + float* restrict y_i = y0 + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ +} + +#define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + doff_t diagoff_abs = bli_abs( diagoff ); \ + inc_t offdiag_inc; \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + /* Set the off-diagonal increment. */ \ + if ( diagoff > 0 ) offdiag_inc = cs_y; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \ +\ + dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ + dcomplex* restrict y_ri = y0; \ + dcomplex* restrict y_ir = y0 + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + /* Set the off-diagonal increment. 
*/ \ + if ( diagoff > 0 ) offdiag_inc = cs_y2; \ + else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \ +\ + double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ + double* restrict y_r = y0; \ + double* restrict y_i = y0 + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_seti01ms_mxn_diag.h b/frame/include/level0/1m/bli_seti01ms_mxn_diag.h new file mode 100644 index 000000000..39be51ca5 --- /dev/null +++ b/frame/include/level0/1m/bli_seti01ms_mxn_diag.h @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SETI01MS_MXN_DIAG_H +#define BLIS_SETI01MS_MXN_DIAG_H + +// seti01ms_mxn_diag + +#define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y; \ + scomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ + bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y; \ + dcomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ + bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1r/bli_copy1rs.h b/frame/include/level0/1r/bli_copy1rs.h new file mode 100644 index 000000000..d60cf9d86 --- /dev/null +++ b/frame/include/level0/1r/bli_copy1rs.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPY1RS_H +#define BLIS_COPY1RS_H + +// copy1rs + +#define bli_ccopy1rs( a, br, bi ) \ +{ \ + bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ +} + +#define bli_zcopy1rs( a, br, bi ) \ +{ \ + bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_copyj1rs.h b/frame/include/level0/1r/bli_copyj1rs.h new file mode 100644 index 000000000..8cc82f558 --- /dev/null +++ b/frame/include/level0/1r/bli_copyj1rs.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYJ1RS_H +#define BLIS_COPYJ1RS_H + +// copyj1rs + +#define bli_ccopyj1rs( a, br, bi ) \ +{ \ + bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ +} + +#define bli_zcopyj1rs( a, br, bi ) \ +{ \ + bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_invert1rs.h b/frame/include/level0/1r/bli_invert1rs.h new file mode 100644 index 000000000..3b3a6950c --- /dev/null +++ b/frame/include/level0/1r/bli_invert1rs.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1RS_H +#define BLIS_INVERT1RS_H + +// invert1rs + +#define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) +#define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) + +#endif diff --git a/frame/include/level0/1r/bli_scal1rs.h b/frame/include/level0/1r/bli_scal1rs.h new file mode 100644 index 000000000..ec65ab664 --- /dev/null +++ b/frame/include/level0/1r/bli_scal1rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1RS_H +#define BLIS_SCAL1RS_H + +// scal1rs + +#define bli_cscal1rs( a, yr, yi ) \ +{ \ + bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ +} + +#define bli_zscal1rs( a, yr, yi ) \ +{ \ + bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ +} + +#define bli_scscal1rs( a, yr, yi ) \ +{ \ + bli_scscalris( bli_sreal(a), bli_simag(a), yr, yi ); \ +} + +#define bli_dzscal1rs( a, yr, yi ) \ +{ \ + bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_scal21rs.h b/frame/include/level0/1r/bli_scal21rs.h new file mode 100644 index 000000000..44d4f083f --- /dev/null +++ b/frame/include/level0/1r/bli_scal21rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL21RS_H +#define BLIS_SCAL21RS_H + +// scal21rs + +#define bli_cscal21rs( a, x, yr, yi ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_zscal21rs( a, x, yr, yi ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#define bli_scscal21rs( a, x, yr, yi ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_dzscal21rs( a, x, yr, yi ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_scal2j1rs.h b/frame/include/level0/1r/bli_scal2j1rs.h new file mode 100644 index 000000000..6a356932f --- /dev/null +++ b/frame/include/level0/1r/bli_scal2j1rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2J1RS_H +#define BLIS_SCAL2J1RS_H + +// scal2j1rs + +#define bli_cscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_zscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#define bli_scscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_scscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_dzscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_dzscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#endif + diff --git a/frame/ind/bli_ind.c b/frame/ind/bli_ind.c index e715b2aad..f0aec685b 100644 --- a/frame/ind/bli_ind.c +++ b/frame/ind/bli_ind.c @@ -45,6 +45,7 @@ static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = /* 4mh */ "4mh", /* 4m1b */ "4m1b", /* 4m1a */ "4m1a", +/* 1m */ "1m", /* nat */ "native", }; @@ -56,10 +57,12 @@ void bli_ind_init( void ) if ( bli_ind_is_initialized() ) return; #ifdef BLIS_ENABLE_INDUCED_SCOMPLEX - bli_ind_enable_dt( BLIS_4M1A, BLIS_SCOMPLEX ); + //bli_ind_enable_dt( BLIS_4M1A, BLIS_SCOMPLEX ); + bli_ind_enable_dt( BLIS_1M, BLIS_SCOMPLEX ); #endif #ifdef BLIS_ENABLE_INDUCED_DCOMPLEX - bli_ind_enable_dt( BLIS_4M1A, BLIS_DCOMPLEX ); + //bli_ind_enable_dt( BLIS_4M1A, BLIS_DCOMPLEX ); + bli_ind_enable_dt( BLIS_1M, BLIS_DCOMPLEX ); #endif // Mark API as initialized. 
diff --git a/frame/ind/bli_ind.h b/frame/ind/bli_ind.h index b34941d91..e0ceb383b 100644 --- a/frame/ind/bli_ind.h +++ b/frame/ind/bli_ind.h @@ -44,6 +44,9 @@ // level-3 typed APIs #include "bli_l3_ind_tapi.h" +// level-3 misc. optimizations +#include "bli_l3_ind_opt.h" + // level-3 cntx initialization #include "bli_gemmind_cntx.h" #include "bli_trsmind_cntx.h" diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index e2d1a0f86..e694f5384 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -51,6 +51,8 @@ static void* bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = NULL, NULL, NULL, NULL, NULL }, /* 4m1 */ { bli_gemm4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1, bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 }, +/* 1m */ { bli_gemm1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m, + bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m }, /* nat */ { bli_gemmnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat, bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat }, }; @@ -76,6 +78,8 @@ static bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, +/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, + {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} }, }; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index a484cf1a1..ce40bb105 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -34,23 +34,35 @@ #include "blis.h" -typedef void (*cntx_ft)( 
cntx_t* cntx ); +typedef void (*cntx_init_ft)( num_t dt, cntx_t* cntx ); +typedef void (*cntx_finalize_ft)( cntx_t* cntx ); -static void* bli_gemmind_cntx_fp[BLIS_NUM_IND_METHODS][2] = +static void* bli_gemmind_cntx_init_fp[BLIS_NUM_IND_METHODS] = { - /* _cntx_init _cntx_finalize */ -/* 3mh */ { bli_gemm3mh_cntx_init, bli_gemm3mh_cntx_finalize }, -/* 3m3 */ { bli_gemm3m3_cntx_init, bli_gemm3m3_cntx_finalize }, -/* 3m2 */ { bli_gemm3m2_cntx_init, bli_gemm3m2_cntx_finalize }, -/* 3m1 */ { bli_gemm3m1_cntx_init, bli_gemm3m1_cntx_finalize }, -/* 4mh */ { bli_gemm4mh_cntx_init, bli_gemm4mh_cntx_finalize }, -/* 4mb */ { bli_gemm4mb_cntx_init, bli_gemm4mb_cntx_finalize }, -/* 4m1 */ { bli_gemm4m1_cntx_init, bli_gemm4m1_cntx_finalize }, -/* nat */ { bli_gemmnat_cntx_init, bli_gemmnat_cntx_finalize } +/* 3mh */ bli_gemm3mh_cntx_init, +/* 3m3 */ bli_gemm3m3_cntx_init, +/* 3m2 */ bli_gemm3m2_cntx_init, +/* 3m1 */ bli_gemm3m1_cntx_init, +/* 4mh */ bli_gemm4mh_cntx_init, +/* 4mb */ bli_gemm4mb_cntx_init, +/* 4m1 */ bli_gemm4m1_cntx_init, +/* 1m */ bli_gemm1m_cntx_init, +/* nat */ bli_gemmnat_cntx_init +}; + +static void* bli_gemmind_cntx_finalize_fp[BLIS_NUM_IND_METHODS] = +{ +/* 3mh */ bli_gemm3mh_cntx_finalize, +/* 3m3 */ bli_gemm3m3_cntx_finalize, +/* 3m2 */ bli_gemm3m2_cntx_finalize, +/* 3m1 */ bli_gemm3m1_cntx_finalize, +/* 4mh */ bli_gemm4mh_cntx_finalize, +/* 4mb */ bli_gemm4mb_cntx_finalize, +/* 4m1 */ bli_gemm4m1_cntx_finalize, +/* 1m */ bli_gemm1m_cntx_finalize, +/* nat */ bli_gemmnat_cntx_finalize }; -#define BLIS_CNTX_INIT_INDEX 0 -#define BLIS_CNTX_FINALIZE_INDEX 1 // ----------------------------------------------------------------------------- @@ -62,7 +74,7 @@ void bli_gemmind_cntx_init_avail( num_t dt, cntx_t* cntx ) { ind_t method = bli_ind_oper_find_avail( BLIS_GEMM, dt ); - bli_gemmind_cntx_init( method, cntx ); + bli_gemmind_cntx_init( method, dt, cntx ); } void bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ) @@ -77,16 +89,16 @@ void 
bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ) // Execute the context initialization/finalization function associated // with a given induced method. -void bli_gemmind_cntx_init( ind_t method, cntx_t* cntx ) +void bli_gemmind_cntx_init( ind_t method, num_t dt, cntx_t* cntx ) { - cntx_ft func = bli_gemmind_cntx_init_get_func( method ); + cntx_init_ft func = bli_gemmind_cntx_init_get_func( method ); - func( cntx ); + func( dt, cntx ); } void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ) { - cntx_ft func = bli_gemmind_cntx_finalize_get_func( method ); + cntx_finalize_ft func = bli_gemmind_cntx_finalize_get_func( method ); func( cntx ); } @@ -95,17 +107,17 @@ void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ) void* bli_gemmind_cntx_init_get_func( ind_t method ) { - return bli_gemmind_cntx_fp[ method ][ BLIS_CNTX_INIT_INDEX ]; + return bli_gemmind_cntx_init_fp[ method ]; } void* bli_gemmind_cntx_finalize_get_func( ind_t method ) { - return bli_gemmind_cntx_fp[ method ][ BLIS_CNTX_FINALIZE_INDEX ]; + return bli_gemmind_cntx_finalize_fp[ method ]; } // ----------------------------------------------------------------------------- -void bli_gemm3m1_cntx_init( cntx_t* cntx ) +void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M1; @@ -122,18 +134,21 @@ void bli_gemm3m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 3.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 3.0, 3.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, @@ -151,7 +166,7 @@ void bli_gemm3m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3m2_cntx_init( cntx_t* cntx ) +void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M2; @@ -168,18 +183,21 @@ void bli_gemm3m2_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 3.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 3.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 3.0, 3.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 3.0, 3.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. 
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS, @@ -197,7 +215,7 @@ void bli_gemm3m2_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3m3_cntx_init( cntx_t* cntx ) +void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M3; @@ -214,18 +232,21 @@ void bli_gemm3m3_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 3.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 3.0, 3.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() @@ -256,7 +277,7 @@ void bli_gemm3m3_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3mh_cntx_init( cntx_t* cntx ) +void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3MH; @@ -273,18 +294,21 @@ void bli_gemm3mh_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() @@ -318,7 +342,7 @@ void bli_gemm3mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4m1_cntx_init( cntx_t* cntx ) +void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1A; @@ -335,18 +359,21 @@ void bli_gemm4m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 2.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. 
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, @@ -364,7 +391,7 @@ void bli_gemm4m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4mb_cntx_init( cntx_t* cntx ) +void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1B; @@ -381,18 +408,21 @@ void bli_gemm4mb_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 2.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 2.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 2.0, 2.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, @@ -410,7 +440,7 @@ void bli_gemm4mb_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4mh_cntx_init( cntx_t* cntx ) +void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4MH; @@ -427,18 +457,21 @@ void bli_gemm4mh_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() @@ -477,9 +510,82 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemmnat_cntx_init( cntx_t* cntx ) +void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) { - bli_gemm_cntx_init( cntx ); + const ind_t method = BLIS_1M; + + // Clear the context fields. + bli_cntx_obj_clear( cntx ); + + // Initialize the context with the current architecture's native + // level-3 gemm micro-kernel, and its output preferences. + bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); + + // Initialize the context with the virtual micro-kernel associated with + // the current induced method. + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); + + // Initialize the context with packm-related kernels. + bli_packm_cntx_init( dt, cntx ); + + if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 2.0, 2.0, // halve mc... 
+ BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1R, + cntx ); + } + else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, // halve nc... + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1E, + cntx ); + } +} + +void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ) +{ +} + +void bli_gemm1m_cntx_finalize( cntx_t* cntx ) +{ +} + +// ----------------------------------------------------------------------------- + +void bli_gemmnat_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm_cntx_init( dt, cntx ); } void bli_gemmnat_cntx_stage( dim_t stage, cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_gemmind_cntx.h b/frame/ind/cntx/bli_gemmind_cntx.h index c70da7b36..f49744c3f 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.h +++ b/frame/ind/cntx/bli_gemmind_cntx.h @@ -32,67 +32,48 @@ */ -#if 0 -// -// Generate prototypes for _cntx_init(), _cntx_stage(), and _cntx_finalize() -// for each induced method (including native execution) based on gemm. 
-// - -#undef GENPROT -#define GENPROT( opname, imeth ) \ -\ -void PASTEMAC2(opname,imeth,_cntx_init)( void ); \ -void PASTEMAC2(opname,imeth,_cntx_stage)( dim_t stage, cntx_t* cntx ); \ -void PASTEMAC2(opname,imeth,_cntx_finalize)( void ); - -GENPROT( gemm, nat ) -GENPROT( gemm, 3mh ) -GENPROT( gemm, 3m3 ) -GENPROT( gemm, 3m2 ) -GENPROT( gemm, 3m1 ) -GENPROT( gemm, 4mh ) -GENPROT( gemm, 4mb ) -GENPROT( gemm, 4m1 ) -#endif - -void bli_gemmnat_cntx_init( cntx_t* cntx ); +void bli_gemmnat_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemmnat_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemmnat_cntx_finalize( cntx_t* cntx ); -void bli_gemm3mh_cntx_init( cntx_t* cntx ); +void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3mh_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m3_cntx_init( cntx_t* cntx ); +void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m3_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m2_cntx_init( cntx_t* cntx ); +void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m2_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m1_cntx_init( cntx_t* cntx ); +void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m1_cntx_finalize( cntx_t* cntx ); -void bli_gemm4mh_cntx_init( cntx_t* cntx ); +void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4mh_cntx_finalize( cntx_t* cntx ); -void bli_gemm4mb_cntx_init( cntx_t* cntx ); +void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4mb_cntx_finalize( cntx_t* cntx ); -void bli_gemm4m1_cntx_init( cntx_t* cntx ); +void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4m1_cntx_stage( 
dim_t stage, cntx_t* cntx ); void bli_gemm4m1_cntx_finalize( cntx_t* cntx ); +void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ); +void bli_gemm1m_cntx_finalize( cntx_t* cntx ); + // ----------------------------------------------------------------------------- void bli_gemmind_cntx_init_avail( num_t dt, cntx_t* cntx ); void bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ); -void bli_gemmind_cntx_init( ind_t method, cntx_t* cntx ); +void bli_gemmind_cntx_init( ind_t method, num_t dt, cntx_t* cntx ); void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ); void* bli_gemmind_cntx_init_get_func( ind_t method ); diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 85212ba90..4cb0bf6ba 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -36,7 +36,7 @@ // ----------------------------------------------------------------------------- -void bli_trsm3m1_cntx_init( cntx_t* cntx ) +void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M1; @@ -57,18 +57,21 @@ void bli_trsm3m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 3.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 3.0, 3.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for native execution. 
bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, @@ -82,7 +85,7 @@ void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsm4m1_cntx_init( cntx_t* cntx ) +void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1A; @@ -103,18 +106,21 @@ void bli_trsm4m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 2.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for native execution. bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, @@ -128,9 +134,86 @@ void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsmnat_cntx_init( cntx_t* cntx ) +void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) { - bli_trsm_cntx_init( cntx ); + const ind_t method = BLIS_1M; + + // Clear the context fields. + bli_cntx_obj_clear( cntx ); + + // Initialize the context with the current architecture's native + // level-3 gemm micro-kernel, and its output preferences. + bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); + + // Initialize the context with the virtual micro-kernels associated with + // the current induced method. 
+ bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMMTRSM_L_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMMTRSM_U_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_L_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); + + // Initialize the context with packm-related kernels. + bli_packm_cntx_init( dt, cntx ); + + if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 2.0, 2.0, // halve mc... + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1R, + cntx ); + } + else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, // halve nc... + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. 
+ bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1E, + cntx ); + } +} + +void bli_trsm1m_cntx_stage( dim_t stage, cntx_t* cntx ) +{ +} + +void bli_trsm1m_cntx_finalize( cntx_t* cntx ) +{ +} + +// ----------------------------------------------------------------------------- + +void bli_trsmnat_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_trsm_cntx_init( dt, cntx ); } void bli_trsmnat_cntx_finalize( cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_trsmind_cntx.h b/frame/ind/cntx/bli_trsmind_cntx.h index 3d3c883f9..49f7f0600 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.h +++ b/frame/ind/cntx/bli_trsmind_cntx.h @@ -32,29 +32,15 @@ */ -/* -// -// Generate prototypes for _cntx_init(), _cntx_stage(), and _cntx_finalize() -// for each induced method (including native execution) based on trsm. -// - -#undef GENPROT -#define GENPROT( opname, imeth ) \ -\ -void PASTEMAC2(opname,imeth,_cntx_init)( void ); \ -void PASTEMAC2(opname,imeth,_cntx_finalize)( void ); - -GENPROT( trsm, nat ) -GENPROT( trsm, 3m1 ) -GENPROT( trsm, 4m1 ) -*/ - -void bli_trsmnat_cntx_init( cntx_t* cntx ); +void bli_trsmnat_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsmnat_cntx_finalize( cntx_t* cntx ); -void bli_trsm4m1_cntx_init( cntx_t* cntx ); +void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsm4m1_cntx_finalize( cntx_t* cntx ); -void bli_trsm3m1_cntx_init( cntx_t* cntx ); +void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsm3m1_cntx_finalize( cntx_t* cntx ); +void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_trsm1m_cntx_finalize( cntx_t* cntx ); + diff --git a/frame/ind/include/bli_kernel_1m_macro_defs.h b/frame/ind/include/bli_kernel_1m_macro_defs.h new file mode 100644 index 000000000..4fc0ccb06 --- /dev/null +++ b/frame/ind/include/bli_kernel_1m_macro_defs.h @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_1M_MACRO_DEFS_H +#define BLIS_KERNEL_1M_MACRO_DEFS_H + + +// -- Define row access bools -------------------------------------------------- + +// gemm1m micro-kernels + +#define BLIS_CGEMM1M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM1M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + + +// -- Define default 1m-specific kernel names ---------------------------------- + +// +// Level-3 +// + +// gemm1m micro-kernels + +#ifndef BLIS_CGEMM1M_UKERNEL +#define BLIS_CGEMM1M_UKERNEL BLIS_CGEMM1M_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM1M_UKERNEL +#define BLIS_ZGEMM1M_UKERNEL BLIS_ZGEMM1M_UKERNEL_REF +#endif + +// gemmtrsm1m_l micro-kernels + +#ifndef BLIS_CGEMMTRSM1M_L_UKERNEL +#define BLIS_CGEMMTRSM1M_L_UKERNEL BLIS_CGEMMTRSM1M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM1M_L_UKERNEL +#define BLIS_ZGEMMTRSM1M_L_UKERNEL BLIS_ZGEMMTRSM1M_L_UKERNEL_REF +#endif + +// gemmtrsm1m_u micro-kernels + +#ifndef BLIS_CGEMMTRSM1M_U_UKERNEL +#define BLIS_CGEMMTRSM1M_U_UKERNEL BLIS_CGEMMTRSM1M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM1M_U_UKERNEL +#define BLIS_ZGEMMTRSM1M_U_UKERNEL BLIS_ZGEMMTRSM1M_U_UKERNEL_REF +#endif + +// trsm1m_l micro-kernels + +#ifndef BLIS_CTRSM1M_L_UKERNEL +#define BLIS_CTRSM1M_L_UKERNEL BLIS_CTRSM1M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM1M_L_UKERNEL +#define BLIS_ZTRSM1M_L_UKERNEL BLIS_ZTRSM1M_L_UKERNEL_REF +#endif + +// trsm1m_u micro-kernels + +#ifndef BLIS_CTRSM1M_U_UKERNEL +#define BLIS_CTRSM1M_U_UKERNEL BLIS_CTRSM1M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM1M_U_UKERNEL +#define BLIS_ZTRSM1M_U_UKERNEL BLIS_ZTRSM1M_U_UKERNEL_REF +#endif + + + +#endif diff --git a/frame/ind/include/bli_kernel_ind_macro_defs.h b/frame/ind/include/bli_kernel_ind_macro_defs.h index 7f43857f0..55eeb010b 100644 --- a/frame/ind/include/bli_kernel_ind_macro_defs.h +++ b/frame/ind/include/bli_kernel_ind_macro_defs.h @@ -41,9 +41,11 @@ #include
"bli_kernel_4mh_macro_defs.h" #include "bli_kernel_4mb_macro_defs.h" #include "bli_kernel_4m1_macro_defs.h" +#include "bli_kernel_1m_macro_defs.h" // Storage format headers #include "bli_packm_3mis_macro_defs.h" #include "bli_packm_4mi_macro_defs.h" #include "bli_packm_rih_macro_defs.h" +#include "bli_packm_1er_macro_defs.h" diff --git a/frame/ind/include/bli_kernel_ind_pre_macro_defs.h b/frame/ind/include/bli_kernel_ind_pre_macro_defs.h index b6020489e..47fbb4a28 100644 --- a/frame/ind/include/bli_kernel_ind_pre_macro_defs.h +++ b/frame/ind/include/bli_kernel_ind_pre_macro_defs.h @@ -140,6 +140,35 @@ #define BLIS_CTRSM4M1_U_UKERNEL_REF bli_ctrsm4m1_u_ukr_ref #define BLIS_ZTRSM4M1_U_UKERNEL_REF bli_ztrsm4m1_u_ukr_ref +// +// Level-3 1m +// + +// gemm1m micro-kernels + +#define BLIS_CGEMM1M_UKERNEL_REF bli_cgemm1m_ukr_ref +#define BLIS_ZGEMM1M_UKERNEL_REF bli_zgemm1m_ukr_ref + +// gemmtrsm1m_l micro-kernels + +#define BLIS_CGEMMTRSM1M_L_UKERNEL_REF bli_cgemmtrsm1m_l_ukr_ref +#define BLIS_ZGEMMTRSM1M_L_UKERNEL_REF bli_zgemmtrsm1m_l_ukr_ref + +// gemmtrsm1m_u micro-kernels + +#define BLIS_CGEMMTRSM1M_U_UKERNEL_REF bli_cgemmtrsm1m_u_ukr_ref +#define BLIS_ZGEMMTRSM1M_U_UKERNEL_REF bli_zgemmtrsm1m_u_ukr_ref + +// trsm1m_l micro-kernels + +#define BLIS_CTRSM1M_L_UKERNEL_REF bli_ctrsm1m_l_ukr_ref +#define BLIS_ZTRSM1M_L_UKERNEL_REF bli_ztrsm1m_l_ukr_ref + +// trsm1m_u micro-kernels + +#define BLIS_CTRSM1M_U_UKERNEL_REF bli_ctrsm1m_u_ukr_ref +#define BLIS_ZTRSM1M_U_UKERNEL_REF bli_ztrsm1m_u_ukr_ref + #endif diff --git a/frame/ind/include/bli_packm_1er_macro_defs.h b/frame/ind/include/bli_packm_1er_macro_defs.h new file mode 100644 index 000000000..fe550d1c5 --- /dev/null +++ b/frame/ind/include/bli_packm_1er_macro_defs.h @@ -0,0 +1,241 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_1ER_MACRO_DEFS_H +#define BLIS_KERNEL_1ER_MACRO_DEFS_H + + +// -- Define default 1e/1r-specific kernel names ------------------------------- + +// +// 1e +// + +// packm_2xk_1e kernels + +#ifndef BLIS_CPACKM_2XK_1E_KERNEL +#define BLIS_CPACKM_2XK_1E_KERNEL BLIS_CPACKM_2XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_1E_KERNEL +#define BLIS_ZPACKM_2XK_1E_KERNEL BLIS_ZPACKM_2XK_1E_KERNEL_REF +#endif + +// packm_4xk_1e kernels + +#ifndef BLIS_CPACKM_4XK_1E_KERNEL +#define BLIS_CPACKM_4XK_1E_KERNEL BLIS_CPACKM_4XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_1E_KERNEL +#define BLIS_ZPACKM_4XK_1E_KERNEL BLIS_ZPACKM_4XK_1E_KERNEL_REF +#endif + +// packm_6xk_1e kernels + +#ifndef BLIS_CPACKM_6XK_1E_KERNEL +#define BLIS_CPACKM_6XK_1E_KERNEL BLIS_CPACKM_6XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_1E_KERNEL +#define BLIS_ZPACKM_6XK_1E_KERNEL BLIS_ZPACKM_6XK_1E_KERNEL_REF +#endif + +// packm_8xk_1e kernels + +#ifndef BLIS_CPACKM_8XK_1E_KERNEL +#define BLIS_CPACKM_8XK_1E_KERNEL BLIS_CPACKM_8XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_1E_KERNEL +#define BLIS_ZPACKM_8XK_1E_KERNEL BLIS_ZPACKM_8XK_1E_KERNEL_REF +#endif + +// packm_10xk_1e kernels + +#ifndef BLIS_CPACKM_10XK_1E_KERNEL +#define BLIS_CPACKM_10XK_1E_KERNEL BLIS_CPACKM_10XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_1E_KERNEL +#define BLIS_ZPACKM_10XK_1E_KERNEL BLIS_ZPACKM_10XK_1E_KERNEL_REF +#endif + +// packm_12xk_1e kernels + +#ifndef BLIS_CPACKM_12XK_1E_KERNEL +#define BLIS_CPACKM_12XK_1E_KERNEL BLIS_CPACKM_12XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_1E_KERNEL +#define BLIS_ZPACKM_12XK_1E_KERNEL BLIS_ZPACKM_12XK_1E_KERNEL_REF +#endif + +// packm_14xk_1e kernels + +#ifndef BLIS_CPACKM_14XK_1E_KERNEL +#define BLIS_CPACKM_14XK_1E_KERNEL BLIS_CPACKM_14XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_1E_KERNEL +#define BLIS_ZPACKM_14XK_1E_KERNEL BLIS_ZPACKM_14XK_1E_KERNEL_REF +#endif + +// packm_16xk_1e kernels + +#ifndef 
BLIS_CPACKM_16XK_1E_KERNEL +#define BLIS_CPACKM_16XK_1E_KERNEL BLIS_CPACKM_16XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_1E_KERNEL +#define BLIS_ZPACKM_16XK_1E_KERNEL BLIS_ZPACKM_16XK_1E_KERNEL_REF +#endif + +// packm_30xk_1e kernels + +#ifndef BLIS_CPACKM_30XK_1E_KERNEL +#define BLIS_CPACKM_30XK_1E_KERNEL BLIS_CPACKM_30XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_30XK_1E_KERNEL +#define BLIS_ZPACKM_30XK_1E_KERNEL BLIS_ZPACKM_30XK_1E_KERNEL_REF +#endif + +// +// 1r +// + +// packm_2xk_1r kernels + +#ifndef BLIS_CPACKM_2XK_1R_KERNEL +#define BLIS_CPACKM_2XK_1R_KERNEL BLIS_CPACKM_2XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_1R_KERNEL +#define BLIS_ZPACKM_2XK_1R_KERNEL BLIS_ZPACKM_2XK_1R_KERNEL_REF +#endif + +// packm_3xk_1r kernels + +#ifndef BLIS_CPACKM_3XK_1R_KERNEL +#define BLIS_CPACKM_3XK_1R_KERNEL BLIS_CPACKM_3XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_3XK_1R_KERNEL +#define BLIS_ZPACKM_3XK_1R_KERNEL BLIS_ZPACKM_3XK_1R_KERNEL_REF +#endif + +// packm_4xk_1r kernels + +#ifndef BLIS_CPACKM_4XK_1R_KERNEL +#define BLIS_CPACKM_4XK_1R_KERNEL BLIS_CPACKM_4XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_1R_KERNEL +#define BLIS_ZPACKM_4XK_1R_KERNEL BLIS_ZPACKM_4XK_1R_KERNEL_REF +#endif + +// packm_6xk_1r kernels + +#ifndef BLIS_CPACKM_6XK_1R_KERNEL +#define BLIS_CPACKM_6XK_1R_KERNEL BLIS_CPACKM_6XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_1R_KERNEL +#define BLIS_ZPACKM_6XK_1R_KERNEL BLIS_ZPACKM_6XK_1R_KERNEL_REF +#endif + +// packm_8xk_1r kernels + +#ifndef BLIS_CPACKM_8XK_1R_KERNEL +#define BLIS_CPACKM_8XK_1R_KERNEL BLIS_CPACKM_8XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_1R_KERNEL +#define BLIS_ZPACKM_8XK_1R_KERNEL BLIS_ZPACKM_8XK_1R_KERNEL_REF +#endif + +// packm_10xk_1r kernels + +#ifndef BLIS_CPACKM_10XK_1R_KERNEL +#define BLIS_CPACKM_10XK_1R_KERNEL BLIS_CPACKM_10XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_1R_KERNEL +#define BLIS_ZPACKM_10XK_1R_KERNEL BLIS_ZPACKM_10XK_1R_KERNEL_REF +#endif + +// packm_12xk_1r 
kernels + +#ifndef BLIS_CPACKM_12XK_1R_KERNEL +#define BLIS_CPACKM_12XK_1R_KERNEL BLIS_CPACKM_12XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_1R_KERNEL +#define BLIS_ZPACKM_12XK_1R_KERNEL BLIS_ZPACKM_12XK_1R_KERNEL_REF +#endif + +// packm_14xk_1r kernels + +#ifndef BLIS_CPACKM_14XK_1R_KERNEL +#define BLIS_CPACKM_14XK_1R_KERNEL BLIS_CPACKM_14XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_1R_KERNEL +#define BLIS_ZPACKM_14XK_1R_KERNEL BLIS_ZPACKM_14XK_1R_KERNEL_REF +#endif + +// packm_16xk_1r kernels + +#ifndef BLIS_CPACKM_16XK_1R_KERNEL +#define BLIS_CPACKM_16XK_1R_KERNEL BLIS_CPACKM_16XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_1R_KERNEL +#define BLIS_ZPACKM_16XK_1R_KERNEL BLIS_ZPACKM_16XK_1R_KERNEL_REF +#endif + +// packm_30xk_1r kernels + +#ifndef BLIS_CPACKM_30XK_1R_KERNEL +#define BLIS_CPACKM_30XK_1R_KERNEL BLIS_CPACKM_30XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_30XK_1R_KERNEL +#define BLIS_ZPACKM_30XK_1R_KERNEL BLIS_ZPACKM_30XK_1R_KERNEL_REF +#endif + + + +#endif diff --git a/frame/ind/include/bli_packm_3mis_macro_defs.h b/frame/ind/include/bli_packm_3mis_macro_defs.h index 3abe40218..654172467 100644 --- a/frame/ind/include/bli_packm_3mis_macro_defs.h +++ b/frame/ind/include/bli_packm_3mis_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default 3mis-specific kernel names -------------------------------- -// -// Level-1m -// // packm_2xk_3mis kernels diff --git a/frame/ind/include/bli_packm_4mi_macro_defs.h b/frame/ind/include/bli_packm_4mi_macro_defs.h index 2f36de349..f5a617737 100644 --- a/frame/ind/include/bli_packm_4mi_macro_defs.h +++ b/frame/ind/include/bli_packm_4mi_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default 4mi-specific kernel names --------------------------------- -// -// Level-1m -// // packm_2xk_4mi kernels diff --git a/frame/ind/include/bli_packm_ind_pre_macro_defs.h b/frame/ind/include/bli_packm_ind_pre_macro_defs.h index ee5070e49..1bec1c5fd 100644 --- a/frame/ind/include/bli_packm_ind_pre_macro_defs.h +++ 
b/frame/ind/include/bli_packm_ind_pre_macro_defs.h @@ -177,5 +177,102 @@ +// packm_2xk_1e kernels + +#define BLIS_CPACKM_2XK_1E_KERNEL_REF bli_cpackm_2xk_1e_ref +#define BLIS_ZPACKM_2XK_1E_KERNEL_REF bli_zpackm_2xk_1e_ref + +// packm_4xk_1e kernels + +#define BLIS_CPACKM_4XK_1E_KERNEL_REF bli_cpackm_4xk_1e_ref +#define BLIS_ZPACKM_4XK_1E_KERNEL_REF bli_zpackm_4xk_1e_ref + +// packm_6xk_1e kernels + +#define BLIS_CPACKM_6XK_1E_KERNEL_REF bli_cpackm_6xk_1e_ref +#define BLIS_ZPACKM_6XK_1E_KERNEL_REF bli_zpackm_6xk_1e_ref + +// packm_8xk_1e kernels + +#define BLIS_CPACKM_8XK_1E_KERNEL_REF bli_cpackm_8xk_1e_ref +#define BLIS_ZPACKM_8XK_1E_KERNEL_REF bli_zpackm_8xk_1e_ref + +// packm_10xk_1e kernels + +#define BLIS_CPACKM_10XK_1E_KERNEL_REF bli_cpackm_10xk_1e_ref +#define BLIS_ZPACKM_10XK_1E_KERNEL_REF bli_zpackm_10xk_1e_ref + +// packm_12xk_1e kernels + +#define BLIS_CPACKM_12XK_1E_KERNEL_REF bli_cpackm_12xk_1e_ref +#define BLIS_ZPACKM_12XK_1E_KERNEL_REF bli_zpackm_12xk_1e_ref + +// packm_14xk_1e kernels + +#define BLIS_CPACKM_14XK_1E_KERNEL_REF bli_cpackm_14xk_1e_ref +#define BLIS_ZPACKM_14XK_1E_KERNEL_REF bli_zpackm_14xk_1e_ref + +// packm_16xk_1e kernels + +#define BLIS_CPACKM_16XK_1E_KERNEL_REF bli_cpackm_16xk_1e_ref +#define BLIS_ZPACKM_16XK_1E_KERNEL_REF bli_zpackm_16xk_1e_ref + +// packm_30xk_1e kernels + +#define BLIS_CPACKM_30XK_1E_KERNEL_REF bli_cpackm_30xk_1e_ref +#define BLIS_ZPACKM_30XK_1E_KERNEL_REF bli_zpackm_30xk_1e_ref + +// packm_2xk_1r kernels + +#define BLIS_CPACKM_2XK_1R_KERNEL_REF bli_cpackm_2xk_1r_ref +#define BLIS_ZPACKM_2XK_1R_KERNEL_REF bli_zpackm_2xk_1r_ref + +// packm_3xk_1r kernels + +#define BLIS_CPACKM_3XK_1R_KERNEL_REF bli_cpackm_3xk_1r_ref +#define BLIS_ZPACKM_3XK_1R_KERNEL_REF bli_zpackm_3xk_1r_ref + +// packm_4xk_1r kernels + +#define BLIS_CPACKM_4XK_1R_KERNEL_REF bli_cpackm_4xk_1r_ref +#define BLIS_ZPACKM_4XK_1R_KERNEL_REF bli_zpackm_4xk_1r_ref + +// packm_6xk_1r kernels + +#define BLIS_CPACKM_6XK_1R_KERNEL_REF bli_cpackm_6xk_1r_ref 
+#define BLIS_ZPACKM_6XK_1R_KERNEL_REF bli_zpackm_6xk_1r_ref + +// packm_8xk_1r kernels + +#define BLIS_CPACKM_8XK_1R_KERNEL_REF bli_cpackm_8xk_1r_ref +#define BLIS_ZPACKM_8XK_1R_KERNEL_REF bli_zpackm_8xk_1r_ref + +// packm_10xk_1r kernels + +#define BLIS_CPACKM_10XK_1R_KERNEL_REF bli_cpackm_10xk_1r_ref +#define BLIS_ZPACKM_10XK_1R_KERNEL_REF bli_zpackm_10xk_1r_ref + +// packm_12xk_1r kernels + +#define BLIS_CPACKM_12XK_1R_KERNEL_REF bli_cpackm_12xk_1r_ref +#define BLIS_ZPACKM_12XK_1R_KERNEL_REF bli_zpackm_12xk_1r_ref + +// packm_14xk_1r kernels + +#define BLIS_CPACKM_14XK_1R_KERNEL_REF bli_cpackm_14xk_1r_ref +#define BLIS_ZPACKM_14XK_1R_KERNEL_REF bli_zpackm_14xk_1r_ref + +// packm_16xk_1r kernels + +#define BLIS_CPACKM_16XK_1R_KERNEL_REF bli_cpackm_16xk_1r_ref +#define BLIS_ZPACKM_16XK_1R_KERNEL_REF bli_zpackm_16xk_1r_ref + +// packm_30xk_1r kernels + +#define BLIS_CPACKM_30XK_1R_KERNEL_REF bli_cpackm_30xk_1r_ref +#define BLIS_ZPACKM_30XK_1R_KERNEL_REF bli_zpackm_30xk_1r_ref + + + #endif diff --git a/frame/ind/include/bli_packm_rih_macro_defs.h b/frame/ind/include/bli_packm_rih_macro_defs.h index 543d197a0..c5c883e7d 100644 --- a/frame/ind/include/bli_packm_rih_macro_defs.h +++ b/frame/ind/include/bli_packm_rih_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default rih-specific kernel names --------------------------------- -// -// Level-1m -// // packm_2xk_rih kernels diff --git a/frame/ind/misc/bli_l3_ind_opt.h b/frame/ind/misc/bli_l3_ind_opt.h new file mode 100644 index 000000000..6a0be1885 --- /dev/null +++ b/frame/ind/misc/bli_l3_ind_opt.h @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_IND_OPT_H +#define BLIS_L3_IND_OPT_H + +#define bli_l3_ind_recast_1m_params( dt_exec, schema_a, c, \ + m, n, k, \ + pd_a, ps_a, \ + pd_b, ps_b, \ + rs_c, cs_c ) \ +{ \ + obj_t beta; \ +\ + /* Detach the beta scalar from c so that we can test its imaginary + component. */ \ + bli_obj_scalar_detach( c, &beta ); \ +\ + /* If beta is in the real domain, and c is row- or column-stored, + then we may proceed with the optimization. 
*/ \ + if ( bli_obj_imag_equals( &beta, &BLIS_ZERO ) && \ + !bli_is_gen_stored( rs_c, cs_c ) ) \ + { \ + dt_exec = bli_datatype_proj_to_real( dt_exec ); \ +\ + if ( bli_is_1e_packed( schema_a ) ) \ + { \ + m *= 2; \ + n *= 1; \ + k *= 2; \ + pd_a *= 2; ps_a *= 2; \ + pd_b *= 1; ps_b *= 2; \ + rs_c *= 1; cs_c *= 2; \ + } \ + else /* if ( bli_is_1r_packed( schema_a ) ) */ \ + { \ + m *= 1; \ + n *= 2; \ + k *= 2; \ + pd_a *= 1; ps_a *= 2; \ + pd_b *= 2; ps_b *= 2; \ + rs_c *= 2; cs_c *= 1; \ + } \ + } \ +} + +#endif diff --git a/frame/ind/oapi/bli_l3_3m4m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c similarity index 92% rename from frame/ind/oapi/bli_l3_3m4m_oapi.c rename to frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 40348e627..cb966d71c 100644 --- a/frame/ind/oapi/bli_l3_3m4m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -49,10 +49,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -63,7 +64,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -92,6 +93,7 @@ GENFRONT( gemm, gemm, 3m1, 1 ) GENFRONT( gemm, gemm, 4mh, 4 ) GENFRONT( gemm, gemm, 4mb, 1 ) GENFRONT( gemm, gemm, 4m1, 1 ) +GENFRONT( gemm, gemm, 1m, 1 ) // her2k GENFRONT( her2k, gemm, 3mh, 3 ) @@ -101,6 +103,7 @@ GENFRONT( her2k, gemm, 3m1, 1 ) GENFRONT( her2k, gemm, 4mh, 4 ) //GENFRONT( her2k, gemm, 4mb, 1 ) // Not implemented. 
GENFRONT( her2k, gemm, 4m1, 1 ) +GENFRONT( her2k, gemm, 1m, 1 ) // syr2k GENFRONT( syr2k, gemm, 3mh, 3 ) @@ -110,6 +113,7 @@ GENFRONT( syr2k, gemm, 3m1, 1 ) GENFRONT( syr2k, gemm, 4mh, 4 ) //GENFRONT( syr2k, gemm, 4mb, 1 ) // Not implemented. GENFRONT( syr2k, gemm, 4m1, 1 ) +GENFRONT( syr2k, gemm, 1m, 1 ) // -- hemm/symm/trmm3 ---------------------------------------------------------- @@ -128,10 +132,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -142,7 +147,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -171,6 +176,7 @@ GENFRONT( hemm, gemm, 3m1, 1 ) GENFRONT( hemm, gemm, 4mh, 4 ) //GENFRONT( hemm, gemm, 4mb, 1 ) // Not implemented. GENFRONT( hemm, gemm, 4m1, 1 ) +GENFRONT( hemm, gemm, 1m, 1 ) // symm GENFRONT( symm, gemm, 3mh, 3 ) @@ -180,6 +186,7 @@ GENFRONT( symm, gemm, 3m1, 1 ) GENFRONT( symm, gemm, 4mh, 4 ) //GENFRONT( symm, gemm, 4mb, 1 ) // Not implemented. GENFRONT( symm, gemm, 4m1, 1 ) +GENFRONT( symm, gemm, 1m, 1 ) // trmm3 GENFRONT( trmm3, gemm, 3mh, 3 ) @@ -189,6 +196,7 @@ GENFRONT( trmm3, gemm, 3m1, 1 ) GENFRONT( trmm3, gemm, 4mh, 4 ) //GENFRONT( trmm3, gemm, 4mb, 1 ) // Not implemented. 
GENFRONT( trmm3, gemm, 4m1, 1 ) +GENFRONT( trmm3, gemm, 1m, 1 ) // -- herk/syrk ---------------------------------------------------------------- @@ -205,10 +213,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -219,7 +228,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -248,6 +257,7 @@ GENFRONT( herk, gemm, 3m1, 1 ) GENFRONT( herk, gemm, 4mh, 4 ) //GENFRONT( herk, gemm, 4mb, 1 ) // Not implemented. GENFRONT( herk, gemm, 4m1, 1 ) +GENFRONT( herk, gemm, 1m, 1 ) // syrk GENFRONT( syrk, gemm, 3mh, 3 ) @@ -257,6 +267,7 @@ GENFRONT( syrk, gemm, 3m1, 1 ) GENFRONT( syrk, gemm, 4mh, 4 ) //GENFRONT( syrk, gemm, 4mb, 1 ) // Not implemented. GENFRONT( syrk, gemm, 4m1, 1 ) +GENFRONT( syrk, gemm, 1m, 1 ) // -- trmm --------------------------------------------------------------------- @@ -273,6 +284,8 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ +\ cntx_t* cntx_p; \ dim_t i; \ \ @@ -285,7 +298,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -310,6 +323,7 @@ GENFRONT( trmm, gemm, 3m1, 1 ) //GENFRONT( trmm, gemm, 4mh, 4 ) // Unimplementable. //GENFRONT( trmm, gemm, 4mb, 1 ) // Unimplementable. 
GENFRONT( trmm, gemm, 4m1, 1 ) +GENFRONT( trmm, gemm, 1m, 1 ) // -- trsm --------------------------------------------------------------------- @@ -326,6 +340,8 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ +\ cntx_t* cntx_p; \ \ /* If the objects are in the real domain, execute the native @@ -337,7 +353,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ { \ /* NOTE: trsm cannot be implemented via any induced method that @@ -360,4 +376,5 @@ GENFRONT( trsm, trsm, 3m1, 1 ) //GENFRONT( trmm, trsm, 4mh, 4 ) // Unimplementable. //GENFRONT( trmm, trsm, 4mb, 1 ) // Unimplementable. GENFRONT( trsm, trsm, 4m1, 1 ) +GENFRONT( trsm, trsm, 1m, 1 ) diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 62fa794fa..7f8ae194c 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -55,6 +55,7 @@ GENPROT( nat ) GENPROT( ind ) GENPROT( 3m1 ) GENPROT( 4m1 ) +GENPROT( 1m ) // diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 68b664d65..c783714fe 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -55,10 +55,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -92,10 +93,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. 
*/ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -127,10 +129,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -161,10 +164,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -194,10 +198,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. 
*/ \ diff --git a/frame/ind/tapi/bli_l3_ind_tapi.c b/frame/ind/tapi/bli_l3_ind_tapi.c index 1c4ba3ba9..d4425b5f6 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.c +++ b/frame/ind/tapi/bli_l3_ind_tapi.c @@ -90,6 +90,7 @@ INSERT_GENTFUNC_BASIC0( gemm3m1 ) INSERT_GENTFUNC_BASIC0( gemm4mh ) INSERT_GENTFUNC_BASIC0( gemm4mb ) INSERT_GENTFUNC_BASIC0( gemm4m1 ) +INSERT_GENTFUNC_BASIC0( gemm1m ) // -- hemm --------------------------------------------------------------------- @@ -149,6 +150,7 @@ INSERT_GENTFUNC_BASIC0( hemm3mh ) INSERT_GENTFUNC_BASIC0( hemm3m1 ) INSERT_GENTFUNC_BASIC0( hemm4mh ) INSERT_GENTFUNC_BASIC0( hemm4m1 ) +INSERT_GENTFUNC_BASIC0( hemm1m ) // -- herk --------------------------------------------------------------------- @@ -200,6 +202,7 @@ INSERT_GENTFUNCR_BASIC0( herk3mh ) INSERT_GENTFUNCR_BASIC0( herk3m1 ) INSERT_GENTFUNCR_BASIC0( herk4mh ) INSERT_GENTFUNCR_BASIC0( herk4m1 ) +INSERT_GENTFUNCR_BASIC0( herk1m ) // -- her2k -------------------------------------------------------------------- @@ -258,6 +261,7 @@ INSERT_GENTFUNCR_BASIC0( her2k3mh ) INSERT_GENTFUNCR_BASIC0( her2k3m1 ) INSERT_GENTFUNCR_BASIC0( her2k4mh ) INSERT_GENTFUNCR_BASIC0( her2k4m1 ) +INSERT_GENTFUNCR_BASIC0( her2k1m ) // -- symm --------------------------------------------------------------------- @@ -317,6 +321,7 @@ INSERT_GENTFUNC_BASIC0( symm3mh ) INSERT_GENTFUNC_BASIC0( symm3m1 ) INSERT_GENTFUNC_BASIC0( symm4mh ) INSERT_GENTFUNC_BASIC0( symm4m1 ) +INSERT_GENTFUNC_BASIC0( symm1m ) // -- syrk --------------------------------------------------------------------- @@ -367,6 +372,7 @@ INSERT_GENTFUNC_BASIC0( syrk3mh ) INSERT_GENTFUNC_BASIC0( syrk3m1 ) INSERT_GENTFUNC_BASIC0( syrk4mh ) INSERT_GENTFUNC_BASIC0( syrk4m1 ) +INSERT_GENTFUNC_BASIC0( syrk1m ) // -- syr2k -------------------------------------------------------------------- @@ -424,6 +430,7 @@ INSERT_GENTFUNC_BASIC0( syr2k3mh ) INSERT_GENTFUNC_BASIC0( syr2k3m1 ) INSERT_GENTFUNC_BASIC0( syr2k4mh ) INSERT_GENTFUNC_BASIC0( syr2k4m1 ) 
+INSERT_GENTFUNC_BASIC0( syr2k1m ) // -- trmm3 -------------------------------------------------------------------- @@ -485,6 +492,7 @@ INSERT_GENTFUNC_BASIC0( trmm33mh ) INSERT_GENTFUNC_BASIC0( trmm33m1 ) INSERT_GENTFUNC_BASIC0( trmm34mh ) INSERT_GENTFUNC_BASIC0( trmm34m1 ) +INSERT_GENTFUNC_BASIC0( trmm31m ) // -- trmm --------------------------------------------------------------------- @@ -534,6 +542,7 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( trmm3m1 ) INSERT_GENTFUNC_BASIC0( trmm4m1 ) +INSERT_GENTFUNC_BASIC0( trmm1m ) // -- trsm --------------------------------------------------------------------- @@ -583,4 +592,5 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( trsm3m1 ) INSERT_GENTFUNC_BASIC0( trsm4m1 ) +INSERT_GENTFUNC_BASIC0( trsm1m ) diff --git a/frame/ind/tapi/bli_l3_ind_tapi.h b/frame/ind/tapi/bli_l3_ind_tapi.h index 029166c6c..7aa886b3d 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.h +++ b/frame/ind/tapi/bli_l3_ind_tapi.h @@ -58,6 +58,7 @@ INSERT_GENTPROT_BASIC( gemm3m1 ) INSERT_GENTPROT_BASIC( gemm4mh ) INSERT_GENTPROT_BASIC( gemm4mb ) INSERT_GENTPROT_BASIC( gemm4m1 ) +INSERT_GENTPROT_BASIC( gemm1m ) #undef GENTPROT @@ -83,6 +84,7 @@ INSERT_GENTPROT_BASIC( hemm3mh ) INSERT_GENTPROT_BASIC( hemm3m1 ) INSERT_GENTPROT_BASIC( hemm4mh ) INSERT_GENTPROT_BASIC( hemm4m1 ) +INSERT_GENTPROT_BASIC( hemm1m ) #undef GENTPROTR @@ -107,6 +109,7 @@ INSERT_GENTPROTR_BASIC( her2k3mh ) INSERT_GENTPROTR_BASIC( her2k3m1 ) INSERT_GENTPROTR_BASIC( her2k4mh ) INSERT_GENTPROTR_BASIC( her2k4m1 ) +INSERT_GENTPROTR_BASIC( her2k1m ) #undef GENTPROTR @@ -129,6 +132,7 @@ INSERT_GENTPROTR_BASIC( herk3mh ) INSERT_GENTPROTR_BASIC( herk3m1 ) INSERT_GENTPROTR_BASIC( herk4mh ) INSERT_GENTPROTR_BASIC( herk4m1 ) +INSERT_GENTPROTR_BASIC( herk1m ) #undef GENTPROT @@ -154,6 +158,7 @@ INSERT_GENTPROT_BASIC( symm3mh ) INSERT_GENTPROT_BASIC( symm3m1 ) INSERT_GENTPROT_BASIC( symm4mh ) INSERT_GENTPROT_BASIC( symm4m1 ) +INSERT_GENTPROT_BASIC( symm1m ) #undef GENTPROT @@ -178,6 +183,7 
@@ INSERT_GENTPROT_BASIC( syr2k3mh ) INSERT_GENTPROT_BASIC( syr2k3m1 ) INSERT_GENTPROT_BASIC( syr2k4mh ) INSERT_GENTPROT_BASIC( syr2k4m1 ) +INSERT_GENTPROT_BASIC( syr2k1m ) #undef GENTPROT @@ -200,6 +206,7 @@ INSERT_GENTPROT_BASIC( syrk3mh ) INSERT_GENTPROT_BASIC( syrk3m1 ) INSERT_GENTPROT_BASIC( syrk4mh ) INSERT_GENTPROT_BASIC( syrk4m1 ) +INSERT_GENTPROT_BASIC( syrk1m ) #undef GENTPROT @@ -226,6 +233,7 @@ INSERT_GENTPROT_BASIC( trmm33mh ) INSERT_GENTPROT_BASIC( trmm33m1 ) INSERT_GENTPROT_BASIC( trmm34mh ) INSERT_GENTPROT_BASIC( trmm34m1 ) +INSERT_GENTPROT_BASIC( trmm31m ) #undef GENTPROT @@ -247,6 +255,7 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( trmm3m1 ) INSERT_GENTPROT_BASIC( trmm4m1 ) +INSERT_GENTPROT_BASIC( trmm1m ) #undef GENTPROT @@ -268,4 +277,5 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( trsm3m1 ) INSERT_GENTPROT_BASIC( trsm4m1 ) +INSERT_GENTPROT_BASIC( trsm1m ) diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c new file mode 100644 index 000000000..f686aa7ac --- /dev/null +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -0,0 +1,179 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t row_pref = !col_pref; \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r =
&PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r beta_use; \ +\ + ctype_r* c_use; \ + inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride OR if for some reason the storage of c is not the + preferred storage of the micro-kernel, then we compute the + alpha*a*b product into temporary storage and then accumulate that + result into c afterwards. */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ + else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements. */ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + beta_use = *zero_r; \ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ + else \ + { \ + /* In a typical case, we use the real part of beta and accumulate + directly into the output matrix c. 
*/ \ + beta_use = beta_r; \ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ + } \ +\ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + &beta_use, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ +\ + /* If necessary, accumulate the final result in ct back to c. */ \ + if ( using_ct ) \ + { \ + dim_t i, j; \ +\ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) + diff --git a/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h b/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h index d7d5a258f..9b2dd5e5a 100644 --- a/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h +++ b/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h @@ -55,4 +55,5 @@ INSERT_GENTPROTCO_BASIC( gemm3m1_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4mh_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4mb_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4m1_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemm1m_ukr_ref ) diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c new file mode 100644 index 000000000..7d746304c --- /dev/null +++ b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c @@ -0,0 +1,244 @@ +/* + + BLIS + An 
object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a1x, \ + ctype* restrict a11, \ + ctype* restrict bx1, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ +\ + PASTECH(ch,trsm_ukr_ft) \ + ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ +\ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t mr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ + const dim_t nr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ +\ + ctype bt[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_bt; \ + inc_t cs_bt; \ +\ + inc_t rs_bt_r; \ + inc_t cs_bt_r; \ +\ + const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const pack_t schema_b = bli_cntx_schema_b( cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype_r* restrict a1x_r = ( ctype_r* )a1x; \ +\ + ctype_r* restrict bx1_r = ( ctype_r* )bx1; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + const ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ + const ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ +\ + ctype_r* b_use; \ + inc_t rs_b_use; \ + inc_t cs_b_use; \ +\ +\ + /* Handle alphas with non-zero imaginary components.
*/ \ + /* NOTE: This branch should never execute because alphas with + non-zero imaginary components should be applied during + packing, and so the only alphas we should see here are + those exclusively in the real domain, either because the + value originally had no imaginary component (e.g. 4.0) or + because a 1.0 was sent in as a placeholder since the alpha + was applied during packing. */ \ + if ( 0 ) \ + if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ + { \ + bli_abort(); \ +\ +/* + ctype_r* restrict one_r = PASTEMAC(chr,1); \ +\ + const inc_t ld_b = rs_b; \ +\ + PASTEMAC(ch,scal1ms_mxn)( schema_b, \ + mr, \ + nr, \ + alpha, \ + b11, rs_b, cs_b, ld_b ); \ +\ + alpha_r = *one_r; \ +*/ \ + } \ +\ +\ + { \ + /* Set the strides for the temporary bt matrix based on the native + real domain micro-kernel storage preferences. */ \ + if ( col_pref ) { rs_bt = 1; cs_bt = mr; \ + rs_bt_r = 1; cs_bt_r = mr_r; } \ + else { rs_bt = nr; cs_bt = 1; \ + rs_bt_r = nr_r; cs_bt_r = 1; } \ +\ + b_use = ( ctype_r* )bt; \ + rs_b_use = rs_bt_r; \ + cs_b_use = cs_bt_r; \ + } \ +\ +\ + /* Since b11 is stored in the 1e or 1r schema, we cannot update it + directly, and instead must compute the matrix product in a local + temporary microtile and then accumulate it into b11 according to + its schema.
*/ \ +\ +\ + /* lower: bt = -1.0 * a10 * b01; + upper: bt = -1.0 * a12 * b21; */ \ + rgemm_ukr \ + ( \ + k2, \ + minus_one_r, \ + a1x_r, \ + bx1_r, \ + zero_r, \ + b_use, rs_b_use, cs_b_use, \ + data, \ + cntx \ + ); \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t ld_b = rs_b; \ +\ + ctype* restrict b11_ri = ( ctype* )b11; \ + ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \ +\ + dim_t i, j; \ +\ + /* b11 = alpha * b11 + bt; each updated element is also written to b11_ir as (-imag, real). */ \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype* restrict beta11_ri = b11_ri + i*rs_b + j*cs_b; \ + ctype_r* restrict beta11_r = &PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r* restrict beta11_i = &PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype* restrict beta11_ir = b11_ir + i*rs_b + j*cs_b; \ +\ + PASTEMAC2(chr,ch,xpbyris)( *beta11t_r, \ + *beta11t_i, \ + alpha_r, \ + alpha_i, /* alpha_i not referenced */ \ + *beta11_r, \ + *beta11_i ); \ +\ + PASTEMAC(ch,sets)( -*beta11_i, \ + *beta11_r, *beta11_ir ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t ld_b = rs_b; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = cs_b; \ +\ + ctype_r* restrict b11_r = ( ctype_r* )b11; \ + ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \ +\ + dim_t i, j; \ +\ + /* b11 = alpha * b11 + bt; */ \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2; \ + ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2; \ +\ + PASTEMAC2(chr,ch,xpbyris)( *beta11t_r, \ + *beta11t_i, \ + alpha_r, \ + alpha_i, /* alpha_i not referenced */ \ +
*beta11_r, \ + *beta11_i ); \ + } \ + } \ +\ +\ + /* b11 = inv(a11) * b11; + c11 = b11; */ \ + ctrsm_vir_ukr \ + ( \ + a11, \ + b11, \ + c11, rs_c, cs_c, \ + data, \ + cntx \ + ); \ +} + +INSERT_GENTFUNCCO_BASIC2( gemmtrsm1m_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) +INSERT_GENTFUNCCO_BASIC2( gemmtrsm1m_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) + diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h b/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h index 7ec51ad8d..615482e41 100644 --- a/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h +++ b/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h @@ -55,3 +55,6 @@ INSERT_GENTPROTCO_BASIC( gemmtrsm4m1_u_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemmtrsm3m1_l_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemmtrsm3m1_u_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemmtrsm1m_l_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemmtrsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c new file mode 100644 index 000000000..92da659ca --- /dev/null +++ b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c @@ -0,0 +1,448 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t m = mr; \ + const dim_t n = nr; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + const inc_t ld_a = cs_a; \ + const inc_t ld_b = rs_b; \ +\ + const pack_t schema_b = bli_cntx_schema_b( cntx ); \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t rs_a2 = 1 * rs_a; \ + const inc_t cs_a2 = 2 * cs_a; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ +\ + ctype* restrict b_ri = (
ctype* )b; \ + ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ + ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ + ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \ + ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \ + ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \ + ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \ + ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel.
*/ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ + PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + } \ + } \ + } \ + else /* ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 1 * cs_b; \ +\ + ctype* restrict a_ri = ( ctype* )a; \ + /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ + ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ + ctype* restrict a10t_ri = a_ri + (i )*rs_a + (0 )*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_r = b_r + (0 )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_i = b_i + (0 )*rs_b2 + (0 )*cs_b2; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b01_r = B0_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b01_i = B0_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype* restrict alpha10_ri = a10t_ri + (l )*cs_a; \ +
ctype_r* restrict alpha10_r = &PASTEMAC(ch,real)( *alpha10_ri ); \ + ctype_r* restrict alpha10_i = &PASTEMAC(ch,imag)( *alpha10_ri ); \ + ctype_r* restrict beta01_r = b01_r + (l )*rs_b2; \ + ctype_r* restrict beta01_i = b01_i + (l )*rs_b2; \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *beta11_r, \ + *beta11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm1m_l_ukr_ref ) + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t m = mr; \ + const dim_t n = nr; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + const inc_t ld_a = cs_a; \ + const inc_t ld_b = rs_b; \ +\ + const pack_t schema_b =
bli_cntx_schema_b( cntx ); \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t rs_a2 = 1 * rs_a; \ + const inc_t cs_a2 = 2 * cs_a; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ +\ + ctype* restrict b_ri = ( ctype* )b; \ + ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \ + ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \ + ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ + ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \ + ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \ + ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ +
rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ + PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 1 * cs_b; \ +\ + ctype* restrict a_ri = ( ctype* )a; \ + /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ + ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ + ctype* restrict a12t_ri = a_ri + (i )*rs_a + (i+1)*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_r = b_r + (i+1)*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_i = b_i + (i+1)*rs_b2 + (0 )*cs_b2; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b21_r = B2_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b21_i = B2_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c +
(i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype* restrict alpha12_ri = a12t_ri + (l )*cs_a; \ + ctype_r* restrict alpha12_r = &PASTEMAC(ch,real)( *alpha12_ri ); \ + ctype_r* restrict alpha12_i = &PASTEMAC(ch,imag)( *alpha12_ri ); \ + ctype_r* restrict beta21_r = b21_r + (l )*rs_b2; \ + ctype_r* restrict beta21_i = b21_i + (l )*rs_b2; \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11.
*/ \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *beta11_r, \ + *beta11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h b/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h index abad11caf..77d502a3c 100644 --- a/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h +++ b/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h @@ -51,3 +51,6 @@ INSERT_GENTPROTCO_BASIC( trsm4m1_u_ukr_ref ) INSERT_GENTPROTCO_BASIC( trsm3m1_l_ukr_ref ) INSERT_GENTPROTCO_BASIC( trsm3m1_u_ukr_ref ) +INSERT_GENTPROTCO_BASIC( trsm1m_l_ukr_ref ) +INSERT_GENTPROTCO_BASIC( trsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c deleted file mode 100644 index 5fc8e012c..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission.
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a10, \ - ctype* restrict a11, \ - ctype* restrict b01, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - ctype_r ab_r[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - 
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a10_r = ( ctype_r* )a10; \ - ctype_r* restrict a10_i = ( ctype_r* )a10 + is_a; \ - ctype_r* restrict a10_ri = ( ctype_r* )a10 + 2*is_a; \ -\ - ctype_r* restrict b01_r = ( ctype_r* )b01; \ - ctype_r* restrict b01_i = ( ctype_r* )b01 + is_b; \ - ctype_r* restrict b01_ri = ( ctype_r* )b01 + 2*is_b; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ - ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( + a10.r * b01.r - a10.i * b01.i ); - b11.i = alpha.r * b11.i - ( a10.ri * b01.ri - a10.r * b01.r - a10.i * b01.i ); */ \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ -\ - /* ab.r = a10.r * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_r, \ - b01_r, \ - zero_r, \ - ab_r, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_ri, b01_ri, *data ); \ -\ - /* ab.i = a10.i * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_i, \ - b01_i, \ - zero_r, \ - ab_i, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a10.ri * b01.ri; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_ri, \ - b01_ri, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11.r = alpha.r * b11.r - ab.r; - b11.r = b11.r + ab.i; - b11.i = b11.i + ab.r; - b11.i = b11.i + ab.i; */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ - ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ -\ - PASTEMAC(chr,scals)( alpha_r, beta11_r ); \ -\ - PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11_r, \ - beta11_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Update the ri part of b11. 
*/ \ - PASTEMAC(chr,add3s)( beta11_r, \ - beta11_i, \ - *(b11_ri + i*rs_b + j*cs_b) ); \ - } \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -\ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r after", m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i after", m, n, \ - b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_r", k, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_i", k, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r", m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i", m, n, \ - b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m1_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c deleted file mode 100644 index 9d82ba8c9..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a12, \ - ctype* restrict a11, \ - ctype* restrict b21, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - ctype_r ab_r[ 
BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a12_r = ( ctype_r* )a12; \ - ctype_r* restrict a12_i = ( ctype_r* )a12 + is_a; \ - ctype_r* restrict a12_ri = ( ctype_r* )a12 + 2*is_a; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ - ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*is_b; \ -\ - ctype_r* restrict b21_r = ( ctype_r* )b21; \ - ctype_r* restrict b21_i = ( ctype_r* )b21 + is_b; \ - ctype_r* restrict b21_ri = ( ctype_r* )b21 + 2*is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( + a12.r * b21.r - a12.i * b21.i ); - b11.i = alpha.r * b11.i - ( a12.ri * b21.ri - a12.r * b21.r - a12.i * b21.i ); */ \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ -\ - /* ab.r = a12.r * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_r, \ - b21_r, \ - zero_r, \ - ab_r, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_ri, b21_ri, *data ); \ -\ - /* ab.i = a12.i * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_i, \ - b21_i, \ - zero_r, \ - ab_i, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a12.ri * b21.ri; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_ri, \ - b21_ri, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11.r = alpha.r * b11.r - ab.r; - b11.r = b11.r + ab.i; - b11.i = b11.i + ab.r; - b11.i = b11.i + ab.i; */ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ - ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ -\ - PASTEMAC(chr,scals)( alpha_r, beta11_r ); \ -\ - PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11_r, \ - beta11_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Update the ri part of b11. 
*/ \ - PASTEMAC(chr,add3s)( beta11_r, \ - beta11_i, \ - *(b11_ri + i*rs_b + j*cs_b) ); \ - } \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m1_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c deleted file mode 100644 index c979d5cbf..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a10, \ - ctype* restrict a11, \ - ctype* restrict b01, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a10_r = ( ctype_r* )a10; \ - ctype_r* restrict a10_i = ( ctype_r* )a10 + is_a; \ -\ - ctype_r* restrict b01_r = ( ctype_r* )b01; \ - ctype_r* restrict b01_i = ( ctype_r* )b01 + is_b; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - 
ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -/* -printf( "gemmtrsm4m1_l_ukr: is_a = %lu is_b = %lu\n", is_a, is_b ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_r", m, k+m, \ - a10_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_i", m, k+m, \ - a10_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( a10.r * b01.r - a10.i * b01.i ); - b11.i = alpha.r * b11.i - ( a10.r * b01.i + a10.i * b01.r ); */ \ -\ - bli_auxinfo_set_next_ab( a10_r, b01_i, *data ); \ -\ - /* b11.r = alpha.r * b11.r - a10.r * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_r, \ - b01_r, \ - &alpha_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_r, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a10.r * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_r, \ - b01_i, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ -\ - /* b11.i = 1.0 * b11.i - a10.i * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_i, \ - b01_r, \ - one_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.r = 1.0 * b11.r + a10.i * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_i, \ - b01_i, \ - one_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r post-gemm", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i post-gemm", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r after", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i after", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m1_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c 
b/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c deleted file mode 100644 index 9d1d1927e..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a12, \ - ctype* restrict a11, \ - ctype* restrict b21, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a12_r = ( ctype_r* )a12; \ - ctype_r* restrict a12_i = ( ctype_r* )a12 + is_a; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ -\ - ctype_r* restrict b21_r = ( ctype_r* )b21; \ - ctype_r* restrict b21_i = ( ctype_r* )b21 + is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_r", m, k+m, \ - a11_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_i", m, k+m, \ - a11_r+is_a, 1, 
PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_r", k+m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_i", k+m, n, \ - b11_r+is_b, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ - \ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. */ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( a12.r * b21.r - a12.i * b21.i ); - b11.i = alpha.r * b11.r - ( a12.r * b21.i + a12.i * b21.r ); */ \ -\ - bli_auxinfo_set_next_ab( a12_r, b21_i, *data ); \ -\ - /* b11.r = alpha.r * b11.r - a12.r * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_r, \ - b21_r, \ - &alpha_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_r, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a12.r * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_r, \ - b21_i, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ -\ - /* b11.i = 1.0 * b11.i - a12.i * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_i, \ - b21_r, \ - one_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.r = 1.0 * b11.r + a12.i * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_i, \ - b21_i, \ - one_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, 
\ - cntx \ - ); \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m1_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c deleted file mode 100644 index 62fff68e0..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a10t_r = a_r + (i )*rs_a + 
(0 )*cs_a; \ - ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ - ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ - ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ - ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha10_r, \ - *alpha10_i, \ - *beta01_r, \ - *beta01_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ -\ - /* Update the ri part of the packed panel. */ \ - PASTEMAC(chr,add3s)( beta11c_r, \ - beta11c_i, \ - *beta11_ri ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm3m1_l_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c deleted file mode 100644 index af916ed33..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - iter - 1; \ - n_behind = iter; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a12t_r = a_r + 
(i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ - ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ - ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha12_r, \ - *alpha12_i, \ - *beta21_r, \ - *beta21_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ -\ - /* Update the ri part of the packed panel. */ \ - PASTEMAC(chr,add3s)( beta11c_r, \ - beta11c_i, \ - *beta11_ri ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm3m1_u_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c deleted file mode 100644 index 06274d95c..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_r", m, m, \ - a_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_i", m, m, \ - a_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r", m, n, \ - b_r, 
PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i", m, n, \ - b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a10t_r = a_r + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ - ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ - ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ - ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha10_r, \ - *alpha10_i, \ - *beta01_r, \ - *beta01_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. 
We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ - } \ - } \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r after", m, n, \ - b_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i after", m, n, \ - b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm4m1_l_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c deleted file mode 100644 index 5711dc8ce..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - 
iter - 1; \ - n_behind = iter; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a12t_r = a_r + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ - ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ - ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha12_r, \ - *alpha12_i, \ - *beta21_r, \ - *beta21_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. 
*/ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm4m1_u_ukr_ref ) - diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index ad2bb0b40..9cccce228 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -60,7 +60,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -96,7 +96,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim2( m, m ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -139,7 +139,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -186,7 +186,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. 
*/ \ @@ -281,7 +281,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -319,7 +319,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -359,7 +359,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 9e982032f..0b13b8eb1 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -36,7 +36,7 @@ # Makefile # # Field G. Van Zee -# +# # Makefile for standalone BLIS test drivers. 
# @@ -189,6 +189,7 @@ D3M1 := -DIND=BLIS_3M1 D4MHW := -DIND=BLIS_4MH D4M1B := -DIND=BLIS_4M1B D4M1A := -DIND=BLIS_4M1A +D1M := -DIND=BLIS_1M DNAT := -DIND=BLIS_NAT # Implementation string @@ -199,6 +200,7 @@ STR_3M1 := -DSTR=\"3m1\" STR_4MHW := -DSTR=\"4mhw\" STR_4M1B := -DSTR=\"4m1b\" STR_4M1A := -DSTR=\"4m1a\" +STR_1M := -DSTR=\"1m\" STR_NAT := -DSTR=\"asm\" STR_OBL := -DSTR=\"openblas\" STR_MKL := -DSTR=\"mkl\" @@ -209,13 +211,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=80 \ - -DP_END=2000 \ - -DP_INC=80 +PDEF_ST := -DP_BEGIN=100 \ + -DP_END=1000 \ + -DP_INC=100 -PDEF_MT := -DP_BEGIN=80 \ - -DP_END=4000 \ - -DP_INC=80 +PDEF_MT := -DP_BEGIN=100 \ + -DP_END=2000 \ + -DP_INC=100 @@ -259,6 +261,8 @@ blis-gemm-st: \ test_zgemm_4m1b_blis_st.x \ test_cgemm_4m1a_blis_st.x \ test_zgemm_4m1a_blis_st.x \ + test_cgemm_1m_blis_st.x \ + test_zgemm_1m_blis_st.x \ test_cgemm_asm_blis_st.x \ test_zgemm_asm_blis_st.x @@ -280,6 +284,8 @@ blis-gemm-mt: \ test_zgemm_4m1b_blis_mt.x \ test_cgemm_4m1a_blis_mt.x \ test_zgemm_4m1a_blis_mt.x \ + test_cgemm_1m_blis_mt.x \ + test_zgemm_1m_blis_mt.x \ test_cgemm_asm_blis_mt.x \ test_zgemm_asm_blis_mt.x @@ -411,6 +417,19 @@ test_z%_4m1a_blis_mt.o: test_%.c test_c%_4m1a_blis_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D4M1A) $(STR_4M1A) $(STR_MT) -c $< -o $@ +# blis 1m +test_z%_1m_blis_st.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@ + +test_c%_1m_blis_st.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@ + +test_z%_1m_blis_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ + +test_c%_1m_blis_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ + # blis asm test_d%_asm_blis_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) 
-c $< -o $@ diff --git a/test/3m4m/runme.sh b/test/3m4m/runme.sh index bb65a5db5..794f0ba00 100755 --- a/test/3m4m/runme.sh +++ b/test/3m4m/runme.sh @@ -75,16 +75,16 @@ test_ops_r="${l3_ops}" if [ ${sys} = "blis" ]; then #test_impls="openblas mkl 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" - test_impls="openblas 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" elif [ ${sys} = "stampede" ]; then - test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" #test_impls="openblas mkl asm_blis" elif [ ${sys} = "wahlberg" ]; then - test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" test_impls="openblas acml asm_blis" fi diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index c8e9ec5d5..7b16f584f 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -79,23 +79,19 @@ int main( int argc, char** argv ) k_input = -1; #if 0 - num_t dt_real = bli_datatype_proj_to_real( DT ); + cntx_t cntx; - bli_gemm_cntx_init( &cntx ); + // Initialize a context for the current induced method and datatype. + bli_gemmind_cntx_init( IND, dt, &cntx ); - // Extract the kc blocksize for the requested datatype and its - // real analogue. - dim_t kc = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); - dim_t kc_real = bli_cntx_get_blksz_def_dt( dt_real, BLIS_KC, &cntx ); + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); + +#elif 0 + + k_input = 256; - // Assign the k dimension depending on which implementation is - // being tested. 
Note that the BLIS_NAT case handles the real - // domain cases as well as native complex. - if ( IND == BLIS_NAT ) k_input = kc; - else if ( IND == BLIS_3M1 ) k_input = kc_real / 3; - else if ( IND == BLIS_4M1A ) k_input = kc_real / 2; - else k_input = kc_real; #endif // Choose the char corresponding to the requested datatype. @@ -154,7 +150,7 @@ int main( int argc, char** argv ) bli_obj_set_conjtrans( transb, b ); bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( -(1.0/1.0), 0.0, &beta ); + bli_setsc( (1.0/1.0), 0.0, &beta ); bli_copym( &c, &c_save ); diff --git a/testsuite/input.general b/testsuite/input.general index 0bf9053bd..9dba50df6 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -9,7 +9,7 @@ # 1 # Number of repeats per experiment (best result is reported) -c # Matrix storage scheme(s) to test: +rc # Matrix storage scheme(s) to test: # 'c' = col-major storage; 'g' = general stride storage; # 'r' = row-major storage c # Vector storage scheme(s) to test: @@ -26,7 +26,7 @@ sdcz # Datatype(s) to test: # 's' = single real; 'c' = single complex; # 'd' = double real; 'z' = double complex 100 # Problem size: first to test -400 # Problem size: maximum to test +500 # Problem size: maximum to test 100 # Problem size: increment between experiments # Complex level-3 implementations to test 1 # 3mh ('1' = enable; '0' = disable) @@ -36,6 +36,7 @@ sdcz # Datatype(s) to test: 1 # 4mh ('1' = enable; '0' = disable) 1 # 4m1b ('1' = enable; '0' = disable) 1 # 4m1a ('1' = enable; '0' = disable) +1 # 1m ('1' = enable; '0' = disable) 1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index 6f5515127..41c0b9160 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -168,7 +168,7 @@ void libblis_test_axpy2v_experiment cntx_t cntx; // Initialize a context. 
- bli_axpy2v_cntx_init( &cntx ); + bli_axpy2v_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 706359ca4..8da15c315 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -166,7 +166,7 @@ void libblis_test_axpyf_experiment cntx_t cntx; // Initialize a context. - bli_axpyf_cntx_init( &cntx ); + bli_axpyf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index 36b88cc2f..6c1440e95 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -171,7 +171,7 @@ void libblis_test_dotaxpyv_experiment cntx_t cntx; // Initialize a context. - bli_dotaxpyv_cntx_init( &cntx ); + bli_dotaxpyv_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index dd83dc49e..a7abdba87 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -176,7 +176,7 @@ void libblis_test_dotxaxpyf_experiment cntx_t cntx; // Initialize a context. - bli_dotxaxpyf_cntx_init( &cntx ); + bli_dotxaxpyf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 3a29b41b7..8adec7c1d 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -168,7 +168,7 @@ void libblis_test_dotxf_experiment cntx_t cntx; // Initialize a context. 
- bli_dotxf_cntx_init( &cntx ); + bli_dotxf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index 222dca395..89a8bd7c3 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -259,8 +259,6 @@ void libblis_test_gemm_impl { case BLIS_TEST_SEQ_FRONT_END: bli_gemm( alpha, a, b, beta, c ); - //bli_gemm4m( alpha, a, b, beta, c ); - //bli_gemm3m( alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 514fdf66a..f418ac6e5 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -173,7 +173,7 @@ void libblis_test_gemm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_gemm_cntx_init( &cntx ); + bli_gemm_cntx_init( datatype, &cntx ); // Map the dimension specifier to actual dimensions. k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index afd436d7f..172ff053a 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -198,7 +198,7 @@ void libblis_test_gemmtrsm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_trsm_cntx_init( &cntx ); + bli_trsm_cntx_init( datatype, &cntx ); // Map the dimension specifier to actual dimensions. k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index bd14d13b4..993c134b4 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -427,6 +427,10 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_4M1A ]) ); + // Read whether to enable 1m. 
+ libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_1M ]) ); + // Read whether to native (complex) execution. libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_NAT ]) ); @@ -597,8 +601,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) //char int_type_size_str[8]; gint_t int_type_size; ind_t im; - cntx_t cntx_s; - cntx_t* cntx = &cntx_s; + cntx_t cntx_local; + cntx_t cntx_local_c; + cntx_t cntx_local_z; + cntx_t* cntx = &cntx_local; + cntx_t* cntx_c = &cntx_local_c; + cntx_t* cntx_z = &cntx_local_z; // If bli_info_get_int_type_size() returns 32 or 64, the size is forced. // Otherwise, the size is chosen automatically. We query the result of @@ -721,7 +729,10 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmnat_cntx_init( cntx ); + // Initialize a context for the gemm family, assuming native execution. + // We use BLIS_DOUBLE for the datatype, but the dt argument is actually + // only used when initializing contexts for induced methods. 
+ bli_gemmnat_cntx_init( BLIS_DOUBLE, cntx ); libblis_test_fprintf_c( os, "level-3 blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d %7d %7d\n", @@ -825,42 +836,43 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmind_cntx_init( im, cntx ); + bli_gemmind_cntx_init( im, BLIS_SCOMPLEX, cntx_c ); + bli_gemmind_cntx_init( im, BLIS_DCOMPLEX, cntx_z ); libblis_test_fprintf_c( os, "level-3 blocksizes c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MC, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) ); libblis_test_fprintf_c( os, " kc %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_KC, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_KC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) ); libblis_test_fprintf_c( os, " nc %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) ); libblis_test_fprintf_c( os, " kc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( 
BLIS_SCOMPLEX, BLIS_KC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_KC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) ); libblis_test_fprintf_c( os, " nc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mr %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) ); libblis_test_fprintf_c( os, " nr %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NR, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NR, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mr packdim %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MR, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MR, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) ); libblis_test_fprintf_c( os, " nr packdim %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NR, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NR, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) ); libblis_test_fprintf_c( os, "\n" 
); libblis_test_fprintf_c( os, "micro-kernel types c z\n" ); libblis_test_fprintf_c( os, " gemm %7s %7s\n", @@ -880,14 +892,17 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_info_get_trsm_u_ukr_impl_string( im, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmind_cntx_finalize( im, cntx ); + bli_gemmind_cntx_finalize( im, cntx_c ); + bli_gemmind_cntx_finalize( im, cntx_z ); } bli_ind_disable_all(); // We use hemv's context because we know it is initialized with all of the fields // we will be outputing. - bli_hemv_cntx_init( cntx ); + // We use BLIS_DOUBLE for the datatype, but the dt argument is actually + // only used when initializing contexts for induced methods. + bli_hemv_cntx_init( BLIS_DOUBLE, cntx ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS misc. other info ---\n" ); @@ -955,6 +970,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, " 4mh? %u\n", params->ind_enable[ BLIS_4MH ] ); libblis_test_fprintf_c( os, " 4m1b (4mb)? %u\n", params->ind_enable[ BLIS_4M1B ] ); libblis_test_fprintf_c( os, " 4m1a (4m1)? %u\n", params->ind_enable[ BLIS_4M1A ] ); + libblis_test_fprintf_c( os, " 1m? %u\n", params->ind_enable[ BLIS_1M ] ); libblis_test_fprintf_c( os, " native? %u\n", params->ind_enable[ BLIS_NAT ] ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index bf5f2d6bd..e7ccb4b43 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -178,7 +178,7 @@ void libblis_test_trsm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_trsm_cntx_init( &cntx ); + bli_trsm_cntx_init( datatype, &cntx ); // Fix m and n to MR and NR, respectively. 
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, &cntx ); diff --git a/version b/version index 0c62199f1..566318cf2 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.1 +0.2.1-82 From 1c732d3ddc4ac0861d3b0e0dd15eb7e071615502 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 25 Jan 2017 16:25:46 -0600 Subject: [PATCH 03/64] Added 1m-specific APIs for bp, pb gemm algorithms. Details: - Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the body of bli_gemm_cntl_create() replaced with a call to the former. - Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now, bli_cntl_free() can check if the thread parameter is NULL, and if so, call the latter, and otherwise call the former. - Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in terms of bli_gemm1mxx_cntx_init(), which behaves the same as bli_gemm1m_cntx_init() did before, except that an extra bool parameter (is_pb) is used to support both bp and pb algorithms (including to support the anti-preference field described below). - Added support for "anti-preference" in context. The anti_pref field, when true, will toggle the boolean return value of routines such as bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of causing BLIS to transpose the operation to achieve disagreement (rather than agreement) between the storage of C and the micro-kernel output preference. This disagreement is needed for panel-block implementations, since they induce a transposition of the suboperation immediately before the macro-kernel is called, which changes the apparent storage of C. For now, anti-preference is used only with the pb algorithm for 1m (and not with any other non-1m implementation). 
- Defined new functions, bli_cntx_l3_ukr_eff_prefers_storage_of() bli_cntx_l3_ukr_eff_dislikes_storage_of() bli_cntx_l3_nat_ukr_eff_prefers_storage_of() bli_cntx_l3_nat_ukr_eff_dislikes_storage_of() which are identical to their non-"eff" (effectively) counterparts except that they take the anti-preference field of the context into account. - Explicitly initialize the anti-pref field to FALSE in bli_gks_cntx_set_l3_nat_ukr_prefs(). - Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel in terms of the existing block-panel macro-kernel _ker_var2(). This technique requires inducing transposes on all operands and swapping the A and B. - Changed bli_obj_induce_trans() macro so that pack-related fields are also changed to reflect the induced transposition. - Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily specify the 1m algorithm (block-panel or panel-block). - Renamed the following cntx_t-related macros: bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block() bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel() bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel() and updated all instantiations. Also updated the field names in the cntx_t struct. - Comment updates. 
--- frame/1m/packm/bli_packm_init.c | 4 +- frame/3/bli_l3_cntl.c | 4 +- frame/3/bli_l3_cntx.c | 10 +- frame/3/gemm/bli_gemm_cntl.c | 104 +++++++++- frame/3/gemm/bli_gemm_cntl.h | 14 ++ frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_ker_var1.c | 56 ++++++ frame/3/gemm/bli_gemm_var.h | 1 + frame/base/bli_cntl.c | 52 ++++- frame/base/bli_cntl.h | 13 ++ frame/base/bli_cntx.c | 102 ++++++++-- frame/base/bli_cntx.h | 94 ++++++--- frame/base/bli_gks.c | 3 + frame/include/bli_obj_macro_defs.h | 16 ++ frame/include/bli_type_defs.h | 8 +- frame/ind/cntx/bli_gemmind_cntx.c | 133 ++++++++----- frame/ind/cntx/bli_gemmind_cntx.h | 3 + frame/ind/cntx/bli_trsmind_cntx.c | 24 +-- frame/ind/oapi/bli_l3_1mbppb_oapi.c | 85 ++++++++ frame/ind/oapi/bli_l3_3m4m1m_oapi.c | 8 + frame/ind/oapi/bli_l3_ind_oapi.h | 14 ++ frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c | 130 +++++++----- .../ukernels/gemm/bli_gemm1m_ukr_ref.c.prev | 188 ++++++++++++++++++ .../ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c | 2 +- frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c | 4 +- 25 files changed, 891 insertions(+), 183 deletions(-) create mode 100644 frame/3/gemm/bli_gemm_ker_var1.c create mode 100644 frame/ind/oapi/bli_l3_1mbppb_oapi.c create mode 100644 frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index ccf88f3cb..d828f698d 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -121,11 +121,11 @@ siz_t bli_packm_init if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) { - schema = bli_cntx_get_pack_schema_a( cntx ); + schema = bli_cntx_get_pack_schema_a_block( cntx ); } else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) { - schema = bli_cntx_get_pack_schema_b( cntx ); + schema = bli_cntx_get_pack_schema_b_panel( cntx ); } else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index a8dfee1ba..4fe3fe7f5 100644 --- 
a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -70,8 +70,8 @@ void bli_l3_cntl_create_if else { // If the user provided a control tree, create a copy and use it - // instead (so that it can be used to cache things like pack mem_t - // entries). + // instead (so that threads can use its local tree as a place to + // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( cntl_orig ); } } diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 8b4b01572..161e68160 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -63,9 +63,8 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_gemm_cntx_finalize( cntx_t* cntx ) @@ -106,9 +105,8 @@ void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_trsm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b3494b174..775ca2544 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create opid_t family ) { - void* macro_kernel_p = bli_gemm_ker_var2; + return bli_gemmbp_cntl_create( family ); +} +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var2; // Change the macro-kernel if the operation family is herk or trmm. 
if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; @@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create ( - bli_gemm_packa, + bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, BLIS_MR, BLIS_KR, @@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix B. cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create ( - bli_gemm_packb, + bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, BLIS_KR, BLIS_NR, @@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create return gemm_cntl_vl_mm; } +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var1; + + // Change the macro-kernel if the operation family is herk or trmm. + //if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; + + // Create two nodes for the macro-kernel. + cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create + ( + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. + ); + + cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + gemm_cntl_ub_ke + ); + + // Create a node for packing matrix A (which is really the right-hand + // operand "B"). + cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_gemm_packb, // pack the right-hand operand + bli_packm_blk_var1, + BLIS_KR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + gemm_cntl_pb_ub + ); + + // Create a node for partitioning the n dimension by MC. 
+ cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create + ( + BLIS_MC, + bli_gemm_blk_var2, + gemm_cntl_packb + ); + + // Create a node for packing matrix B (which is really the left-hand + // operand "A"). + cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_gemm_packa, // pack the left-hand operand + bli_packm_blk_var1, + BLIS_NR, + BLIS_KR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + gemm_cntl_op_pb + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + ( + BLIS_KC, + bli_gemm_blk_var3, + gemm_cntl_packa + ); + + // Create a node for partitioning the m dimension by NC. + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + ( + BLIS_NC, + bli_gemm_blk_var1, + gemm_cntl_mm_op + ); + + return gemm_cntl_vl_mm; +} + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 5b985327c..6da6cd768 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create opid_t family ); +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ); + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ); + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index d3b11c43d..acceabbe8 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -73,7 +73,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to 
access elements of C in its preferred manner. - if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_eff_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, b_local ); diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/bli_gemm_ker_var1.c new file mode 100644 index 000000000..7b485a6b7 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var1.c @@ -0,0 +1,56 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_ker_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Implement _ker_var1() in terms of _ker_var2() by transposing the + // entire suboperation (which also requires swapping A and B). + + bli_obj_induce_trans( *a ); + bli_obj_induce_trans( *b ); + bli_obj_induce_trans( *c ); + + bli_gemm_ker_var2( b, a, c, cntx, cntl, thread ); +} + diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index c66587fda..88412c3d8 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 ) GENPROT( gemm_packa ) GENPROT( gemm_packb ) +GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index 2b45a5de3..cac290da9 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -97,6 +97,16 @@ void bli_cntl_free cntl_t* cntl, thrinfo_t* thread ) +{ + if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread ); + else bli_cntl_free_wo_thrinfo( cntl ); +} + +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ) { // Base case: simply return when asked to free NULL nodes. if ( cntl == NULL ) return; @@ -112,7 +122,7 @@ void bli_cntl_free { // Recursively free all memory associated with the sub-node and its // children. 
- bli_cntl_free( cntl_sub_node, thread_sub_node ); + bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node ); } // Free the current node's params field, if it is non-NULL. @@ -122,8 +132,8 @@ void bli_cntl_free } // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the current thread - // is chief for its group, and only if the mem_t is allocated. + // broker from which it originated, but only if the mem_t entry is + // allocated, and only if the current thread is chief for its group. if ( bli_thread_am_ochief( thread ) ) if ( bli_mem_is_alloc( cntl_pack_mem ) ) { @@ -134,6 +144,42 @@ void bli_cntl_free bli_cntl_obj_free( cntl ); } +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ) +{ + // Base case: simply return when asked to free NULL nodes. + if ( cntl == NULL ) return; + + cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); + void* cntl_params = bli_cntl_params( cntl ); + mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); + + { + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free_wo_thrinfo( cntl_sub_node ); + } + + // Free the current node's params field, if it is non-NULL. + if ( cntl_params != NULL ) + { + bli_free_intl( cntl_params ); + } + + // Release the current node's pack mem_t entry back to the memory + // broker from which it originated, but only if the mem_t entry is + // allocated. + if ( bli_mem_is_alloc( cntl_pack_mem ) ) + { + bli_membrk_release( cntl_pack_mem ); + } + + // Free the current node. 
+ bli_cntl_obj_free( cntl ); +} + // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 7b6000bb9..fd0413f4f 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -75,12 +75,25 @@ void bli_cntl_obj_clear cntl_t* cntl ); +// ----------------------------------------------------------------------------- + void bli_cntl_free ( cntl_t* cntl, thrinfo_t* thread ); +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ); + cntl_t* bli_cntl_copy ( cntl_t* cntl diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index e4299eb49..f8cdf1fc4 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx ) return bli_cntx_method( cntx ); } -pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ) { - return bli_cntx_schema_a( cntx ); + return bli_cntx_schema_a_block( cntx ); } -pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ) { - return bli_cntx_schema_b( cntx ); + return bli_cntx_schema_b_panel( cntx ); +} + +pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ) +{ + return bli_cntx_schema_c_panel( cntx ); +} + +bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ) +{ + return bli_cntx_anti_pref( cntx ); } #endif @@ -705,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method, bli_cntx_set_method( method, cntx ); } -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void 
bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); } -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_c( pack_t schema_c, +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ) +{ + bli_cntx_set_schema_c_panel( schema_c, cntx ); +} + +#if 0 +void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, cntx_t* cntx ) { - bli_cntx_set_schema_c( schema_c, cntx ); + bli_cntx_set_anti_pref( anti_pref, cntx ); } +#endif void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, dim_t m, dim_t n, dim_t k ) @@ -904,6 +922,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. 
+ if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +// ----------------------------------------------------------------------------- + bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) @@ -953,6 +997,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + // ----------------------------------------------------------------------------- void bli_cntx_print( cntx_t* cntx ) diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 9c97c3312..a76cdd329 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -59,6 +59,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + bool_t anti_pref; + dim_t* thrloop; membrk_t* membrk; @@ -113,26 +115,30 @@ typedef struct cntx_s \ ( (cntx)->method ) -#define bli_cntx_schema_a( cntx ) \ +#define bli_cntx_schema_a_block( cntx ) \ \ - ( (cntx)->schema_a ) + ( (cntx)->schema_a_block ) -#define bli_cntx_schema_b( cntx ) \ +#define bli_cntx_schema_b_panel( cntx ) \ \ - ( (cntx)->schema_b ) + ( (cntx)->schema_b_panel ) -#define bli_cntx_schema_c( cntx ) \ +#define bli_cntx_schema_c_panel( cntx ) \ \ - ( (cntx)->schema_c ) + ( (cntx)->schema_c_panel ) -#define bli_cntx_membrk( cntx ) \ +#define bli_cntx_anti_pref( cntx ) \ \ - ( (cntx)->membrk ) + ( (cntx)->anti_pref ) #define bli_cntx_thrloop( cntx ) \ \ ( (cntx)->thrloop ) +#define 
bli_cntx_membrk( cntx ) \ +\ + ( (cntx)->membrk ) + #if 1 #define bli_cntx_jc_way( cntx ) \ \ @@ -211,24 +217,24 @@ typedef struct cntx_s (cntx_p)->method = _method; \ } -#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \ +#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \ { \ - (cntx_p)->schema_a = _schema_a; \ + (cntx_p)->schema_a_block = _schema_a_block; \ } -#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \ +#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \ { \ - (cntx_p)->schema_b = _schema_b; \ + (cntx_p)->schema_b_panel = _schema_b_panel; \ } -#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \ +#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \ { \ - (cntx_p)->schema_c = _schema_c; \ + (cntx_p)->schema_c_panel = _schema_c_panel; \ } -#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \ { \ - (cntx_p)->membrk = _membrk; \ + (cntx_p)->anti_pref = _anti_pref; \ } #define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \ @@ -241,6 +247,11 @@ typedef struct cntx_s (cntx_p)->thrloop[ BLIS_KR ] = 1; \ } +#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +{ \ + (cntx_p)->membrk = _membrk; \ +} + // cntx_t query (complex) #define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \ @@ -323,13 +334,17 @@ typedef struct cntx_s \ bli_cntx_method( cntx ) -#define bli_cntx_get_pack_schema_a( cntx ) \ +#define bli_cntx_get_pack_schema_a_block( cntx ) \ \ - bli_cntx_schema_a( cntx ) + bli_cntx_schema_a_block( cntx ) -#define bli_cntx_get_pack_schema_b( cntx ) \ +#define bli_cntx_get_pack_schema_b_panel( cntx ) \ \ - bli_cntx_schema_b( cntx ) + bli_cntx_schema_b_panel( cntx ) + +#define bli_cntx_get_pack_schema_c_panel( cntx ) \ +\ + bli_cntx_schema_c_panel( cntx ) #define bli_cntx_get_membrk( cntx ) \ \ @@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); // l1vkr_t ker_id, // cntx_t* cntx ); //ind_t bli_cntx_get_ind_method( cntx_t* cntx ); -//pack_t 
bli_cntx_get_pack_schema_a( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ); +//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ); dim_t bli_cntx_get_num_threads( cntx_t* cntx ); dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ); @@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func, cntx_t* cntx ); void bli_cntx_set_ind_method( ind_t method, cntx_t* cntx ); -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_c( pack_t schema_c, - cntx_t* cntx ); +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ); +//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, +// cntx_t* cntx ); void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, @@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); @@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, 
l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); // print function diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 32f99a832..2ada1556e 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -606,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr, mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ]; bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref ); + + // Explicitly set the anti-preference to FALSE. + bli_cntx_set_anti_pref( FALSE, cntx ); } diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 0d5992900..a7a69243e 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -877,6 +877,12 @@ bli_obj_width_stored( obj ) (obj).n_panel = n0; \ } +#define bli_obj_set_panel_dims( m0, n0, obj ) \ +{ \ + bli_obj_set_panel_length( m0, obj ); \ + bli_obj_set_panel_width( n0, obj ); \ +} + #define bli_obj_set_panel_dim( panel_dim, obj ) \ { \ (obj).pd = panel_dim; \ @@ -985,6 +991,7 @@ bli_obj_width_stored( obj ) #define bli_obj_induce_trans( obj ) \ { \ { \ + /* Induce transposition among basic fields. */ \ dim_t m_ = bli_obj_length( obj ); \ dim_t n_ = bli_obj_width( obj ); \ inc_t rs_ = bli_obj_row_stride( obj ); \ @@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj ) \ if ( bli_obj_is_upper_or_lower( obj ) ) \ bli_obj_toggle_uplo( obj ); \ +\ + /* Induce transposition among packed fields. */ \ + dim_t m_padded_ = bli_obj_padded_length( obj ); \ + dim_t n_padded_ = bli_obj_padded_width( obj ); \ + dim_t m_panel_ = bli_obj_panel_length( obj ); \ + dim_t n_panel_ = bli_obj_panel_width( obj ); \ +\ + bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \ + bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \ \ /* Note that this macro DOES NOT touch the transposition bit! 
If the calling code is using this macro to handle an object whose diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c4cfd3514..1a120d5da 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -975,9 +975,11 @@ typedef struct cntx_s opid_t family; ind_t method; - pack_t schema_a; - pack_t schema_b; - pack_t schema_c; + pack_t schema_a_block; + pack_t schema_b_panel; + pack_t schema_c_panel; + + bool_t anti_pref; dim_t thrloop[ BLIS_NUM_LOOPS ]; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index ce40bb105..5b7a70c3c 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -151,9 +151,8 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -200,9 +199,8 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -249,9 +247,8 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - BLIS_PACKED_COL_PANELS_3MS, - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MS, cntx ); } void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -259,15 +256,15 @@ void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. if ( stage == 0 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); } } @@ -311,9 +308,8 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -321,18 +317,18 @@ void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. 
if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RPI, - BLIS_PACKED_COL_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx ); } } @@ -376,9 +372,8 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -425,9 +420,8 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -474,9 +468,8 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -484,23 +477,23 @@ void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 3 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } } @@ -511,6 +504,22 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) +{ + // Default to context for block-panel algorithm. 
+ bli_gemm1mbp_cntx_init( dt, cntx ); +} + +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, FALSE, cntx ); +} + +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, TRUE, cntx ); +} + +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ) { const ind_t method = BLIS_1M; @@ -529,8 +538,24 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) // Initialize the context with packm-related kernels. bli_packm_cntx_init( dt, cntx ); + // Initialize the blocksizes according to the micro-kernel preference as + // well as the algorithm. if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) { + // This branch is used for algorithms 1m_c_bp, 1m_r_pb. + + // Set the pack_t schemas for the c_bp or r_pb algorithms. + if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. bli_gks_cntx_set_blkszs @@ -544,14 +569,23 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) BLIS_KR, BLIS_KR, 1.0, 1.0, cntx ); - - // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { + // This branch is used for algorithms 1m_r_bp, 1m_c_pb. + + // Set the pack_t schemas for the r_bp or c_pb algorithms. 
+ if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. bli_gks_cntx_set_blkszs @@ -565,12 +599,15 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) BLIS_KR, BLIS_KR, 1.0, 1.0, cntx ); - - // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); } + + // Set the anti-preference field to TRUE when executing a panel-block + // algorithm, and FALSE otherwise. This will cause higher-level generic + // code to establish (if needed) disagreement between the storage of C and + // the micro-kernel output preference so that the two will come back into + // agreement in the panel-block macro-kernel (which implemented in terms + // of the block-panel macro-kernel with some induced transpositions). 
+ bli_cntx_set_anti_pref( is_pb, cntx ); } void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_gemmind_cntx.h b/frame/ind/cntx/bli_gemmind_cntx.h index f49744c3f..ea47968b1 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.h +++ b/frame/ind/cntx/bli_gemmind_cntx.h @@ -65,6 +65,9 @@ void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4m1_cntx_finalize( cntx_t* cntx ); void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ); void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm1m_cntx_finalize( cntx_t* cntx ); diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 4cb0bf6ba..a13d0d05a 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -74,9 +74,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI, + BLIS_PACKED_COL_PANELS_3MI, + cntx ); } void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) @@ -123,9 +123,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI, + BLIS_PACKED_COL_PANELS_4MI, + cntx ); } void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) @@ -174,9 +174,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1R, + cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { @@ -195,9 +195,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1E, + cntx ); } } diff --git a/frame/ind/oapi/bli_l3_1mbppb_oapi.c b/frame/ind/oapi/bli_l3_1mbppb_oapi.c new file mode 100644 index 000000000..e91f27ea2 --- /dev/null +++ b/frame/ind/oapi/bli_l3_1mbppb_oapi.c @@ -0,0 +1,85 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// -- gemmbp/gemmpb ------------------------------------------------------------ + +#undef GENFRONT +#define GENFRONT( opname, imeth, alg ) \ +\ +void PASTEMAC2(opname,imeth,alg) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *c ); \ + cntx_t cntx; \ + cntl_t* cntl_p; \ +\ + /* If the objects are in the real domain, execute the native + implementation. */ \ + if ( bli_obj_is_real( *c ) ) \ + { \ + PASTEMAC(opname,nat)( alpha, a, b, beta, c, NULL ); \ + return; \ + } \ +\ + /* Initialize a local 1m context for the current algorithm (bp or pb). */ \ + PASTEMAC3(opname,imeth,alg,_cntx_init)( dt, &cntx ); \ +\ + /* Create a control tree for the current algorithm (bp or pb). */ \ + cntl_p = PASTEMAC2(opname,alg,_cntl_create)( BLIS_GEMM ); \ +\ + /* Invoke the operation's front end using the context and control + tree we just created. */ \ + PASTEMAC(opname,_front)( alpha, a, b, beta, c, &cntx, cntl_p ); \ +\ + /* Free the control tree. Since the implementation will only make + copies of it (and not use it directly) we do not need to supply + a thread object. */ \ + bli_cntl_free( cntl_p, NULL ); \ +\ + /* Finalize the local context. 
*/ \ + PASTEMAC2(opname,imeth,_cntx_finalize)( &cntx ); \ +} + +// gemm +GENFRONT( gemm, 1m, bp ) +GENFRONT( gemm, 1m, pb ) + diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index cb966d71c..36281f543 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -62,6 +62,14 @@ void PASTEMAC(opname,imeth) \ PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \ return; \ } \ +\ + /* A temporary hack to easily specify the 1m algorithm (block-panel or + panel-block). */ \ + if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \ + { \ + bli_gemm1mbp( alpha, a, b, beta, c ); \ + return; \ + } \ \ /* Initialize a local context if the one provided is NULL. */ \ bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 7f8ae194c..f5907d414 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -80,3 +80,17 @@ GENPROT_NO2OP( 3m2 ) GENPROT_NO2OP( 4mh ) GENPROT_NO2OP( 4mb ) + +// +// Generate object-based prototypes for 1m methods that specify an algorithm +// (e.g., block-panel or panel-block). 
+// + +#undef GENPROT +#define GENPROT( imeth, alg ) \ +\ +void PASTEMAC2(gemm,imeth,alg) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); \ + +GENPROT( 1m, bp ) +GENPROT( 1m, pb ) + diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c index f686aa7ac..ff23a36f4 100644 --- a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const bool_t row_pref = !col_pref; \ + /*const bool_t row_pref = !col_pref;*/ \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -77,10 +77,8 @@ void PASTEMAC(ch,varname) \ ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ - const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ -\ - ctype_r beta_use; \ + ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ + ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ \ ctype_r* c_use; \ inc_t rs_c_use; \ @@ -96,75 +94,71 @@ void PASTEMAC(ch,varname) \ if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ +\ + /* Sanity check: These should never occur because storage/preference + agreement is handled at a higher level. 
*/ \ + /* + if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ + */ \ +\ \ /* If beta has a non-zero imaginary component OR if c is stored with - general stride OR if for some reason the storage of c is not the - preferred storage of the micro-kernel, then we compute the - alpha*a*b product into temporary storage and then accumulate that - result into c afterwards. */ \ - if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ - else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should never occur + (though we could handle them if they did occur). */ \ + if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ + /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ \ if ( using_ct ) \ { \ + /* In the atypical cases, we compute the result into temporary + workspace ct and then accumulated it back to c at the end. */ \ +\ /* Set the strides of ct based on the preference of the underlying native real domain gemm micro-kernel. Note that we set the ct strides in units of complex elements. */ \ if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ else { rs_ct = nr; cs_ct = 1; } \ \ - beta_use = *zero_r; \ c_use = ( ctype_r* )ct; \ rs_c_use = rs_ct; \ cs_c_use = cs_ct; \ - } \ - else \ - { \ - /* In a typical case, we use the real part of beta and accumulate - directly into the output matrix c. 
*/ \ - beta_use = beta_r; \ - c_use = ( ctype_r* )c; \ - rs_c_use = rs_c; \ - cs_c_use = cs_c; \ - } \ \ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ \ - /* Convert the strides from being in units of complex elements to - be in units of real elements. Note that we don't need to check for - general storage here because that case corresponds to the scenario - where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ - if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ - else rs_c_use *= 2; \ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ \ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + zero_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ \ - /* The following gemm micro-kernel call implements the 1m method, - which induces a complex matrix multiplication by calling the - real matrix micro-kernel on micro-panels that have been packed - according to the 1e and 1r formats. */ \ -\ - /* c = beta * c + alpha_r * a * b; */ \ - rgemm_ukr \ - ( \ - k2, \ - alpha_r, \ - a_r, \ - b_r, \ - &beta_use, \ - c_use, rs_c_use, cs_c_use, \ - data, \ - cntx \ - ); \ -\ -\ - /* If necessary, accumulate the final result in ct back to c. */ \ - if ( using_ct ) \ - { \ dim_t i, j; \ \ + /* Accumulate the final result in ct back to c. 
*/ \ for ( j = 0; j < nr; ++j ) \ for ( i = 0; i < mr; ++i ) \ { \ @@ -173,6 +167,40 @@ void PASTEMAC(ch,varname) \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ + else \ + { \ + /* In the typical cases, we use the real part of beta and + accumulate directly into the output matrix c. */ \ +\ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + beta_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev new file mode 100644 index 000000000..3760bdd7c --- /dev/null +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev @@ -0,0 +1,188 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + /*const bool_t row_pref = !col_pref;*/ \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r beta_use; \ +\ + ctype_r* c_use; \ + inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* Sanity check: These should never occur because storage/preference + agreement is handled at a higher level. 
*/ \ + /* + if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ + */ \ +\ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should never occur + (though we could handle them if they did occur). */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ + /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements. */ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + beta_use = *zero_r; \ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ + else \ + { \ + /* In a typical case, we use the real part of beta and accumulate + directly into the output matrix c. */ \ + beta_use = beta_r; \ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ + } \ +\ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. 
*/ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + &beta_use, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ +\ + /* If necessary, accumulate the final result in ct back to c. */ \ + if ( using_ct ) \ + { \ + dim_t i, j; \ +\ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) + diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c index 7d746304c..c4ec44b54 100644 --- a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c @@ -78,7 +78,7 @@ void PASTEMAC(ch,varname) \ \ const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ const dim_t k2 = 2 * k; \ \ diff --git a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c index 92da659ca..ab5617795 100644 --- a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c @@ -67,7 +67,7 @@ void PASTEMAC(ch,varname) \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ @@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; 
\ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ From 69b4846ae9adb157c4171b52e159684db2867853 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 21 Feb 2017 15:33:39 -0600 Subject: [PATCH 04/64] Disabled experiment-related 1m code. Details: - Commented out code in frame/ind/oapi/bli_l3_3m4m1m_oapi.c that was specifically inserted to facilitate the benchmarking of 1m block-panel and panel-block algorithms. - Updates to test/3m4m/Makefile, runme.sh script, and test_gemm.c to reflect changes used/needed during benchmarking. --- frame/ind/oapi/bli_l3_3m4m1m_oapi.c | 7 +++++ test/3m4m/Makefile | 43 +++++++++++++++++++------ test/3m4m/runme.sh | 49 ++++++++++++++++++++++------- test/3m4m/test_gemm.c | 18 +++++++++-- 4 files changed, 94 insertions(+), 23 deletions(-) diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 36281f543..b99ebda39 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -65,11 +65,18 @@ void PASTEMAC(opname,imeth) \ \ /* A temporary hack to easily specify the 1m algorithm (block-panel or panel-block). */ \ +/* if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \ { \ bli_gemm1mbp( alpha, a, b, beta, c ); \ return; \ } \ + else if ( PASTEMAC(opname,imeth) == bli_gemm3m1 ) \ + { \ + bli_gemm1mpb( alpha, a, b, beta, c ); \ + return; \ + } \ +*/ \ \ /* Initialize a local context if the one provided is NULL. */ \ bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 0b13b8eb1..433e745a7 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -107,8 +107,9 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a # BLAS library path(s). This is where the BLAS libraries reside. 
HOME_LIB_PATH := $(HOME)/flame/lib #MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64 -MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 -ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 +#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +MKL_LIB_PATH := ${MKLROOT}/lib/intel64 +#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 ACML_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_int64/lib ACMLP_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_mp_int64/lib @@ -168,7 +169,7 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) #-I$(ACML_INC_PATH) LINKER := $(CC) LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64 -LDFLAGS += -lgfortran -lm -lpthread -fopenmp +LDFLAGS += -lgfortran -lm -lrt -lpthread -fopenmp # Datatype @@ -211,13 +212,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=100 \ - -DP_END=1000 \ - -DP_INC=100 - -PDEF_MT := -DP_BEGIN=100 \ +PDEF_ST := -DP_BEGIN=40 \ -DP_END=2000 \ - -DP_INC=100 + -DP_INC=40 + +PDEF_MT := -DP_BEGIN=200 \ + -DP_END=10000 \ + -DP_INC=200 @@ -296,6 +297,8 @@ openblas-gemm-st: \ test_zgemm_openblas_st.x openblas-gemm-mt: \ + test_sgemm_openblas_mt.x \ + test_dgemm_openblas_mt.x \ test_cgemm_openblas_mt.x \ test_zgemm_openblas_mt.x @@ -306,6 +309,8 @@ mkl-gemm-st: \ test_zgemm_mkl_st.x mkl-gemm-mt: \ + test_sgemm_mkl_mt.x \ + test_dgemm_mkl_mt.x \ test_cgemm_mkl_mt.x \ test_zgemm_mkl_mt.x @@ -316,6 +321,8 @@ acml-gemm-st: \ test_zgemm_acml_st.x acml-gemm-mt: \ + test_sgemm_acml_mt.x \ + test_dgemm_acml_mt.x \ test_cgemm_acml_mt.x \ test_zgemm_acml_mt.x @@ -468,6 +475,12 @@ test_z%_openblas_st.o: test_%.c test_c%_openblas_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@ +test_d%_openblas_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + +test_s%_openblas_mt.o: test_%.c + $(CC) 
$(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + test_z%_openblas_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ @@ -487,6 +500,12 @@ test_z%_mkl_st.o: test_%.c test_c%_mkl_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@ +test_d%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ + +test_s%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ + test_z%_mkl_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ @@ -506,6 +525,12 @@ test_z%_acml_st.o: test_%.c test_c%_acml_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ +test_d%_acml_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + +test_s%_acml_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + test_z%_acml_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ diff --git a/test/3m4m/runme.sh b/test/3m4m/runme.sh index 794f0ba00..3f5d89023 100755 --- a/test/3m4m/runme.sh +++ b/test/3m4m/runme.sh @@ -4,17 +4,21 @@ exec_root="test" out_root="output" -sys="blis" +#sys="blis" #sys="stampede" +sys="lonestar" #sys="wahlberg" # Bind threads to processors. 
#export OMP_PROC_BIND=true #export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15" #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" -export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" #export GOMP_CPU_AFFINITY="0 2 4 6 1 3 5 7" #export GOMP_CPU_AFFINITY="0 4 1 5 2 6 3 7" +#export GOMP_CPU_AFFINITY="0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29 32 33 36 37 40 41 44 45" +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" # Modify LD_LIBRARY_PATH. if [ ${sys} = "blis" ]; then @@ -26,6 +30,11 @@ elif [ ${sys} = "stampede" ]; then # A hack to use libiomp5 with gcc. export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64" +elif [ ${sys} = "lonestar" ]; then + + # A hack to use libiomp5 with gcc. + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" + elif [ ${sys} = "wahlberg" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HOME/flame/lib/acml/5.3.1/gfortran64_int64/lib" @@ -49,6 +58,14 @@ elif [ ${sys} = "stampede" ]; then ir_nt=1 # 1st loop nt=16 +elif [ ${sys} = "lonestar" ]; then + + jc_nt=2 # 5th loop + ic_nt=12 # 3rd loop + jr_nt=1 # 2nd loop + ir_nt=1 # 1st loop + nt=24 + elif [ ${sys} = "wahlberg" ]; then jc_nt=1 # 5th loop @@ -59,8 +76,10 @@ elif [ ${sys} = "wahlberg" ]; then fi # Threadedness to test. -threads="st mt" # st mt" -threads_r="st mt" # mt" +#threads="mt" +#threads_r="mt" +threads="st" +threads_r="st" # Datatypes to test. 
dts="z c" @@ -82,6 +101,14 @@ elif [ ${sys} = "stampede" ]; then test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" #test_impls="openblas mkl asm_blis" +elif [ ${sys} = "lonestar" ]; then + + test_impls="asm_blis 4mhw_blis 4m1a_blis 1m_blis 3m1_blis" + #test_impls="1m_blis 3m1_blis" + #test_impls="4m1a_blis" + #test_impls="mkl" + #test_impls="openblas mkl asm_blis" + elif [ ${sys} = "wahlberg" ]; then test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" @@ -90,7 +117,8 @@ fi # Real domain implementations to test. #test_impls_r="openblas mkl asm_blis" -test_impls_r="openblas asm_blis" +test_impls_r="asm_blis" +#test_impls_r="" # First perform real test cases. for th in ${threads_r}; do @@ -112,10 +140,11 @@ for th in ${threads_r}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. #if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" + # export MKL_NUM_THREADS=${nt} #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -124,7 +153,6 @@ for th in ${threads_r}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. @@ -165,10 +193,10 @@ for th in ${threads}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. 
#if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -177,7 +205,6 @@ for th in ${threads}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index 7b16f584f..1f9ea036c 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -49,6 +49,7 @@ int main( int argc, char** argv ) dim_t p; dim_t p_begin, p_end, p_inc; int m_input, n_input, k_input; + ind_t ind; num_t dt; char dt_ch; int r, n_repeats; @@ -70,6 +71,8 @@ int main( int argc, char** argv ) dt = DT; + ind = IND; + p_begin = P_BEGIN; p_end = P_END; p_inc = P_INC; @@ -78,12 +81,21 @@ int main( int argc, char** argv ) n_input = -1; k_input = -1; -#if 0 + + // Supress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 1 cntx_t cntx; + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + // Initialize a context for the current induced method and datatype. - bli_gemmind_cntx_init( IND, dt, &cntx ); + bli_gemmind_cntx_init( ind_mod, dt, &cntx ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); @@ -157,7 +169,7 @@ int main( int argc, char** argv ) #ifdef BLIS bli_ind_disable_all_dt( dt ); - bli_ind_enable_dt( IND, dt ); + bli_ind_enable_dt( ind, dt ); #endif dtime_save = DBL_MAX; From f484c6cd4389dc7ae5b972849e12e98ad5bbf9a4 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 17 Mar 2017 12:07:27 -0500 Subject: [PATCH 05/64] Whitespace reformatting to armv8a kernels file. Details: - Updated formatting of function signature/header in kernels/armv8a/3/bli_gemm_opt_4x4.c. 
--- kernels/armv8a/3/bli_gemm_opt_4x4.c | 122 +++++++++++++++------------- 1 file changed, 66 insertions(+), 56 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_opt_4x4.c b/kernels/armv8a/3/bli_gemm_opt_4x4.c index 992750b93..479c2b624 100644 --- a/kernels/armv8a/3/bli_gemm_opt_4x4.c +++ b/kernels/armv8a/3/bli_gemm_opt_4x4.c @@ -50,16 +50,17 @@ * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz. */ -void bli_sgemm_opt_8x12( - dim_t k, - float* restrict alpha, - float* restrict a, - float* restrict b, - float* restrict beta, - float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_sgemm_opt_8x12 + ( + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); @@ -1100,16 +1101,17 @@ __asm__ volatile * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz. 
*/ -void bli_dgemm_opt_6x8( - dim_t k, - double* restrict alpha, - double* restrict a, - double* restrict b, - double* restrict beta, - double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_dgemm_opt_6x8 + ( + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); @@ -2070,47 +2072,55 @@ __asm__ volatile } -void bli_cgemm_opt_4x4( - dim_t k, - scomplex* restrict alpha, - scomplex* restrict a, - scomplex* restrict b, - scomplex* restrict beta, - scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_cgemm_opt_4x4 + ( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { /* Just call the reference implementation. */ - BLIS_CGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data, - cntx ); + BLIS_CGEMM_UKERNEL_REF + ( + k, + alpha, + a, + b, + beta, + c, rs_c, cs_c, + data, + cntx + ); } -void bli_zgemm_opt_4x4( - dim_t k, - dcomplex* restrict alpha, - dcomplex* restrict a, - dcomplex* restrict b, - dcomplex* restrict beta, - dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data, - cntx_t* restrict cntx - ) +void bli_zgemm_opt_4x4 + ( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) { /* Just call the reference implementation. 
*/ - BLIS_ZGEMM_UKERNEL_REF( k, - alpha, - a, - b, - beta, - c, rs_c, cs_c, - data, - cntx ); + BLIS_ZGEMM_UKERNEL_REF + ( + k, + alpha, + a, + b, + beta, + c, rs_c, cs_c, + data, + cntx + ); } From 6e7de6ef84babb273dc5528a9b9d01f0febe394b Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 17 Mar 2017 12:10:24 -0500 Subject: [PATCH 06/64] Minor updates to test/3m4m. Details: - Updated initial problem size and increment in Makefile. - Updated code in test_gemm.c to correctly query kc from context. --- test/3m4m/Makefile | 4 ++-- test/3m4m/test_gemm.c | 26 +++++++++++--------------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 9e982032f..7e1fd33bb 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -209,9 +209,9 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=80 \ +PDEF_ST := -DP_BEGIN=40 \ -DP_END=2000 \ - -DP_INC=80 + -DP_INC=40 PDEF_MT := -DP_BEGIN=80 \ -DP_END=4000 \ diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index c8e9ec5d5..c00ca4e25 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -78,24 +78,20 @@ int main( int argc, char** argv ) n_input = -1; k_input = -1; -#if 0 - num_t dt_real = bli_datatype_proj_to_real( DT ); - cntx_t cntx; +#if 1 - bli_gemm_cntx_init( &cntx ); + cntx_t cntx; - // Extract the kc blocksize for the requested datatype and its - // real analogue. - dim_t kc = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); - dim_t kc_real = bli_cntx_get_blksz_def_dt( dt_real, BLIS_KC, &cntx ); + // Initialize a context for the current induced method and datatype. + bli_gemm_cntx_init( &cntx ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); + +#elif 0 + + k_input = 256; - // Assign the k dimension depending on which implementation is - // being tested. 
Note that the BLIS_NAT case handles the real - // domain cases as well as native complex. - if ( IND == BLIS_NAT ) k_input = kc; - else if ( IND == BLIS_3M1 ) k_input = kc_real / 3; - else if ( IND == BLIS_4M1A ) k_input = kc_real / 2; - else k_input = kc_real; #endif // Choose the char corresponding to the requested datatype. From ca3a7924770d6cf203cce4ca9f5482e1d0d4e961 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 12:09:39 -0500 Subject: [PATCH 07/64] README.md update. Details: - Updated bibtex entries for 4th BLIS paper, and adds entries for 5th and 6th BLIS papers. --- README.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9bfa84285..1d7b0ce34 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving ``` A fourth paper, submitted to ACM TOMS, also exists, which proposes an -[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS: +[analytical model](http://dl.acm.org/citation.cfm?id=2925987) +([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf)) +for determining blocksize parameters in BLIS: ``` @article{BLIS4, @@ -277,6 +279,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an } ``` +A fifth paper, submitted to ACM TOMS, begins the study of so-called +[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf): + +``` +@article{BLIS5, + author = {Field G.
{V}an~{Z}ee and Tyler Smith}, + title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods}, + journal = {ACM Transactions on Mathematical Software}, + year = {2017}, + note = {accepted} +} +``` + +A sixth paper, submitted to ACM TOMS, revisits the topic of the previous +article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf)): + +``` +@article{BLIS6, + author = {Field G. {V}an~{Z}ee}, + title = {Implementing high-performance complex matrix multiplication via the 1m method}, + journal = {ACM Transactions on Mathematical Software}, + note = {submitted} +} +``` + + Funding ------- From d5a5e003ea9b24bb6abf12e88862e8eb61ffb03d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 15:48:30 -0500 Subject: [PATCH 08/64] Fixed a trsm1m bug that affected right-side cases. Details: - Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result was nondeterministic behavior (usually segmentation faults) for certain problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c which explicitly directed the virtual gemm micro-kernel to use temporary space if the storage preference of the [real domain] gemm ukernel did not match the storage of the output matrix C. In the context of gemm, this handling is not needed because agreement between the storage pref and the matrix is guaranteed by a high-level optimization in BLIS. However, this optimization is not applied to trsm because the storage of C is not necessarily the same as the storage of the micro-panels of B--both of which are updated by the micro-kernel during a trsm operation. Thus, the guarantee of storage/preference agreement is not in place for trsm, which means we must handle that case within the virtual gemm micro-kernel. - Comment updates and a minor macro change to bli_trsm*_cntx_init() for 3m1, 4m1a, and 1m. 
--- frame/ind/cntx/bli_trsmind_cntx.c | 30 +++++++++++--------- frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c | 21 ++++++-------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index a13d0d05a..96f9add60 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -73,10 +73,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) cntx ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) @@ -122,10 +121,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) cntx ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) @@ -174,9 +172,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); + //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, + // BLIS_PACKED_COL_PANELS_1R, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { @@ -195,9 +195,11 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); + //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, + // BLIS_PACKED_COL_PANELS_1E, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); } } diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c index ff23a36f4..6279ab762 100644 --- a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - /*const bool_t row_pref = !col_pref;*/ \ + const bool_t row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -94,24 +94,19 @@ void PASTEMAC(ch,varname) \ if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ -\ - /* Sanity check: These should never occur because storage/preference - agreement is handled at a higher level. 
*/ \ - /* - if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ - */ \ -\ \ /* If beta has a non-zero imaginary component OR if c is stored with general stride, then we compute the alpha*a*b product into temporary storage and then accumulate that result into c afterwards. Note that the other two cases concerning disagreement between the storage of C - and the output preference of the micro-kernel, should never occur - (though we could handle them if they did occur). */ \ + and the output preference of the micro-kernel, should ONLY occur in + the context of trsm, whereby this virtual micro-kernel is called + directly from the trsm macro-kernel to update the micro-tile b11 + that exists within the packed row-panel of B. Indeed that is the + reason those cases MUST be explicitly handled. */ \ if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ - /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ + else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ From 940a707ac78de975110e17c95765e65b89aa5e10 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 16:38:42 -0500 Subject: [PATCH 09/64] Version file update (0.2.2) --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index 566318cf2..ee1372d33 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.1-82 +0.2.2 From a4f1d0b8801c114e9ef8be39df01e1b8d27ebcb3 Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Tue, 2 May 2017 16:38:43 -0500 Subject: [PATCH 10/64] CHANGELOG update (0.2.2) --- CHANGELOG | 1179 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 1090 insertions(+), 89 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a361ceac3..c9a04cbde 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,18 +1,706 @@ -commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (HEAD -> master, tag: 0.2.1) +commit 940a707ac78de975110e17c95765e65b89aa5e10 (HEAD -> master, tag: 0.2.2) +Author: Field G. Van Zee +Date: Tue May 2 16:38:42 2017 -0500 + + Version file update (0.2.2) + +commit d5a5e003ea9b24bb6abf12e88862e8eb61ffb03d (origin/master, origin/HEAD, origin/1m, 1m) +Author: Field G. Van Zee +Date: Tue May 2 15:48:30 2017 -0500 + + Fixed a trsm1m bug that affected right-side cases. + + Details: + - Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result + was nondeterministic behavior (usually segmentation faults) for certain + problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The + cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c + which explicitly directed the virtual gemm micro-kernel to use temporary + space if the storage preference of the [real domain] gemm ukernel did + not match the storage of the output matrix C. In the context of gemm, + this handling is not needed because agreement between the storage pref + and the matrix is guaranteed by a high-level optimization in BLIS. + However, this optimization is not applied to trsm because the storage + of C is not necessarily the same as the storage of the micro-panels of + B--both of which are updated by the micro-kernel during a trsm + operation. Thus, the guarantee of storage/preference agreement is not + in place for trsm, which means we must handle that case within the + virtual gemm micro-kernel. + - Comment updates and a minor macro change to bli_trsm*_cntx_init() for + 3m1, 4m1a, and 1m. 
+ +commit e80993e71f4d571e9650a8e90ed386e32059eae5 +Merge: a509fbd5 ca3a7924 +Author: Field G. Van Zee +Date: Tue May 2 12:30:28 2017 -0500 + + Merge branch 'master' into 1m + +commit ca3a7924770d6cf203cce4ca9f5482e1d0d4e961 +Author: Field G. Van Zee +Date: Tue May 2 12:09:39 2017 -0500 + + README.md update. + + Details: + - Updated bibtex entries for 4th BLIS paper, and adds entries for 5th + and 6th BLIS papers. + +commit 6e7de6ef84babb273dc5528a9b9d01f0febe394b +Author: Field G. Van Zee +Date: Fri Mar 17 12:10:24 2017 -0500 + + Minor updates to test/3m4m. + + Details: + - Updated initial problem size and increment in Makefile. + - Updated code in test_gemm.c to correctly query kc from context. + +commit f484c6cd4389dc7ae5b972849e12e98ad5bbf9a4 +Author: Field G. Van Zee +Date: Fri Mar 17 12:07:27 2017 -0500 + + Whitespace reformatting to armv8a kernels file. + + Details: + - Updated formatting of function signature/header in + kernels/armv8a/3/bli_gemm_opt_4x4.c. + +commit a509fbd5ac04fafd4e51b43d2f59ca56432dc212 +Merge: 69b4846a 513944e4 +Author: Field G. Van Zee +Date: Tue Feb 21 17:06:16 2017 -0600 + + Merge branch 'master' into 1m + +commit 69b4846ae9adb157c4171b52e159684db2867853 +Author: Field G. Van Zee +Date: Tue Feb 21 15:33:39 2017 -0600 + + Disabled experiment-related 1m code. + + Details: + - Commented out code in frame/ind/oapi/bli_l3_3m4m1m_oapi.c that was + specifically inserted to facilitate the benchmarking of 1m block-panel + and panel-block algorithms. + - Updates to test/3m4m/Makefile, runme.sh script, and test_gemm.c to + reflect changes used/needed during benchmarking. + +commit 513944e4a951d8823b4de161b86ad7a965b4d99b +Merge: 8b462a0e 0e18f68c +Author: Devin Matthews +Date: Mon Feb 20 10:04:33 2017 -0500 + + Merge pull request #118 from devinamatthews/master + + Handle k=0 correctly in KNL dgemm ukernel. 
+ +commit 0e18f68cf12eb9189ba901a20040b1cdae417670 +Author: Devin Matthews +Date: Mon Feb 20 09:03:21 2017 -0600 + + Handle k=0 correctly in KNL dgemm ukernel. + +commit 8b462a0e8c3e9252f0401940849e53cc772256fa +Merge: c362afc5 7d42fc07 +Author: Devin Matthews +Date: Sun Feb 19 23:03:03 2017 -0500 + + Merge pull request #117 from devinamatthews/master + + Cast dim_t and inc_t parameters to 64-bit in KNL microkernels. + +commit 7d42fc0796ef0c010375fd8e59b1240ba41ce4d2 +Author: Devin Matthews +Date: Sun Feb 19 21:10:55 2017 -0500 + + Cast dim_t and inc_t parameters to 64-bit in KNL microkernels. + +commit c362afc525bab4050581d1b0fcea2fe4d582c608 +Author: Field G. Van Zee +Date: Thu Feb 9 11:54:59 2017 -0600 + + Added missing "level-0" BLAS [sd]cabs1_(). + + Details: + - Fixed issue #115 by adding implementations for scabs1_() and dcabs1_() + to the BLAS compatibility layer. Thanks to heroxbd for pointing out + their absence. + +commit 018180c938c32efbeaaf626ba71ec5b780664db1 +Author: Field G. Van Zee +Date: Wed Feb 8 11:20:52 2017 -0600 + + Fixed a minor bug in configure (issue #114). + + Details: + - Fixed a bug in the configure script whereby a non-preferred value for + --enable-threading would cause problems in common.mk vis-a-vis detecting + which threading model was chosen. Thanks to heroxbd for reporting this + issue. + +commit ddf45e71770c55ea4a58ca24ea4913fe5d8beb9b +Merge: a6ab91bc 78e1b16e +Author: Devin Matthews +Date: Fri Jan 27 14:25:40 2017 -0600 + + Merge pull request #113 from devinamatthews/knl_thread_params + + Change default threading parameters for KNL. + +commit 78e1b16e16d589ed31b2e712115ee282097f114d +Author: Devin Matthews +Date: Fri Jan 27 14:22:20 2017 -0600 + + Change default threading parameters for KNL. + +commit 1c732d3ddc4ac0861d3b0e0dd15eb7e071615502 +Author: Field G. Van Zee +Date: Wed Jan 25 16:25:46 2017 -0600 + + Added 1m-specific APIs for bp, pb gemm algorithms. 
+ + Details: + - Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the + body of bli_gemm_cntl_create() replaced with a call to the former. + - Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). Now, + bli_cntl_free() can check if the thread parameter is NULL, and if so, + call the latter, and otherwise call the former. + - Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in + terms of bli_gemm1mxx_cntx_init(), which behaves the same as + bli_gemm1m_cntx_init() did before, except that an extra bool parameter + (is_pb) is used to support both bp and pb algorithms (including to + support the anti-preference field described below). + - Added support for "anti-preference" in context. The anti_pref field, + when true, will toggle the boolean return value of routines such as + bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of + causing BLIS to transpose the operation to achieve disagreement (rather + than agreement) between the storage of C and the micro-kernel output + preference. This disagreement is needed for panel-block implementations, + since they induce a transposition of the suboperation immediately before + the macro-kernel is called, which changes the apparent storage of C. For + now, anti-preference is used only with the pb algorithm for 1m (and not + with any other non-1m implementation). + - Defined new functions, + bli_cntx_l3_ukr_eff_prefers_storage_of() + bli_cntx_l3_ukr_eff_dislikes_storage_of() + bli_cntx_l3_nat_ukr_eff_prefers_storage_of() + bli_cntx_l3_nat_ukr_eff_dislikes_storage_of() + which are identical to their non-"eff" (effectively) counterparts except + that they take the anti-preference field of the context into account. + - Explicitly initialize the anti-pref field to FALSE in + bli_gks_cntx_set_l3_nat_ukr_prefs(). + - Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel + in terms of the existing block-panel macro-kernel _ker_var2(). 
This + technique requires inducing transposes on all operands and swapping + the A and B. + - Changed bli_obj_induce_trans() macro so that pack-related fields are + also changed to reflect the induced transposition. + - Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily + specify the 1m algorithm (block-panel or panel-block). + - Renamed the following cntx_t-related macros: + bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block() + bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel() + bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel() + and updated all instantiations. Also updated the field names in the + cntx_t struct. + - Comment updates. + +commit a6ab91bc61432490fadf18d596de4589645f37dd +Merge: 145a551d 7f31a630 +Author: Field G. Van Zee +Date: Wed Nov 30 09:26:58 2016 -0600 + + Merge pull request #111 from figual/master + + Fixed missing cntx argument in ARMv8 microkernels. + +commit 7f31a6307b7bd35f913c895947552c3a176f789b +Author: Francisco Igual +Date: Sun Nov 27 14:40:47 2016 +0100 + + Fixed missing cntx argument in ARMv8 microkernels. + +commit 126482a3b609b9ad7026ba348f6c4bf6a29be8a1 +Author: Field G. Van Zee +Date: Fri Nov 25 18:29:49 2016 -0600 + + Implemented the 1m method. + + Details: + - Implemented the 1m method for inducing complex domain matrix + multiplication. 1m support has been added to all level-3 operations, + including trsm, and is now the default induced method when native + complex domain gemm microkernels are omitted from the configuration. + - Updated _cntx_init() operations to take a datatype parameter. 
This was + needed for the corresponding function for 1m (because 1m requires us + to choose between column-oriented or row-oriented execution, which + requires us to query the context for the storage preference of the + gemm microkernel, which requires knowing the datatype) but I decided + that it made sense for consistency to add the parameter to all other + cntx initialization functions as well, even though those functions + don't use the parameter. + - Updated bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs() to take + a second scalar for each blocksize entry. The semantic meaning of the + two scalars now is that the first will scale the default blocksize + while the second will scale the maximum blocksize. This allows scaling + the two independently, and was needed to support 1m, which requires + scaling for a register blocksize but not the register storage + blocksize (ie: "packdim") analogue. + - Deprecated bli_blksz_reduce_dt_to() and defined two new functions, + bli_blksz_reduce_def_to() and bli_blksz_reduce_max_to(), for reducing + default and maximum blocksizes to some desired blocksize multiple. + These functions are needed in the updated definitions of + bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs(). + - Added support for the 1e and 1r packing schemas to packm, including + 1e/1r packing kernels. + - Added a minor optimization to bli_gemm_ker_var2() that allows, under + certain circumstances (specifically, real domain beta and row- or + column-stored matrix C), the real domain macrokernel and microkernel + to be called directly, rather than using the virtual microkernel + via the complex domain macrokernel, which carries a slight additional + amount of overhead. + - Added 1m support to the testsuite. + - Added 1m support to Makefile and runme.sh in test/3m4m. Also simplified + some code in test_gemm.c driver. + +commit 145a551d524ae5492667a05fc248923d922df850 +Author: Field G. 
Van Zee +Date: Wed Nov 23 17:59:06 2016 -0600 + + Switched to simpler trsm_r implementation. + + Details: + - Disabled the implementation of trsm_r that allows the right-hand matrix + B to be triangular, and switched to the implementation that simply + transposes the operation (and thus the storage of C) in order to recast + the operation as trsm_l. This avoids the need to use trsm_rl and trsm_ru + macrokernels, which require an awkward swapping of MR and NR. For now, + the support for trsm_r macrokernels, via separate control trees, remains. + - Modified bli_config_macro_defs.h so that BLIS_RELAX_MCNR_NCMR_CONSTRAINTS + is defined by default. This is mostly a safety precaution in case someone + tries to switch back to the previous trsm_r implementation, but also + serves as a convenience on some systems where one does not naturally + choose blocksizes in a way that satisfies MC % NR = 0 and NC % MR = 0. + +commit b3e58ee30307cf1e11529f2113acb9abbeda25af +Author: Field G. Van Zee +Date: Wed Nov 23 17:58:26 2016 -0600 + + Reimplemented 4x12 haswell ukernels (real only). + + Details: + - Replaced permutation-based implementations in bli_gemm_asm_d4x12.c, which + defines 4x24 single real and 4x12 double real gemm microkernels, with + broadcast-based implementations. (The previous microkernel file has been + moved to an 'old' subdirectory.) + +commit bdc0a264d2fb5940bfd09298b1de823674a39053 +Author: Field G. Van Zee +Date: Wed Nov 16 14:13:08 2016 -0600 + + Adjusted stride selection of ct in macrokernels. + + Details: + - Updated the changes introduced in 618f433 so that the strides of the + temporary microtile ct used in the macrokernels are determined based + on the storage preference of the microkernel (via the new functions + below), rather than the strides of c. In almost all cases, presently, + this change results in no net effect, as a high-level optimization + in the _front() functions aligns the storage of c to that of the + microkernel's preference.
However, I encountered some cases where + this is not always the case in some development code that has yet + to be committed, and therefore I'm generalizing the framework code + in advance. + - Defined two new functions in bli_cntx.c: + bli_cntx_l3_ukr_prefers_rows_dt() + bli_cntx_l3_ukr_prefers_cols_dt() + which return bool_t's based on the current micro-kernel's storage + preferences. For induced methods, the preference of the underlying + real domain microkernel is returned. + - Updated definition of bli_cntx_l3_ukr_dislikes_storage_of(), and + by proxy bli_cntx_l3_ukr_prefers_storage_of(), to be in terms of + the above functions, rather than querying the preferences of the + native microkernel directly (which did the wrong thing for induced + methods). + +commit 031978d2647cf08316858baf29c84ebba9c3133e +Author: Field G. Van Zee +Date: Wed Nov 16 14:04:33 2016 -0600 + + Fixed inactive trsm_r blocksize constraint code. + + Details: + - Changed a cpp macro that was meant to prevent using certain trsm_r code + if BLIS_RELAX_MCNR_NCMR_CONSTRAINTS was defined. It was actually coded + incorrectly at first. I've now fixed its location and changed its + consequence to a compile-time #error message. + +commit 6b5a4032d2e3ed29a272c7f738b7e3ed6657e556 +Merge: 3b524a08 a8220e3a +Author: Field G. Van Zee +Date: Thu Nov 10 15:28:24 2016 -0600 + + Merge pull request #109 from devinamatthews/omp_num_threads + + Add automatic loop thread assignment. + +commit a8220e3a86433b5d76789e32ea7ca014a11b6d17 +Author: Devin Matthews +Date: Thu Nov 10 14:19:34 2016 -0600 + + - Fix typo in bli_cntx.c + - Bump BLIS_DEFAULT_NR_THREAD_MAX to 4 + +commit c05b3862f6241486442b313eff0c8bee7b5e1274 +Author: Devin Matthews +Date: Fri Nov 4 15:48:02 2016 -0500 + + Add automatic loop thread assignment. + + - Number of threads is determined by BLIS_NUM_THREADS or OMP_NUM_THREADS, but can be overridden by BLIS_XX_NT as before. 
+ - Threads are assigned to loops (ic, jc, ir, and jr) automatically by weighted partitioning and heuristics, both of which are tunable via bli_kernel.h. + - All level-3 BLAS covered. + +commit 3b524a08e3fb8380e7b8b2ba835312c51a331570 +Author: Field G. Van Zee +Date: Wed Nov 2 17:45:18 2016 -0500 + + Consolidated 3m1/4m1 gemmtrsm, trsm ukernel code. + + Details: + - Consolidated the macros that define the lower and upper versions of the + gemmtrsm microkernels into a single macro that is instantiated twice. + Did this for both 3m1 and 4m1 microkernels. + - Consolidated lower and upper versions of the trsm microkernels for 3m1 + and 4m1 into single files (each). + +commit ead231aca635deb3db270f118454e4222c627f31 +Merge: d25e6f8b 62987f60 +Author: Field G. Van Zee +Date: Wed Nov 2 13:03:50 2016 -0500 + + Merge pull request #108 from devinamatthews/patch-2 + + Update .travis.yml with additional tests + +commit 62987f60a6a6ff0a75b31d0404f493593ce35ccc +Author: Devin Matthews +Date: Wed Nov 2 11:20:37 2016 -0500 + + Allow KNL to fail + +commit 8f9010542c751ae3cbfe6121cb011d8985c1e00d +Author: Devin Matthews +Date: Wed Nov 2 11:18:32 2016 -0500 + + Fix some problems with OSX builds: + + - Update CPU detection for Intel archs (esp. Skylake) + - Allow clang for the reference config + +commit d25e6f8b63c57f30b8a67dffbf4995977cf9f235 +Author: Field G. Van Zee +Date: Tue Nov 1 14:35:15 2016 -0500 + + Can disable trsm_r-specific blocksize constraints. + + Details: + - Added cpp guards around the constraints in bli_kernel_macro_defs.h + that enforce MC % NR = 0 and NC % MR = 0. These constraints are ONLY + needed when handling right-side trsm by allowing the matrix on the + right (matrix B) to be triangular, because it involves swapping + register, but not cache, blocksizes (packing A by NR and B by MR) + and then swapping the operands to gemmtrsm just before that kernel + is called.
It may be useful to disable these constraints if, for + example, the developer wishes to test the configuration with + a different set of cache blocksizes where only MC % MR = 0 and + NC % NR = 0 are enforced. + - In summary, #defining BLIS_RELAX_MCNR_NCMR_CONSTRAINTS will bypass + the enforcement of MC % NR = 0 and NC % MR = 0. + +commit 1a67e3688edb073a9d44c160e7b0798e08796b8a +Author: Devin Matthews +Date: Tue Nov 1 13:53:18 2016 -0500 + + Bogus commit + + Need to trigger another Travis build. + +commit 2cd82d67b372cad1bed50cfd99e524f1f40b4e24 +Author: Devin Matthews +Date: Tue Nov 1 13:25:50 2016 -0500 + + Some fixes for .travis.yml + + - Switch to gcc-5 to support knl + - Don't run tests in parallel -- it is super slow. + - Use clang on OSX since gcc is only a zombie husk. + +commit a3db4e6bdfe745083acf704ab0f51f74ea869538 +Author: Devin Matthews +Date: Tue Nov 1 10:33:18 2016 -0500 + + Update .travis.yml with additional tests + + - Test knl configuration (without running of course). + - Test openmp and pthreads threading for auto configuration with 4 threads. + - Test auto configuration with and without pthreads on OSX. + - Also, run make in parallel. + + I don't know how the `addons:` section works on OSX; hopefully it is just ignored. + +commit 8a11a2174a1a5b9426f13bbc5338dc86ab138cdd +Author: Field G. Van Zee +Date: Mon Oct 31 19:07:55 2016 -0500 + + Updates to non-default haswell microkernels. + + Details: + - Updated s and d microkernels in bli_gemm_asm_d8x6.c to relax alignment + constraints. + - Added missing c and z microkernels, which are based on the corresponding + kernels in the d6x8 set. + - This completes the d8x6 set (which may be used for situations when it + is desirable to have a microkernel with a column preference). + +commit 618f4331eba209803ecab99747872eceb1b5f091 +Author: Field G. Van Zee +Date: Mon Oct 31 14:40:51 2016 -0500 + + Align strides of ct in macrokernels to that of c. 
+ + Details: + - Previously, rs_ct and cs_ct, the strides of the temporary microtile used + primarily in the macrokernels' edge case handling, were unconditionally + set to 1 and MR, respectively. However, Devin Matthews noted that this + ought to be changed so that the strides of ct were in agreement with the + strides of C. (That is, if C was row-stored, then ct should be accessed + as by rows as well.) The implicit assumption is that the strides of C + have already been adjusted, via induced transposition, if the storage + preference of the microkernel is at odds with the storage of C. So, if + the microkernel prefers row storage, the macrokernel's interior cases + would present row-stored (ideal) microkernel subproblems to the + microkernel, but for edge cases, it would still see column-stored + subproblems (not ideal). This commit fixes this issue. Thanks to Devin + for his suggestion. + +commit 630391002325a589063aec2ab0a7d89ef2e178c0 +Merge: 956b3edf 216206c1 +Author: Field G. Van Zee +Date: Tue Oct 25 19:34:51 2016 -0500 + + Merge pull request #105 from devinamatthews/knl + + Support for Intel Knight's Landing. + +commit 216206c1d328a865c2192e35a4df6e9aff79a85b +Author: Devin Matthews +Date: Tue Oct 25 13:56:18 2016 -0500 + + Fix up for merge to master. + +commit 11eb7957abbcdf02d5e312898e094260eadb1209 +Merge: cd5b6681 956b3edf +Author: Devin Matthews +Date: Tue Oct 25 13:51:07 2016 -0500 + + Merge branch 'master' into knl + + # Conflicts: + # frame/thread/bli_thread.h + +commit cd5b6681838899283cd94e5427dfda206e7fbabe +Author: Devin Matthews +Date: Tue Oct 25 13:49:27 2016 -0500 + + Don't use %rbp in KNL packing kernels. + +commit 956b3edf8eb09480f31f2e861c1b10f9ecbb2e52 +Merge: b7e41d71 0662a3c1 +Author: Field G. Van Zee +Date: Tue Oct 25 13:02:57 2016 -0500 + + Merge pull request #104 from devinamatthews/misspellings + + Add flexible options for thread model (pthread/posix for pthreads etc.). 
+ +commit 0662a3c1b1f4644a86bf8e5073d1391808c91b4a +Author: Devin Matthews +Date: Tue Oct 25 12:42:44 2016 -0500 + + Add flexible options for thread model (pthread/posix for pthreads etc.). + +commit b7e41d71b07d2af6d22d632c70e0c5f7ce46852c +Merge: 4bd905bd 5117d444 +Author: Field G. Van Zee +Date: Mon Oct 24 16:47:46 2016 -0500 + + Merge pull request #103 from devinamatthews/patch-1 + + Change .align to .p2align in Bulldozer ukernels. + +commit 5117d444f7f3a2bc327f067926eaf2398212edda +Author: Devin Matthews +Date: Mon Oct 24 16:20:47 2016 -0500 + + Change .align to .p2align in Bulldozer ukernels + + Apparently OSX doesn't allow .align directives for >16B, so I've changed these to their .p2align counterparts. + +commit 4bd905bd4597e0ad7bedf31e25e779d3e2dfda29 +Merge: 936d5fdc 7f32dd57 +Author: Field G. Van Zee +Date: Fri Oct 21 14:48:44 2016 -0500 + + Merge pull request #93 from ShadenSmith/config_check + + Adds sanity check to configuration choice. + +commit 936d5fdc26c6c4dab199a8d11fde948975cfa1d6 +Author: Field G. Van Zee +Date: Fri Oct 21 14:34:27 2016 -0500 + + Fixed multithreading compilation bug in 970745a. + + Details: + - Moved the definition of the cpp macro BLIS_ENABLE_MULTITHREADING + from bli_thread.h to bli_config_macro_defs.h. Also moved the + sanity check that OpenMP and POSIX threads are not both enabled. + - Thanks to Krzysztof Drewniak for reporting this bug. + +commit 8feb0f85a674e84bec2417486e3bcea584b14c04 +Author: Field G. Van Zee +Date: Wed Oct 19 16:05:41 2016 -0500 + + Removed auto-prototyping of malloc()/free() substitutes. 
+ + Details: + - Removed the header file, bli_malloc_prototypes.h, which automatically + generated prototypes for the functions specified by the following + cpp macros: + BLIS_MALLOC_INTL + BLIS_FREE_INTL + BLIS_MALLOC_POOL + BLIS_FREE_POOL + BLIS_MALLOC_USER + BLIS_FREE_USER + These prototypes were originally provided primarily as a convenience + to those developers who specified their own malloc()/free() substitutes + for one or more of the following. However, we generated these prototypes + regardless, even when the default values (malloc and free) of the + macros above were used. A problem arose under certain circumstances + (e.g., gcc in C++ mode on Linux with glibc) when including blis.h that + stemmed from the "throw" specification which was added to the glibc's + malloc() prototype, resulting in a prototype mismatch. Therefore, going + forward, developers who specify their own custom malloc()/free() + substitutes must also prototype those substitutes via bli_kernel.h. + Thanks to Krzysztof Drewniak for reporting this bug, and Devin Matthews + for researching the nature and potential solutions. + +commit 970745a5fc7c29de3e202988e5eb104fabca4fdc +Author: Field G. Van Zee +Date: Wed Oct 19 15:58:03 2016 -0500 + + Reorganized typedefs to avoid compiler warnings. + + Details: + - Relocated membrk_t definition from bli_membrk.h to bli_type_defs.h. + - Moved #include of bli_malloc.h from blis.h to bli_type_defs.h. + - Removed standalone mtx_t and mutex_t typedefs in bli_type_defs.h. + - Moved #include of bli_mutex.h from bli_thread.h to bli_typedefs.h. + - The redundant typedefs of membrk_t and mtx_t caused a warning on some C + compilers. Thanks to Tyler Smith for reporting this issue. + +commit 28b2af8a71133ce68774e153b6e05afb05affba8 +Author: Field G. Van Zee +Date: Thu Oct 13 14:50:08 2016 -0500 + + Added disabled code to print thrinfo_t structures. 
+ + Details: + - Added cpp-guarded code to bli_thrcomm_openmp.c that allows a curious + developer to print the contents of the thrinfo_t structures of each + thread, for verification purposes or just to study the way thread + information and communicators are used in BLIS. + - Enabled some previously-disabled code in bli_l3_thrinfo.c for freeing + an array of thrinfo_t* values that is used in the new, cpp-guarded code + mentioned above. + - Removed some old commented lines from bli_gemm_front.c. + +commit 11eed3f683d09e65f721567b346b0f733bff9a64 +Author: Field G. Van Zee +Date: Thu Oct 13 14:23:23 2016 -0500 + + Fixed a configure -t omp/openmp bug from fd04869. + + Details: + - Forgot to update certain occurrences of "omp" in common.mk during + commit fd04869, which changed the preferred configure option string + for enabling OpenMP from "omp" to "openmp". + +commit 9cda6057eaa16a24ac8785a9fa167df6c9edba44 +Author: Field G. Van Zee +Date: Tue Oct 11 13:21:26 2016 -0500 + + Removed previously renamed/old files. + + Details: + - Removed frame/base/bli_mem.c and frame/include/bli_auxinfo_macro_defs.h, + both of which were renamed/removed in 701b9aa. For some reason, these + files survived when the compose branch was merged back into master. + (Clearly, git's merging algorithm is not perfect.) + - Removed frame/base/bli_mem.c.prev (an artifact of the long-ago changed + memory allocator that I was keeping around for no particular reason). + +commit 22377abd84b9e560ffe1c4e4d284eb443ddb7133 +Author: Field G. Van Zee +Date: Mon Oct 10 13:43:56 2016 -0500 + + Fixed bli_gemm() segfault on empty C matrices. + + Details: + - Fixed a bug that would manifest in the form of a segmentation fault + in bli_cntl_free() when calling any level-3 operation on an empty + output matrix (ie: m = n = 0). Specifically, the code previously + assumed that the entire control tree was built prior to it being + freed.
However, if the level-3 operation performs an early exit, the + control tree will be incomplete, and this scenario is now handled. + Thanks to Elmar Peise for reporting this bug. + +commit 0b571cd94d9b175331c9453258a6b1389a718ae8 +Author: Field G. Van Zee +Date: Thu Oct 6 14:48:15 2016 -0500 + + Fixed segfault in bli_free_align() for NULL ptrs. + + Details: + - Fixed a bug in bli_free_align() caused by failing to handle NULL pointers + up-front, which led to performing pointer arithmetic on NULL pointers in + order to free the address immediately before the pointer. Thanks to Devin + Matthews for reporting this bug. + +commit 4fb9b4ef2e4cf2626a6e000a41628fb823f16da8 +Author: Field G. Van Zee +Date: Wed Oct 5 14:41:35 2016 -0500 + + CHANGELOG update (0.2.1) + +commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (tag: 0.2.1) Author: Field G. Van Zee Date: Wed Oct 5 14:41:34 2016 -0500 Version file update (0.2.1) -commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 (origin/master) -Merge: 8696987 6f71cd3 +commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 +Merge: 86969873 6f71cd34 Author: Field G. Van Zee Date: Wed Oct 5 13:35:01 2016 -0500 Merge branch 'compose' -commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose) -Merge: c0630c4 8d55033 +commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose, compose) +Merge: c0630c40 8d55033c Author: Field G. Van Zee Date: Tue Oct 4 15:53:46 2016 -0500 @@ -92,14 +780,20 @@ Date: Tue Sep 27 14:14:11 2016 -0500 should be considered deprecated. commit 9424af87209e4e435e2e742430945152690170b0 -Merge: efa7341 c0630c4 +Merge: efa7341d c0630c40 Author: Field G. Van Zee Date: Tue Sep 27 12:51:08 2016 -0500 Merge branch 'compose' +commit 7f32dd57c6bd41c0704341752842277dd6a4c8eb +Author: Shaden Smith +Date: Sat Sep 17 11:33:57 2016 -0500 + + Adds sanity check to configuration choice. + commit efa7341df0b0115926aa8a6e8a4ebfb24fdbf11e -Merge: 121c39d e1453f6 +Merge: 121c39d4 e1453f68 Author: Field G. 
Van Zee Date: Fri Sep 16 11:01:57 2016 -0500 @@ -113,7 +807,7 @@ Date: Fri Sep 16 09:29:28 2016 -0500 Fixes broken URL in README.md -commit c0630c4024b08750043a2942a3e8a037aa6b6259 (compose) +commit c0630c4024b08750043a2942a3e8a037aa6b6259 Author: Field G. Van Zee Date: Mon Sep 12 13:59:02 2016 -0500 @@ -125,7 +819,7 @@ Date: Mon Sep 12 13:59:02 2016 -0500 - Minor changes to frame/thread/bli_thrinfo.h. commit 7b3bf1ffcd7160ccbf6c2518af6d88f6742e4977 -Merge: 3550981 121c39d +Merge: 35509818 121c39d4 Author: Field G. Van Zee Date: Tue Sep 6 15:47:13 2016 -0500 @@ -287,7 +981,7 @@ Date: Fri Aug 26 19:04:45 2016 -0500 implementations can slow down the testsuite considerably. commit 73517f522b69de429dd7f3df60a70c068149ab28 -Merge: c6f5c21 50293da +Merge: c6f5c215 50293da3 Author: Field G. Van Zee Date: Tue Aug 23 13:46:59 2016 -0500 @@ -315,7 +1009,7 @@ Date: Tue Aug 23 13:38:36 2016 -0500 which requires "0" or "1". commit c6f5c215ee793d03ea834469fc2adc53feaffc42 -Merge: d52cb76 16a4c7a +Merge: d52cb767 16a4c7a8 Author: Field G. Van Zee Date: Mon Aug 22 17:33:02 2016 -0500 @@ -333,8 +1027,48 @@ Date: Fri Aug 19 11:38:36 2016 -0500 to type mismatch, and in the case of pthreads, a missing function argument. The bugs are fairly recent, introduced in a017062. +commit c8e4ef93953ba2b79fb7e0973c08469c0e28a2cd +Author: Devin Matthews +Date: Wed Aug 3 16:13:03 2016 -0500 + + Add prefetchw to 30x8 kernel. + +commit 4b5a2f3d6e7ffeb5cc2be8448554f5c2083ad68f +Merge: 380736bf 9f52a587 +Author: Devin Matthews +Date: Wed Aug 3 16:09:51 2016 -0500 + + Merge remote-tracking branch 'origin/knl' into knl + + # Conflicts: + # kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c + +commit 380736bfe955efbdd7274c90b6fd635688e83bc4 +Author: Devin Matthews +Date: Wed Aug 3 16:08:28 2016 -0500 + + Add (new) 30x8 KNL kernel and fix non-scatter prefetch bug. 
+ +commit 9f52a587dee855daa73c194e41b6951416544e9a +Author: Devin Matthews +Date: Wed Aug 3 16:03:53 2016 -0500 + + Try prefetchw[t1] instead of regular prefetch for C. + +commit 8945a1512d366bc6a8a85718d12cbf5de6f2898b +Author: Devin Matthews +Date: Wed Aug 3 11:28:24 2016 -0500 + + This version gets ~1550 GFLOPs on KNL with 16x4. + +commit 6ce4c022ebdea00c2b951090e3c2e9e88735b9ce +Author: Devin Matthews +Date: Wed Jul 27 16:26:36 2016 -0500 + + Switch back to 24x8. I could only squeeze 24.5GFLOP out of 8x24, and scalability is not improved. + commit d52cb7671509592a8078729477b40b60380518a2 -Merge: 95abea4 c31b1e7 +Merge: 95abea46 c31b1e7b Author: Field G. Van Zee Date: Wed Jul 27 16:04:55 2016 -0500 @@ -357,8 +1091,87 @@ Date: Wed Jul 27 15:58:07 2016 -0500 - Inserted #include "float.h" into bli_system.h (to gain access to DBL_MAX). - Minor update (vis-a-vis contexts) to driver code in test/3m4m. +commit b8f2b55532849d45d379afbdd05a52ff6100800d +Author: Devin Matthews +Date: Wed Jul 27 15:22:55 2016 -0500 + + Try an 8x24 kernel for the hell of it. + +commit 7ede5863ae3567f7c0852efc2d5cd649ca19e0f3 +Author: Devin Matthews +Date: Wed Jul 27 13:41:27 2016 -0600 + + Allocate pack buffer on MCDRAM for KNL. + +commit ad89ed2e829c7b261d8ba0998a3cb83ad576ee04 +Merge: 2c9de740 81e2b05f +Author: Devin Matthews +Date: Wed Jul 27 11:45:40 2016 -0500 + + Merge branch 'knl' of github.com:devinamatthews/blis into knl + +commit 2c9de740edb66c4692c200731763bbd1d3171ccb +Author: Devin Matthews +Date: Wed Jul 27 11:44:54 2016 -0500 + + This version gets ~26GF on one core. + +commit 81e2b05f31bca4e1e1676e7b533d1868d9f9be33 +Author: Devin Matthews +Date: Wed Jul 27 11:39:05 2016 -0500 + + Add optimized packing kernels for KNL. + +commit a7d8ca97b8d835c32d90ff20a565c82733f014a8 +Author: Devin Matthews +Date: Mon Jul 25 15:15:13 2016 -0500 + + All fixed.
+ +commit 963d0393b023f4134bb0c682923faf9964c0e645 +Author: Devin Matthews +Date: Mon Jul 25 14:40:53 2016 -0500 + + Add 24xk pack kernel. + +commit 117b76739afba481768897d2580f8365d3345417 +Author: Devin Matthews +Date: Mon Jul 25 13:53:07 2016 -0500 + + In the midst of debugging. + +commit 8c0a4fd1d3535d608a9a309a61ffee0a73c3646f +Author: Devin Matthews +Date: Mon Jul 25 13:09:24 2016 -0500 + + Fix some row/column confusion. + +commit c44f9f96930312125b15e64c326ab5ab5cc02633 +Author: Devin Matthews +Date: Mon Jul 25 12:02:24 2016 -0500 + + Simplify displacements -- clang assembler was badly botching EVEX compressed displacements giving false alarms for instruction length. + +commit e0cce177cc1b47ec9f11ac0556241feaa3564df1 +Author: Devin Matthews +Date: Mon Jul 25 10:02:25 2016 -0500 + + Minor fixes for 8x24 KNL kernel. + +commit 65735bbedf75784c48bd11e05b3fdc98fc66b4bc +Author: Devin Matthews +Date: Sun Jul 24 21:50:32 2016 -0500 + + Switch to 24x8 kernel, unrolled by 16. + +commit 45d5dc97177117220bd9dd0abf85aafc185acad1 +Author: Devin Matthews +Date: Sun Jul 24 14:25:26 2016 -0500 + + Add 24x8 "KNC-style" kernel for KNL. + commit 95abea46f86816fddfc9ff0abfa52880801461be -Merge: d0dfe5b a017062 +Merge: d0dfe5b5 a017062f Author: Field G. Van Zee Date: Sat Jul 23 15:38:33 2016 -0500 @@ -396,8 +1209,39 @@ Date: Fri Jul 22 17:02:59 2016 -0500 single-threaded execution. This new API is employed within functions such as bli_membrk_acquire_[mv]() and bli_membrk_release(). +commit 8ff2e069c48c12fd06b9c48c6b3aeb4ea9b0e6e1 +Author: Devin Matthews +Date: Fri Jul 22 16:22:26 2016 -0500 + + Add 4x unrolled variant for KNL microkernel. + +commit 9cb2ed9b0c25f31a22c1c9719b062fa665ad7adf +Author: Devin Matthews +Date: Fri Jul 22 16:10:30 2016 -0500 + + Git rid of one RBX update. + +commit 451bde076f0320d60cd2475cfb048ac4a2b798bb +Author: Devin Matthews +Date: Fri Jul 22 15:43:00 2016 -0500 + + Add some more knobs to twiddle for KNL microkernel. 
+ +commit 8c6e621c099521e7a4d87e007bb8224faa5f33a3 +Author: Devin Matthews +Date: Fri Jul 22 15:05:15 2016 -0500 + + Make knl conform to new kernel dir structure. + +commit ce7214c6618d6f22f4ce2ee452336236916d1f30 +Merge: 119d0399 ce59f811 +Author: Devin Matthews +Date: Fri Jul 22 14:59:53 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + commit ce59f81108ec9aea918a7e77030da8acfdd397ce -Merge: ff41153 707a2b7 +Merge: ff41153f 707a2b7f Author: Field G. Van Zee Date: Fri Jul 22 14:48:14 2016 -0500 @@ -412,7 +1256,7 @@ Date: Fri Jul 22 13:49:44 2016 -0500 Somehow forgot the most important microkernel. commit 47ec045056351ac4f0791c071fa0daaa81699c8c -Merge: 08f1d6b ff41153 +Merge: 08f1d6b6 ff41153f Author: Devin Matthews Date: Fri Jul 22 13:45:23 2016 -0500 @@ -425,7 +1269,7 @@ Date: Fri Jul 22 13:44:37 2016 -0500 Use 64-bit intermediate variable for k for architectures that do 64-bit loads in case dim_t is 32-bit. commit ff41153f4eb7f38ed94bdd9a3fd81fb979f3f401 -Merge: f9214ce e0d2fa0 +Merge: f9214ced e0d2fa0d Author: Field G. Van Zee Date: Fri Jul 22 13:21:03 2016 -0500 @@ -440,7 +1284,7 @@ Date: Fri Jul 22 12:56:51 2016 -0500 Relax alignment restrictions for haswell sgemm. commit f9214ced97392861f5a0ea72abfcf6f41faf674c -Merge: 413d62a 08666ea +Merge: 413d62ac 08666eaa Author: Field G. Van Zee Date: Fri Jul 22 12:16:39 2016 -0500 @@ -460,8 +1304,26 @@ Date: Fri Jul 22 11:07:34 2016 -0500 Change -openmp to -fopenmp for icc. +commit 119d0399428905053265f3aca1cc8cc1fde3b363 +Author: Devin Matthews +Date: Fri Jul 22 10:23:31 2016 -0500 + + Add 8x24 KNL kernel. 
+ +commit b58cda9eba0c1e175460aae109baf792d29ba5bf +Merge: 318f063d 413d62ac +Author: Devin Matthews +Date: Tue Jul 19 14:09:09 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + + # Conflicts: + # frame/base/bli_threading.h + # frame/include/blis.h + # frame/thread/bli_thread.c + commit d0dfe5b5372cc7558ee9c4104b29f82eecc7ed61 -Merge: 31def12 413d62a +Merge: 31def12e 413d62ac Author: Field G. Van Zee Date: Thu Jul 14 11:01:06 2016 -0500 @@ -559,6 +1421,12 @@ Date: Fri Jun 17 14:08:35 2016 -0500 but possible divide-by-zero. - Updated function signature and prototype formatting in testsuite. +commit 318f063dcbd8b594969e401bc99146d24b01066a +Author: Devin Matthews +Date: Wed Jun 8 17:46:50 2016 -0500 + + Add new KNL microkernel derived from Haswell. + commit 096895c5d538a7f8817603d7cf28c52e99340def Author: Field G. Van Zee Date: Mon Jun 6 13:32:04 2016 -0500 @@ -592,7 +1460,7 @@ Date: Mon Jun 6 13:32:04 2016 -0500 in the wrong order, which was recently fixed. commit 232530e88ff99f37abcae5b6fb5319a9a375a45f -Merge: 4bcabd1 eef37f8 +Merge: 4bcabd1b eef37f8b Author: Tyler Michael Smith Date: Wed Jun 1 15:14:10 2016 -0500 @@ -700,6 +1568,18 @@ Date: Tue May 17 15:20:16 2016 -0500 store the unrolled 30xk kernel in the array for use (on knc, for example). Note: This should have been done a long time ago. +commit e3bd5ca64ae7c190ba689396c0de687b829a11fe +Author: Devin Matthews +Date: Thu May 12 20:54:13 2016 -0500 + + Fix SIMD definitions in KNL config, and a couple of fixes to C update. + +commit 4fe02e3d497995d94d34d3fcf5af895084cfc8b9 +Author: Devin Matthews +Date: Thu May 12 20:53:58 2016 -0500 + + Move bli_kernel.h before bli_threading.h in order of inclusion in blis.h. + commit 4bcf1b35abea3f3dfc8f2fe462dcf155cf199e55 Author: Field G. Van Zee Date: Wed May 11 16:09:49 2016 -0500 @@ -727,7 +1607,7 @@ Date: Wed May 11 16:02:30 2016 -0500 #includes an "f2c.h" header. 
commit a09a2e23eacf5328858c8318bb637c5ff3b71d08 -Merge: 4dcd37e 7c604e1 +Merge: 4dcd37eb 7c604e1c Author: Tyler Michael Smith Date: Wed May 11 10:47:11 2016 -0500 @@ -741,14 +1621,28 @@ Date: Tue May 10 16:28:59 2016 -0500 fixing knc simd align size +commit 619dee0daec3474b4e5a55df90a61aabcae194f2 +Merge: b790b3d9 7c604e1c +Author: Devin Matthews +Date: Tue May 10 12:13:24 2016 -0500 + + Merge branch 'move_simd_defs' into knl + commit 7c604e1cbc1609b6e12d3ee973c08b7af5035be4 Author: Devin Matthews Date: Tue May 10 12:11:55 2016 -0500 Move default SIMD-related definitions to bli_kernel_macro_defs.h. Otherwise, configurations which customize these fail as these are now defined in bli_kernel.h. +commit b790b3d9e1820f3b691676de48c291cae083452d +Merge: 4f8c05c9 a7be2d28 +Author: Devin Matthews +Date: Tue May 10 11:49:47 2016 -0500 + + Merge branch 'master' into knl + commit a7be2d28e8930b154d0da1d6929b54a96e210af6 -Merge: 97b512e 4b1e55e +Merge: 97b512ef 4b1e55ed Author: Field G. Van Zee Date: Tue May 10 11:48:51 2016 -0500 @@ -840,7 +1734,7 @@ Date: Wed Apr 27 14:13:46 2016 -0500 bdbda6e, to tabs. commit 4ea419c72c789825e1f93a1eee88219bbf873930 -Merge: f1e9be2 bdbda6e +Merge: f1e9be2a bdbda6e6 Author: Field G. Van Zee Date: Tue Apr 26 12:50:45 2016 -0500 @@ -870,7 +1764,7 @@ Date: Fri Apr 22 15:34:02 2016 -0500 in my local working copy for longer than I can remember. commit aa0bceec277938328dabeb744680623f24fb0b61 -Merge: 4136553 e2784b4 +Merge: 4136553f e2784b4c Author: Field G. Van Zee Date: Fri Apr 22 12:01:31 2016 -0500 @@ -890,8 +1784,14 @@ Date: Fri Apr 22 11:53:53 2016 -0500 - Changed the definition of bli_cntx_obj_clear() so that the clearing occurs via a single call to memset(). +commit 4f8c05c9e2ef4cbb82b35a3ebf1f0a0ac665830e +Author: Devin Matthews +Date: Thu Apr 21 10:00:59 2016 -0500 + + Rearrange KNL dgemm kernel again to streamline usage of ymm register. sgemm and dgemm now both working with Intel SDE. 
+ commit e2784b4c921f706e756df3e146e20a4cb63f53e3 -Merge: dd0ab1d a9b6c3a +Merge: dd0ab1d9 a9b6c3ab Author: Field G. Van Zee Date: Wed Apr 20 18:34:09 2016 -0500 @@ -900,7 +1800,7 @@ Date: Wed Apr 20 18:34:09 2016 -0500 Change CBLAS integer type to f77_int commit a9b6c3abda6222a8b240361643932e83cf726c4f -Merge: e4c54c8 dd0ab1d +Merge: e4c54c81 dd0ab1d9 Author: Devin Matthews Date: Wed Apr 20 16:00:10 2016 -0500 @@ -927,8 +1827,14 @@ Date: Wed Apr 20 14:38:23 2016 -0500 added equivalent cpp query macros to bli_cntx.h. - Added 'bli_config.h' to .gitignore. +commit 7193230f7d35edbd1d2f77842a613971f1603463 +Author: Devin Matthews +Date: Wed Apr 20 09:37:30 2016 -0500 + + Work around missing VPMULLQ on KNL. + commit a30ccbc4c6a6e6460e78af6b5c530ee0d06f98fb -Merge: eb2f18e 0e1a982 +Merge: eb2f18e4 0e1a9821 Author: Field G. Van Zee Date: Tue Apr 19 15:04:33 2016 -0500 @@ -936,6 +1842,12 @@ Date: Tue Apr 19 15:04:33 2016 -0500 Add configure options and generate bli_config.h automatically. +commit bd44cf13e886069bc66c10ac0db178be96629a0d +Author: Devin Matthews +Date: Tue Apr 19 13:43:04 2016 -0500 + + Fix copy-paste errors in KNL kernels. + commit eb2f18e4844d985715df20798f50f9cc12e3b5ad Author: Field G. Van Zee Date: Tue Apr 19 12:50:32 2016 -0500 @@ -956,18 +1868,56 @@ Date: Tue Apr 19 11:44:37 2016 -0500 Lastly, support for OMP in clang has been added (closes #56). +commit a11eec05928ddc5c43fa5dbcd35f2edd24ff35a1 +Author: Devin Matthews +Date: Mon Apr 18 13:13:36 2016 -0500 + + Add sgemm ukernels for KNL. vpmullq is not implemented on KNL -- needs workaround. + commit ff84469a4575f1ef8a0010046fde52240a312cae Author: Field G. Van Zee Date: Mon Apr 18 12:29:09 2016 -0500 Applied various compilation fixes to bgq kernels. 
+commit c38e0dab05b2dc36672eab96e1248fb7fb2d785b +Merge: bd5e2296 cbcd0b73 +Author: Devin Matthews +Date: Mon Apr 18 10:21:35 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + +commit bd5e2296e98e042c31f1e8ece2c1ca8e4bdc2d4c +Merge: 4745def0 49f85177 +Author: Devin Matthews +Date: Mon Apr 18 10:15:22 2016 -0500 + + Merge remote-tracking branch 'origin/knl' into knl + +commit 4745def0c87377ae83ad73ac514d7de08a96b2ac +Author: Devin Matthews +Date: Mon Apr 18 10:15:05 2016 -0500 + + Add 64-bit offset vector so we can use vgatherqpd. + +commit 49f85177f886f38889b60503a4e12fa7f04be1fd +Author: Devin Matthews +Date: Mon Apr 18 10:14:11 2016 -0500 + + KNL ukernel compiles with gcc. + commit cbcd0b739dc54bd14fbb46aeda267c26725cd70f Author: Tyler Michael Smith Date: Mon Apr 18 03:12:57 2016 -0500 Changing ifdef for OSX pthread barriers +commit 58b2c3cf040134d1be913c585a3c6905629116c0 +Author: Devin Matthews +Date: Sat Apr 16 16:12:24 2016 -0500 + + Rewrite of KNL kernel in GNU extended asm syntax. + commit dd62080cea78f3a23616200d6640e52c102b2bb9 Author: Field G. Van Zee Date: Fri Apr 15 11:15:41 2016 -0500 @@ -984,7 +1934,7 @@ Date: Fri Apr 15 11:15:41 2016 -0500 website. commit d5a915dd8d7a6ead42a68772e4420eb3647e6f1a -Merge: 4320b72 4169467 +Merge: 4320b725 41694675 Author: Field G. Van Zee Date: Thu Apr 14 12:56:36 2016 -0500 @@ -1182,8 +2132,34 @@ Date: Mon Apr 11 17:21:28 2016 -0500 that this does not preclude supporting mixed types via the object APIs, where it produces absolutely zero API code bloat. +commit dd856c2cb75a2221a503a73dde27790c34b91570 +Author: Devin Matthews +Date: Mon Apr 11 10:39:18 2016 -0500 + + Translated MIC kernel to KNL and cleaned up a bit. Only real change is lack of swizzle modifiers for FMA instructions (used bcast from memory instead). + +commit 7f27431d3fffdda99c282ec412731d0a90cb32a7 +Author: Devin Matthews +Date: Fri Apr 8 10:04:39 2016 -0500 + + Copy mic kernel to knl for transliteration. 
+ +commit f8f02f0334ac020021e15a415bcd33aeea01deb4 +Merge: 32c92d94 d1f8e5d9 +Author: Devin Matthews +Date: Wed Apr 6 11:37:05 2016 -0500 + + Merge branch 'master' into const_correctness + +commit 32c92d945c55708da0eb63be1771f8c5430e3910 +Merge: 62914ccb 20af937b +Author: Devin Matthews +Date: Wed Apr 6 11:36:02 2016 -0500 + + Merge branch 'master' into const_correctness + commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 -Merge: 20af937 c11d28e +Merge: 20af937b c11d28ee Author: Field G. Van Zee Date: Tue Apr 5 12:21:27 2016 -0500 @@ -1198,7 +2174,7 @@ Date: Sat Apr 2 21:15:48 2016 +0200 cgemm µkernel for bulldozer : bug correction for k%4 != 0 commit 20af937b57f82bb3acb09418d5c0206e1b24f2c7 -Merge: 36c3abb fc61a11 +Merge: 36c3abb0 fc61a114 Author: Field G. Van Zee Date: Thu Mar 31 14:37:30 2016 -0500 @@ -1219,7 +2195,7 @@ Date: Thu Mar 31 10:45:48 2016 -0500 Adjust paths in common.mk to support building from testsuite dir. commit 36c3abb05fecb02d4a9ab13b2b69d133adf34583 -Merge: 64b41fa 917ce75 +Merge: 64b41fa5 917ce754 Author: Field G. Van Zee Date: Thu Mar 31 10:26:17 2016 -0500 @@ -1245,8 +2221,15 @@ Date: Wed Mar 30 22:03:09 2016 +0200 cgemm & zgemm micro-kernels for FMA4 instruction set (bulldozer configuration), based on x86_64/avx micro-kernel +commit 62914ccbcdb3c594f065dcfa65bd7e7b95c79283 +Merge: bbf704bf 64b41fa5 +Author: Devin Matthews +Date: Tue Mar 29 15:24:25 2016 -0500 + + Merge branch 'master' into const_correctness + commit 64b41fa554dff44b2f9ad48901b67c63836407a8 -Merge: 1b09e34 0171ad5 +Merge: 1b09e343 0171ad58 Author: Field G. Van Zee Date: Tue Mar 29 15:19:41 2016 -0500 @@ -1267,7 +2250,7 @@ Date: Mon Mar 28 13:55:06 2016 -0500 Add icc and clang support for Intel architectures, fixes #47. 2bd036f fixes #49 BTW. commit 3090fff64cc87ff2519a09f38e6b8699cf3cba11 -Merge: 8624e36 4ca5d5b +Merge: 8624e365 4ca5d5b1 Author: Field G. 
Van Zee Date: Mon Mar 28 12:36:25 2016 -0500 @@ -1276,14 +2259,14 @@ Date: Mon Mar 28 12:36:25 2016 -0500 sgemm micro-kernel for FMA4 instruction set commit e6e566426ac3ded7ef87cd8ff9be98accfdc4acc -Merge: 469429e 8624e36 +Merge: 469429ec 8624e365 Author: Devin Matthews Date: Sat Mar 26 14:10:15 2016 -0500 Merge branch 'master' into more_config_opts commit 8624e36543160739d954c4dbcc5a5594458f3a12 -Merge: a315833 2bd036f +Merge: a315833f 2bd036f1 Author: Field G. Van Zee Date: Sat Mar 26 13:56:28 2016 -0500 @@ -1310,7 +2293,7 @@ Date: Fri Mar 25 17:22:58 2016 -0500 Add threading option to configure. commit ad43eab4c7899d56d8d7caa6e2d92bc0581ea5a5 -Merge: 9452bdb 2bd036f +Merge: 9452bdb3 2bd036f1 Author: Devin Matthews Date: Fri Mar 25 15:00:02 2016 -0500 @@ -1328,8 +2311,14 @@ Date: Fri Mar 25 12:16:49 2016 -0500 Fix configuration issue where instruction set flags are not specified for debug builds. +commit bbf704bf7501411964a63a68f1af541f612cf92d +Author: Devin Matthews +Date: Fri Mar 25 09:55:35 2016 -0500 + + Add missing const to bli_read_nway_from_env. + commit a315833f067944fb0bc14cf60f0c7dcb5dc897b6 -Merge: 1d1a426 af92773 +Merge: 1d1a426d af92773f Author: Field G. Van Zee Date: Thu Mar 24 12:30:21 2016 -0500 @@ -1343,8 +2332,20 @@ Date: Wed Mar 23 22:07:02 2016 +0100 Updated and improved ARMv8 micro-kernels. +commit a4d7729776d17d9bdf2341eacd70b9770b9ba8d2 +Author: Devin Matthews +Date: Mon Mar 21 09:55:21 2016 -0500 + + Set default value for debug_type variable. + +commit 0e2447fa55d8c5fa2b1fc4150073512495c5f9eb +Author: Devin Matthews +Date: Thu Mar 17 16:32:05 2016 -0500 + + Add const correctness to auxinfo_t struct (microkernels need update theoretically). + commit 1d1a426d18ec03754021456862a1f4d1dfec1fbf -Merge: 5a978ff d226dfa +Merge: 5a978fff d226dfa0 Author: Field G. Van Zee Date: Mon Mar 7 15:17:53 2016 -0600 @@ -1364,7 +2365,7 @@ Date: Sat Mar 5 16:18:14 2016 -0600 4) Add make V=[0,1] option to control build verbosity. 
commit 5a978fffdb8f09a81c89541d541d4a6830cd70a4 -Merge: adb2b4e 63e2642 +Merge: adb2b4e0 63e26423 Author: Field G. Van Zee Date: Fri Mar 4 17:26:58 2016 -0600 @@ -1409,7 +2410,7 @@ Date: Mon Feb 29 21:53:12 2016 +0100 symbolic link for bulldozer configuration to kernels commit 2dc5c0ae038ed175fab85751803ada05734d1ba1 -Merge: f2809fc 3d0fae8 +Merge: f2809fc5 3d0fae81 Author: Field G. Van Zee Date: Mon Feb 29 12:22:51 2016 -0600 @@ -1418,7 +2419,7 @@ Date: Mon Feb 29 12:22:51 2016 -0600 Add symlink from config/bulldozer/kernels to kernels/x86_64/bulldozer commit f2809fc5f74466c755da6a5b4632853e634060b5 -Merge: f86b94f 8624a33 +Merge: f86b94f2 8624a33c Author: Field G. Van Zee Date: Sat Feb 27 13:06:03 2016 -0600 @@ -1542,7 +2543,7 @@ Date: Tue Nov 3 10:30:08 2015 -0600 smart enough to perform this optimization automatically. commit 0694b722f7e4df00efb32639095a2aca80e67f52 -Merge: 3e116f0 33557ec +Merge: 3e116f0a 33557ecc Author: Field G. Van Zee Date: Mon Nov 2 17:24:25 2015 -0600 @@ -1621,7 +2622,7 @@ Date: Fri Oct 30 18:25:04 2015 -0500 micro-kernels, and trsm_ll macro-kernel. commit 46294d80e5a79c598e200e1c8ec2a642ff839971 -Merge: d3159c5 a0a7b85 +Merge: d3159c57 a0a7b85a Author: Field G. Van Zee Date: Tue Oct 27 12:41:23 2015 -0500 @@ -1636,7 +2637,7 @@ Date: Tue Oct 27 08:59:15 2015 +0000 Fixed incomplete code in the double precision ARMv8 microkernel. commit d3159c5740c9ee7f8c0b661003aab6f00646ad6f -Merge: b489152 7e03e45 +Merge: b489152e 7e03e45b Author: Field G. Van Zee Date: Wed Oct 21 14:54:00 2015 -0500 @@ -1649,7 +2650,7 @@ Date: Wed Oct 21 14:53:17 2015 -0500 Use vzeroall in haswell micro-kernels. commit 7e03e45bfe6c27c4fdbf06b1caa7f49e9a5fef49 -Merge: 77ddb0b 4f88c29 +Merge: 77ddb0b1 4f88c29f Author: Field G. Van Zee Date: Wed Oct 14 13:26:07 2015 -0500 @@ -1664,7 +2665,7 @@ Date: Wed Oct 14 12:57:50 2015 -0500 Detect Intel Broadwell (using Haswell config). 
commit 4b0ac1a9984a93f7ad4369b10fca63991107d9f5 -Merge: fe3e355 77ddb0b +Merge: fe3e355c 77ddb0b1 Author: Zhang Xianyi Date: Wed Oct 14 12:51:05 2015 -0500 @@ -1771,7 +2772,7 @@ Date: Thu Sep 24 12:14:03 2015 -0500 bli_obj_row_off(), bli_obj_col_off(). commit fe3e355c9c5a6f65b8736b009e2d501b62a83ea1 -Merge: efa641e 4dd9dd3 +Merge: efa641e3 4dd9dd3e Author: Zhang Xianyi Date: Fri Aug 21 14:38:36 2015 -0500 @@ -1817,7 +2818,7 @@ Date: Wed Jul 29 13:31:09 2015 -0500 Version file update (0.1.8) commit ef0fbbbdb6148b96938733fce72cb4ed7dad685e -Merge: fdfe14f d4b8913 +Merge: fdfe14f1 d4b89136 Author: Field G. Van Zee Date: Thu Jul 9 13:54:54 2015 -0500 @@ -2085,7 +3086,7 @@ Date: Fri Apr 3 16:44:32 2015 -0500 - Added ACML support to test/3m4m driver Makefile and runme.sh script. commit a32f7c49ca4ea869d2a6c66818780f4321743d67 -Merge: 349e075 4bfd1ce +Merge: 349e075a 4bfd1ce8 Author: Field G. Van Zee Date: Fri Apr 3 08:28:11 2015 -0500 @@ -2279,7 +3280,7 @@ Date: Fri Feb 20 15:24:27 2015 -0600 return blocksizes from one of the induced methods' blocksize objects. commit 411e637ee7d1083a84f58f08938d51e63d7c3c9a -Merge: c2569b8 fc0b771 +Merge: c2569b88 fc0b7712 Author: Tyler Michael Smith Date: Fri Feb 20 20:39:25 2015 -0600 @@ -2345,14 +3346,14 @@ Date: Thu Feb 19 14:27:09 2015 -0600 the sandybridge configuration. commit 493087d730f01d5169434f461644e5633f48a42f -Merge: 650d2a6 2502129 +Merge: 650d2a6f 25021299 Author: Field G. Van Zee Date: Wed Feb 18 09:45:51 2015 -0600 Merge branch 'master' of github.com:flame/blis commit 25021299b670775df8ca9c87910c63d7e74ed946 -Merge: fe2b8d3 f05a576 +Merge: fe2b8d39 f05a5763 Author: Field G. Van Zee Date: Wed Feb 11 20:03:21 2015 -0600 @@ -2487,7 +3488,7 @@ Date: Tue Dec 16 11:27:50 2014 -0600 Added 4m_1b to test/3m4m test driver and script. commit 785d480805fc0d6f4251b5499933515740b6b2a7 -Merge: 9456f33 4156c08 +Merge: 9456f330 4156c088 Author: Field G. 
Van Zee Date: Fri Dec 12 14:34:19 2014 -0600 @@ -2539,7 +3540,7 @@ Date: Tue Dec 9 16:03:14 2014 -0600 leading us to this bug. commit 689f60a578b461119e9ea90c74f642b9eb79addb -Merge: bef24e6 483e4d6 +Merge: bef24e67 483e4d6a Author: Field G. Van Zee Date: Sun Dec 7 14:03:30 2014 -0600 @@ -2565,7 +3566,7 @@ Date: Wed Nov 26 18:00:56 2014 -0600 Barriers were inserted to fix this. commit 76bde44411f0e34266bab9d666a54ef22be97320 -Merge: e56e614 f3d729e +Merge: e56e6143 f3d729e5 Author: Field G. Van Zee Date: Wed Nov 26 17:25:24 2014 -0600 @@ -2610,7 +3611,7 @@ Date: Fri Nov 21 12:28:08 2014 -0600 - Updated comments on alignment of a1 and b1 to match wiki. commit 994429c6881b2ade92d9d7949bcaebfbf2cc65eb -Merge: 58796ab 694029d +Merge: 58796abd 694029d9 Author: Field G. Van Zee Date: Thu Nov 20 13:55:35 2014 -0600 @@ -2857,7 +3858,7 @@ Date: Fri Oct 10 10:01:45 2014 -0500 - Updated sandybridge configuration accordingly. commit 23ce7ee542a12ca40b4b6090ad2558d180e16d37 -Merge: 99fd9a3 7a8ad47 +Merge: 99fd9a39 7a8ad47f Author: Field G. Van Zee Date: Thu Oct 9 16:41:22 2014 -0500 @@ -2918,7 +3919,7 @@ Date: Mon Sep 29 14:56:36 2014 -0500 Fixed bug when packing anywhere besides in blk_var_1 for gemm. commit 614a4afc9272adb47e5a8b83b39d56c2804d95d6 -Merge: b541b66 4a7df04 +Merge: b541b667 4a7df04e Author: Tyler Smith Date: Fri Sep 26 10:49:57 2014 -0500 @@ -3008,7 +4009,7 @@ Date: Wed Sep 17 11:10:07 2014 -0500 implementations. Thanks to Devin Matthews for reporting this bug. commit 870761eb902e4866090d1d3446a345df3d6d4599 -Merge: e9899be a2b59a3 +Merge: e9899be0 a2b59a37 Author: Field G. Van Zee Date: Tue Sep 16 18:20:49 2014 -0500 @@ -3304,7 +4305,7 @@ Date: Thu Aug 28 11:55:12 2014 -0500 we now pass in the pack schema itself. commit a0ff6066e06075ab5f92b19247b39b92ed15f1bf -Merge: c4c99c4 d40b32b +Merge: c4c99c48 d40b32bc Author: Field G. Van Zee Date: Sun Aug 24 15:56:21 2014 -0500 @@ -3325,7 +4326,7 @@ Date: Sun Aug 24 15:52:22 2014 -0500 level-2 or level-3 operation. 
commit d40b32bc24ffbae24123e054307b3138969bb095 -Merge: 9331f79 6c25c37 +Merge: 9331f794 6c25c379 Author: Field G. Van Zee Date: Sun Aug 24 13:46:36 2014 -0500 @@ -3343,7 +4344,7 @@ Date: Sun Aug 24 13:44:10 2014 -0500 ukernels in commit 4cc2b46. commit 9331f79443223fe267676ee54c439e1ed320380c -Merge: 7fc48a7 670b639 +Merge: 7fc48a7d 670b6392 Author: Field G. Van Zee Date: Sun Aug 24 10:54:21 2014 -0500 @@ -3427,7 +4428,7 @@ Date: Thu Aug 21 18:25:48 2014 -0500 those blocksizes at runtime. commit b541b667cabfa6d41b50ad1e49209651ee6812cc -Merge: 699a815 dd61307 +Merge: 699a8151 dd61307f Author: Tyler Smith Date: Wed Aug 20 14:44:51 2014 -0500 @@ -3654,7 +4655,7 @@ Date: Mon Aug 4 15:49:59 2014 -0500 - Updated blis.h to include necessary CBLAS-related headers. commit caab62dac0fb0bd0d674118f409c81680db94d29 -Merge: 383631b db97ce9 +Merge: 383631b5 db97ce97 Author: Field G. Van Zee Date: Sun Aug 3 14:36:18 2014 -0500 @@ -3779,7 +4780,7 @@ Date: Sun Jul 27 18:20:12 2014 -0500 Version file update (0.1.4) commit acff74041bf02c7b9fdfa24b507bca782a4c5fce -Merge: cdb9413 47b243e +Merge: cdb9413e 47b243ef Author: Tyler Smith Date: Wed Jul 23 15:07:30 2014 -0500 @@ -3807,7 +4808,7 @@ Date: Wed Jul 23 13:41:13 2014 -0500 - Comment update. commit 3e7b0db5b0e24f5fd66c60bacabc019885ddbec5 -Merge: 2f8a357 ed3e33d +Merge: 2f8a357d ed3e33d5 Author: Tyler Smith Date: Wed Jul 23 13:40:44 2014 -0500 @@ -3853,7 +4854,7 @@ Date: Tue Jul 22 14:36:02 2014 -0500 matrix real-valued. commit 8965a965931318619ceaebd7c32edccf3022d0c7 -Merge: 1785efb 5b73e80 +Merge: 1785efb5 5b73e80b Author: Field G. Van Zee Date: Tue Jul 22 14:34:32 2014 -0500 @@ -3870,7 +4871,7 @@ Date: Tue Jul 22 14:33:01 2014 -0500 - Changed setd front-end call of scald_check() to setd_check(). commit 5b73e80b71c054c1945a06aff044ef629bc1a9a0 -Merge: a41e68e 20690fe +Merge: a41e68e0 20690fe3 Author: Field G. Van Zee Date: Fri Jul 18 12:21:20 2014 -0500 @@ -3942,7 +4943,7 @@ Date: Mon Jul 14 16:05:03 2014 -0500 2012). 
commit fcec68cda3f6e90ae055e7304e6674c1c5c8d010 -Merge: 94c0df7 4a20ed1 +Merge: 94c0df79 4a20ed1a Author: Field G. Van Zee Date: Mon Jul 14 11:35:34 2014 -0500 @@ -3977,7 +4978,7 @@ Date: Sun Jul 13 22:50:56 2014 -0700 Emscripten port commit 4a20ed1a3f5e9e5232df30aa0e568e6c00c56ce1 -Merge: 6a515e9 8ccdfae +Merge: 6a515e98 8ccdfaef Author: Field G. Van Zee Date: Sun Jul 13 17:45:01 2014 -0500 @@ -4076,7 +5077,7 @@ Date: Tue Jul 8 10:25:27 2014 -0500 - Added *.so files to '.gitignore'. commit 6c65e9a58fe55990ebb99ec3986443e18af35338 -Merge: cb12e45 daca500 +Merge: cb12e456 daca500d Author: Field G. Van Zee Date: Tue Jul 8 10:13:49 2014 -0500 @@ -4095,7 +5096,7 @@ Date: Tue Jul 8 10:07:46 2014 -0500 uninitialized. Thanks to Tony Kelman for isolating this bug. commit daca500db5e2448ba0da8047b75eb0f88d9f40e3 -Merge: ab3bc91 4702350 +Merge: ab3bc915 47023502 Author: Tyler Smith Date: Thu Jul 3 12:52:52 2014 -0500 @@ -4200,7 +5201,7 @@ Date: Mon Jun 23 10:42:29 2014 -0500 Removed 'version' from .gitignore file. commit b40dcefc5ee31f67aa3990e2e9d2ef8ed1386a25 -Merge: 7101a8e b693b0c +Merge: 7101a8ee b693b0cd Author: Field G. Van Zee Date: Mon Jun 23 10:39:05 2014 -0500 @@ -4215,7 +5216,7 @@ Date: Sun Jun 22 13:44:25 2014 -0700 [SC]AXPY kernels for PNaCl commit 7101a8eec0327d6c3a7eb36eb4b0fd45c1c6d162 -Merge: ad48dca 020a831 +Merge: ad48dca2 020a831b Author: Field G. Van Zee Date: Thu Jun 19 21:46:50 2014 -0500 @@ -4278,7 +5279,7 @@ Date: Sun Jun 15 06:27:37 2014 -0400 SGEMM and DGEMM kernels for PNaCl commit ad48dca22913a363899f0bef45553898718eebb1 -Merge: ee2b679 7118f87 +Merge: ee2b6792 7118f87e Author: Field G. Van Zee Date: Sat Jun 14 15:10:13 2014 -0500 @@ -4327,7 +5328,7 @@ Date: Wed May 21 11:34:42 2014 -0500 reporting this bug. commit 77a2d8dac8b242d7a202c9aabda3927ab68cf987 -Merge: 8c5d607 21fb089 +Merge: 8c5d6071 21fb0893 Author: Field G. 
Van Zee Date: Tue May 20 09:53:19 2014 -0500 @@ -4395,7 +5396,7 @@ Date: Wed Apr 30 12:28:00 2014 -0500 Replaced register blocksize hack with querying the register blocksize for determining parallelism granularity commit f4fdfe8fc573553eb36795b79cdf681270dab71b -Merge: 31bb065 8c5d607 +Merge: 31bb065b 8c5d6071 Author: Tyler Smith Date: Wed Apr 30 11:46:35 2014 -0500 @@ -4435,7 +5436,7 @@ Date: Mon Apr 28 16:48:25 2014 -0500 to Jack Poulson for reporting this bug. commit 31bb065ba40ae0c5a614e743b8025abca012b99e -Merge: 20e2443 7c61959 +Merge: 20e24430 7c619599 Author: Tyler Smith Date: Wed Apr 23 12:30:19 2014 -0500 @@ -4535,7 +5536,7 @@ Date: Fri Apr 4 10:22:48 2014 -0500 Also made herk IC and JC loops do weighted partitioning commit 2b6848b2397d6d84ca4e5f792fc51ad05e351a36 -Merge: 4e3eb39 21a0efb +Merge: 4e3eb39a 21a0efb3 Author: Tyler Smith Date: Fri Apr 4 09:54:54 2014 -0500 @@ -4654,7 +5655,7 @@ Date: Mon Mar 24 15:21:42 2014 -0500 a_next and b_next point to the current micropanels in trmm commit 23d9eab354fbc88165889832955e126772bf8488 -Merge: 5d5dc2e fd3e32a +Merge: 5d5dc2ee fd3e32a5 Author: Tyler Smith Date: Thu Mar 20 16:54:35 2014 -0500 @@ -4796,7 +5797,7 @@ Date: Mon Mar 10 15:47:28 2014 -0500 Added single threaded thread info data structures specifically for gemm and packm commit 0e8677761175189583ca7d855e24b2bbdd2dada8 -Merge: 2e727a0 b3bff63 +Merge: 2e727a02 b3bff631 Author: Tyler Smith Date: Mon Mar 10 15:16:21 2014 -0500 @@ -4829,14 +5830,14 @@ Date: Mon Mar 3 14:31:44 2014 -0600 are currently implemented in terms of isinf() and isnan() from math.h. 
commit b3bff631eadf98b15cb422fb4a8e2f855c23e8a7 -Merge: 2c158fb e8757b0 +Merge: 2c158fb8 e8757b03 Author: Tyler Smith Date: Thu Feb 27 16:53:24 2014 -0600 Merge https://github.com/flame/blis commit 2c158fb885c27f7b599dc1e85b57edd684f19223 -Merge: e4738c4 c2b2ab6 +Merge: e4738c48 c2b2ab62 Author: Tyler Smith Date: Thu Feb 27 16:46:23 2014 -0600 @@ -4896,7 +5897,7 @@ Date: Thu Feb 27 14:09:19 2014 -0600 Fixed bug in thread trees commit ac5a2de1d17ffd460b00fee9757898525a09abae -Merge: 01b125e bd3c7ec +Merge: 01b125e8 bd3c7ecf Author: Tyler Smith Date: Thu Feb 27 11:59:33 2014 -0600 @@ -4973,14 +5974,14 @@ Date: Tue Feb 25 13:34:56 2014 -0600 only the real gemm micro-kernel. commit 15b51e990f1d21333b5f7af97c211756247336e5 -Merge: 6363a9f fc04b5e +Merge: 6363a9f6 fc04b5eb Author: Field G. Van Zee Date: Fri Feb 21 09:04:32 2014 -0600 Merge branch 'master' of github.com:fgvanzee/blis commit fc04b5eb69868c341ce03f5ef1f02de4b8c121b0 -Merge: b29e1c2 d1813c9 +Merge: b29e1c2b d1813c9d Author: Field G. Van Zee Date: Fri Feb 21 09:04:13 2014 -0600 @@ -5023,7 +6024,7 @@ Date: Wed Feb 19 17:00:52 2014 -0600 - Various other minor changes to facilitate 4m/3m methods. commit b29e1c2b278c177e104c84ba462820ee8296df6c -Merge: ee60377 bd3c7ec +Merge: ee60377e bd3c7ecf Author: Field G. Van Zee Date: Fri Feb 14 14:11:54 2014 -0600 @@ -5676,7 +6677,7 @@ Date: Tue Dec 3 16:08:30 2013 -0600 beta are applied to the attached scalars. commit 992de486d6f23e69a623abd15ae77d7881d13871 -Merge: 9552e6e fd4ac63 +Merge: 9552e6ee fd4ac636 Author: Field G. Van Zee Date: Mon Dec 2 13:58:46 2013 -0600 @@ -5742,7 +6743,7 @@ Date: Mon Nov 18 18:11:07 2013 -0600 that already existed in kernels/x86_64/core2-sse3/3. commit 85e7e02ea3a9190b6fcff5d46b00d41c79cb1242 -Merge: 67761e2 7072005 +Merge: 67761e22 70720054 Author: Field G. Van Zee Date: Mon Nov 18 12:02:00 2013 -0600 @@ -6513,7 +7514,7 @@ Date: Thu Aug 1 11:24:23 2013 -0500 dimension of the gemm macro-kernel. 
commit f8980edf9c318453bb1962ac4939c06bf11e6d5e -Merge: 67a8b94 6e7e452 +Merge: 67a8b949 6e7e4523 Author: Field G. Van Zee Date: Fri Jul 26 11:14:27 2013 -0500 From 43007f7b65ec7926cbbfc39965ff733fa251c15f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 2 May 2017 16:48:43 -0500 Subject: [PATCH 11/64] Fixed stray parentheses in README citations. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d7b0ce34..c40005221 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ for determining blocksize parameters in BLIS: ``` A fifth paper, submitted to ACM TOMS, begins the study of so-called -[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf)): +[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf): ``` @article{BLIS5, @@ -293,7 +293,7 @@ A fifth paper, submitted to ACM TOMS, begins the study of so-called ``` A sixth paper, submitted to ACM TOMS, revisits the topic of the previous -article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf)): +article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf): ``` @article{BLIS6, From 0df3541f54b7fe0c604ab2ec47ba814f12391798 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 2 May 2017 19:25:21 -0700 Subject: [PATCH 12/64] allow KNL build without hbwmalloc.h (i.e. emulated) we want to be able to run BLIS KNL binaries on non-KNL machines via SDE. although it is possible to install hbwmalloc implementation on such systems, it is easier not to, since obviously the performance of SDE execution is not representative so there is no reason to emulate HBW allocation. 
--- config/knl/bli_kernel.h | 11 +++++++++++ config/knl/make_defs.mk | 13 +++++++++++-- configure | 3 +++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/config/knl/bli_kernel.h b/config/knl/bli_kernel.h index e32954973..46b1cb4f4 100644 --- a/config/knl/bli_kernel.h +++ b/config/knl/bli_kernel.h @@ -43,11 +43,22 @@ #define BLIS_SIMD_SIZE 64 #define BLIS_SIMD_NUM_REGISTERS 32 +#ifdef BLIS_NO_HBWMALLOC + +#include + +#define BLIS_MALLOC_POOL malloc +#define BLIS_FREE_POOL free + +#else + #include #define BLIS_MALLOC_POOL hbw_malloc #define BLIS_FREE_POOL hbw_free +#endif + //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 6a750223d..449aeb0bb 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -65,6 +65,10 @@ else COPTFLAGS := -O3 endif +ifeq ($(DEBUG_TYPE),sde) +CPPROCFLAGS += -DBLIS_NO_HBWMALLOC +endif + CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) @@ -95,12 +99,17 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -ifeq ($(CC_VENDOR),icc) + +ifneq ($(DEBUG_TYPE),sde) LDFLAGS := -lmemkind else -LDFLAGS := -lmemkind -lm +LDFLAGS := endif +ifneq ($(CC_VENDOR),icc) +LDFLAGS += -lm +endif + # end of ifndef MAKE_DEFS_MK_INCLUDED conditional block diff --git a/configure b/configure index 2358575f6..de7d1e96e 100755 --- a/configure +++ b/configure @@ -458,6 +458,9 @@ main() if [ -n "${debug_flag}" ]; then if [ "x${debug_type}" = "xopt" ]; then echo "${script_name}: enabling debug symbols with optimizations." + elif [ "x${debug_type}" = "xsde" ]; then + debug_type='sde' + echo "${script_name}: enabling SDE processor emulation." else debug_type='noopt' echo "${script_name}: enabling debug symbols; optimizations disabled." From dd58c9545c877c3f7553eaebca7b5e9720a66f5d Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Wed, 3 May 2017 15:04:51 -0500 Subject: [PATCH 13/64] Disable complex 3m/4m in testsuite by default. Details: - Disabled testsuite tests of all level-3 implementations based on 3m and 4m. This will improve testing runtime on Travis CI as well as for anyone manually running the testsuite using default test parameters. Thanks to Devin Matthews for suggesting this change. --- testsuite/input.general | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/testsuite/input.general b/testsuite/input.general index 9dba50df6..b7fbd6b58 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -29,13 +29,13 @@ sdcz # Datatype(s) to test: 500 # Problem size: maximum to test 100 # Problem size: increment between experiments # Complex level-3 implementations to test -1 # 3mh ('1' = enable; '0' = disable) -1 # 3m3 ('1' = enable; '0' = disable) -1 # 3m2 ('1' = enable; '0' = disable) -1 # 3m1 ('1' = enable; '0' = disable) -1 # 4mh ('1' = enable; '0' = disable) -1 # 4m1b ('1' = enable; '0' = disable) -1 # 4m1a ('1' = enable; '0' = disable) +0 # 3mh ('1' = enable; '0' = disable) +0 # 3m3 ('1' = enable; '0' = disable) +0 # 3m2 ('1' = enable; '0' = disable) +0 # 3m1 ('1' = enable; '0' = disable) +0 # 4mh ('1' = enable; '0' = disable) +0 # 4m1b ('1' = enable; '0' = disable) +0 # 4m1a ('1' = enable; '0' = disable) 1 # 1m ('1' = enable; '0' = disable) 1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: From fdc66f12d40754ff46179804bff592fddafbca02 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 4 May 2017 10:35:22 -0500 Subject: [PATCH 14/64] Setting any one of BLIS_NT_[IJ][CR] overrides BLIS_NUM_THEADS. Missing BLIS_NT_XX's are defaulted to 1. Fixes #123. 
--- frame/base/bli_cntx.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index f8cdf1fc4..673987bfd 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -784,12 +784,20 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, } } - jc = bli_env_read_nway( "BLIS_JC_NT", jc ); - //pc = bli_env_read_nway( "BLIS_KC_NT", 1 ); - pc = 1; - ic = bli_env_read_nway( "BLIS_IC_NT", ic ); - jr = bli_env_read_nway( "BLIS_JR_NT", jr ); - ir = bli_env_read_nway( "BLIS_IR_NT", ir ); + pc = 1; + + dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 ); + dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 ); + dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 ); + dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 ); + + if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1) + { + jc = (jc_env == -1 ? 1 : jc_env); + ic = (ic_env == -1 ? 1 : ic_env); + jr = (jr_env == -1 ? 1 : jr_env); + ir = (ir_env == -1 ? 1 : ir_env); + } #else From cf39d3ef3b29b8058c39fb4638c1a734fe64aaed Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 5 May 2017 15:06:56 -0500 Subject: [PATCH 15/64] Fixed a bug in norm1v, norm1m. Details: - Fixed a bug that manifested as improperly-computed 1-norm for vectors and matrices. This is one of the few operations in BLIS that does not have its own test module within the testsuite, hence why it went undetected for so long. The bad 1-norms were being used to normalize matrices in the testsuite after initialization, which led to some matrices containing a combination of "large" and "small" values. This tended to push the residuals computed after each test away from zero. In some cases, they were off *just* enough to the testsuite to label it a "failure". Many thanks to Jeff Hammond for reporting this bug. 
(Wonky details: the bug was due to improperly-defined level-0 scalar macros for abval2, an operation that computes the absolute square, or complex magnitude/modulus. Certain complex domain instances of abval2 were being incorrectly defined in terms of real-only solutions, leading to bad results. This level-0 operation forms the basis of norm1v/norm1m. absq2 was also affected, but almost nothing uses this operation.) --- frame/include/level0/bli_absq2s.h | 32 +++++++++++++++--------------- frame/include/level0/bli_abval2s.h | 32 +++++++++++++++--------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/frame/include/level0/bli_absq2s.h b/frame/include/level0/bli_absq2s.h index b6d7766df..9dcdad06f 100644 --- a/frame/include/level0/bli_absq2s.h +++ b/frame/include/level0/bli_absq2s.h @@ -41,27 +41,27 @@ // - The first char encodes the type of x. // - The second char encodes the type of a. -#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) -#define bli_dsabsq2s( x, a ) bli_sabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_csabsq2s( x, a ) bli_sabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_zsabsq2s( x, a ) bli_sabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) +#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) +#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) +#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } +#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_sdabsq2s( x, a ) bli_dabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_cdabsq2s( x, a ) 
bli_dabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_zdabsq2s( x, a ) bli_dabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) +#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) +#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) +#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } +#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX -#define bli_scabsq2s( x, a ) bli_cabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabsq2s( x, a ) bli_cabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcabsq2s( x, a ) bli_cabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) +#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_szabsq2s( x, a ) bli_zabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabsq2s( x, a ) bli_zabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabsq2s( x, a ) bli_zabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_dzabsq2s( x, a ) 
bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX diff --git a/frame/include/level0/bli_abval2s.h b/frame/include/level0/bli_abval2s.h index 7e0556940..6e0480790 100644 --- a/frame/include/level0/bli_abval2s.h +++ b/frame/include/level0/bli_abval2s.h @@ -43,25 +43,25 @@ #ifndef BLIS_ENABLE_C99_COMPLEX -#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) -#define bli_dsabval2s( x, a ) bli_sabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_csabval2s( x, a ) bli_sabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_zsabval2s( x, a ) bli_sabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) +#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) +#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) +#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } +#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_sdabval2s( x, a ) bli_dabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_cdabval2s( x, a ) bli_dabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_zdabval2s( x, a ) bli_dabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) +#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) +#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), 
bli_dimag(x), bli_dreal(a), 0.0 ) +#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } +#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } -#define bli_scabval2s( x, a ) bli_cabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabval2s( x, a ) bli_cabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcabval2s( x, a ) bli_cabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) +#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_szabval2s( x, a ) bli_zabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabval2s( x, a ) bli_zabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabval2s( x, a ) bli_zabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX From 
5fa4e9439c04f35f89dd7d26ff742cb2dadc3180 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 21:50:49 -0400 Subject: [PATCH 16/64] A bunch of shebang fixes from unportable /bin/bash to portable /usr/bin/env bash --- build/auto-detect/auto-detect.sh | 2 +- build/bump-version.sh | 2 +- build/check-test.sh | 4 ++-- build/gen-make-frags/gen-make-frag.sh | 2 +- build/mirror-tree.sh | 2 +- build/templates/license.sh | 2 +- build/update-version-file.sh | 2 +- config/armv7a/make_defs.mk | 2 +- config/armv8a/make_defs.mk | 2 +- config/bgq/make_defs.mk | 2 +- config/bulldozer/make_defs.mk | 2 +- config/carrizo/make_defs.mk | 2 +- config/cortex-a15/make_defs.mk | 2 +- config/cortex-a9/make_defs.mk | 2 +- config/dunnington/make_defs.mk | 2 +- config/emscripten/make_defs.mk | 2 +- config/haswell/make_defs.mk | 2 +- config/knl/make_defs.mk | 2 +- config/loongson3a/make_defs.mk | 2 +- config/mic/make_defs.mk | 2 +- config/piledriver/make_defs.mk | 2 +- config/pnacl/make_defs.mk | 2 +- config/power7/make_defs.mk | 2 +- config/reference/make_defs.mk | 2 +- config/sandybridge/make_defs.mk | 2 +- config/template/make_defs.mk | 2 +- configure | 2 +- version | 2 +- 28 files changed, 29 insertions(+), 29 deletions(-) diff --git a/build/auto-detect/auto-detect.sh b/build/auto-detect/auto-detect.sh index 9300e3b8b..345fc2f3a 100755 --- a/build/auto-detect/auto-detect.sh +++ b/build/auto-detect/auto-detect.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/bump-version.sh b/build/bump-version.sh index 35da91b97..6df894152 100755 --- a/build/bump-version.sh +++ b/build/bump-version.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/check-test.sh b/build/check-test.sh index 6277ada45..fa7b4779a 100755 --- a/build/check-test.sh +++ b/build/check-test.sh @@ -1,4 +1,4 @@ 
-#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -47,4 +47,4 @@ if [ $? -eq 0 ]; then else echo "Test Pass" exit 0 -fi \ No newline at end of file +fi diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh index e24af3005..77e6dd5c4 100755 --- a/build/gen-make-frags/gen-make-frag.sh +++ b/build/gen-make-frags/gen-make-frag.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/mirror-tree.sh b/build/mirror-tree.sh index bac7ad9a6..3aae9ce35 100755 --- a/build/mirror-tree.sh +++ b/build/mirror-tree.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/templates/license.sh b/build/templates/license.sh index a9fc4b9fb..06da737b6 100644 --- a/build/templates/license.sh +++ b/build/templates/license.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/update-version-file.sh b/build/update-version-file.sh index afa829e4a..38e8d2088 100755 --- a/build/update-version-file.sh +++ b/build/update-version-file.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 2b4125f3a..af114c379 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 3dc88e913..be2e32667 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for 
developing high-performance BLAS-like diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 0f405102b..dfc96dc6c 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 90d14d56b..097f33702 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index fd6b84cb0..121b6c5e0 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 52ab7a7c9..d38f60304 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 52ab7a7c9..d38f60304 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index f8faa3b5b..4234a4657 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 45b210ab6..63f4733cf 100644 --- 
a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 4c144846d..5e2d32641 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 449aeb0bb..0db039eb7 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 2c7e9c58c..21f6b084a 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 339112570..43d4a27ad 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index db46bd124..6d29705bc 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index 9e2a3b4c5..e080b5c51 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff 
--git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index da4e5bff1..c2cb7b1ca 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 4e856534d..02076e95c 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index 0a779b188..cd916739d 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 98f3222e0..c538a4c4d 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/configure b/configure index de7d1e96e..7aabc5b78 100755 --- a/configure +++ b/configure @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/version b/version index ee1372d33..7a8771f94 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.2 +0.2.1-115 From f5131e1e49167f948bddd714bb1af1761829c212 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:03:23 -0400 Subject: [PATCH 17/64] Indeed once can compile for carrizo also using clang. 
--- config/carrizo/make_defs.mk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 121b6c5e0..ef6435498 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -47,9 +47,12 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),clang) +else $(error gcc is required for this configuration.) endif +endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L From 91f897073ec0df3330ede449c4d6af8158266ae3 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:06:59 -0400 Subject: [PATCH 18/64] Correct error message. --- config/carrizo/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index ef6435498..76e74d67a 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -50,7 +50,7 @@ endif ifeq ($(CC_VENDOR),gcc) ifeq ($(CC_VENDOR),clang) else -$(error gcc is required for this configuration.) +$(error gcc or clang are required for this configuration.) endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). From 7541d46e2ba8659bb2e36b444edef112fefa1345 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:12:12 -0400 Subject: [PATCH 19/64] Mark bulldozer compilable w/ clang. --- config/bulldozer/make_defs.mk | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 097f33702..b6fd06801 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -47,8 +47,11 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) 
+ifeq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),clang) +else +$(error gcc or clang are required for this configuration.) +endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). From a75b05c23dc786a1fdc45dc1627a5ce2299f1a7b Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:23:27 -0400 Subject: [PATCH 20/64] Mark piledriver compilable w/ clang. --- config/piledriver/make_defs.mk | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 6d29705bc..81b3ca2f1 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -47,8 +47,11 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) +ifeq ($(CC_VENDOR),gcc) +ifeq ($(CC_VENDOR),clang) +else +$(error gcc or clang are required for this configuration.) +endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). From 0579dfea0bcfbb90ebc073fcf78b92a5cf7238e1 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 22:58:07 -0400 Subject: [PATCH 21/64] Restore version. --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index 7a8771f94..ee1372d33 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.1-115 +0.2.2 From 169fb05f225c2f060265bcaa872f7f80dc638b70 Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Tue, 16 May 2017 23:11:22 -0400 Subject: [PATCH 22/64] Fix if/else structure. Thanks to TravisCI. 
--- config/bulldozer/make_defs.mk | 1 + config/carrizo/make_defs.mk | 1 + config/piledriver/make_defs.mk | 1 + 3 files changed, 3 insertions(+) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index b6fd06801..c6050504f 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -48,6 +48,7 @@ CC := gcc CC_VENDOR := gcc endif ifeq ($(CC_VENDOR),gcc) +else ifeq ($(CC_VENDOR),clang) else $(error gcc or clang are required for this configuration.) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 76e74d67a..3e84b2011 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -48,6 +48,7 @@ CC := gcc CC_VENDOR := gcc endif ifeq ($(CC_VENDOR),gcc) +else ifeq ($(CC_VENDOR),clang) else $(error gcc or clang are required for this configuration.) diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 81b3ca2f1..2160c0262 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -48,6 +48,7 @@ CC := gcc CC_VENDOR := gcc endif ifeq ($(CC_VENDOR),gcc) +else ifeq ($(CC_VENDOR),clang) else $(error gcc or clang are required for this configuration.) From 555ddc30d4c7e44f3f335e436c98606f56e1598b Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 17 May 2017 12:27:14 -0500 Subject: [PATCH 23/64] Remove shebangs from makefiles. 
--- Makefile | 1 - build/config.mk.in | 1 - build/gen-make-frags/fragment.mk | 1 - common.mk | 1 - config/armv7a/make_defs.mk | 1 - config/armv8a/make_defs.mk | 1 - config/bgq/make_defs.mk | 1 - config/bulldozer/make_defs.mk | 1 - config/carrizo/make_defs.mk | 1 - config/cortex-a15/make_defs.mk | 1 - config/cortex-a9/make_defs.mk | 1 - config/dunnington/make_defs.mk | 1 - config/emscripten/make_defs.mk | 1 - config/haswell/make_defs.mk | 1 - config/knl/make_defs.mk | 1 - config/loongson3a/make_defs.mk | 1 - config/mic/make_defs.mk | 1 - config/piledriver/make_defs.mk | 1 - config/pnacl/make_defs.mk | 1 - config/power7/make_defs.mk | 1 - config/reference/make_defs.mk | 1 - config/sandybridge/make_defs.mk | 1 - config/template/make_defs.mk | 1 - mpi_test/Makefile | 1 - test/Makefile | 1 - testsuite/Makefile | 1 - 26 files changed, 26 deletions(-) diff --git a/Makefile b/Makefile index 1a4868eaa..0ad4d5c78 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/config.mk.in b/build/config.mk.in index 9d92f7fb4..fb4be778d 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/gen-make-frags/fragment.mk b/build/gen-make-frags/fragment.mk index 08773302b..17394f40b 100644 --- a/build/gen-make-frags/fragment.mk +++ b/build/gen-make-frags/fragment.mk @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/common.mk b/common.mk index 683d0b0e9..6f496c5da 100644 --- a/common.mk +++ b/common.mk @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index af114c379..82e5c8c79 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -1,4 +1,3 
@@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index be2e32667..a5a9b577e 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index dfc96dc6c..879124ffa 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index c6050504f..b6b47e13f 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 3e84b2011..63edee464 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index d38f60304..94f3a6e0c 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index d38f60304..94f3a6e0c 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 4234a4657..13c8c0ead 100644 --- a/config/dunnington/make_defs.mk +++ 
b/config/dunnington/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 63f4733cf..bc99caee8 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 5e2d32641..185bd918c 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 0db039eb7..1e35adda7 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 21f6b084a..9e44684f2 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 43d4a27ad..311936979 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 2160c0262..45c6393b1 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index e080b5c51..5375117b3 100644 --- 
a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index c2cb7b1ca..9d51e9db4 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 02076e95c..580d42d39 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index cd916739d..c0eed3b24 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index c538a4c4d..8bd574d3b 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -1,4 +1,3 @@ -#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/mpi_test/Makefile b/mpi_test/Makefile index 1bb965b4a..321b166d6 100644 --- a/mpi_test/Makefile +++ b/mpi_test/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/test/Makefile b/test/Makefile index 92b8c7df9..21bbf746d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/testsuite/Makefile b/testsuite/Makefile index 6a1954d8c..4ebece103 100644 --- a/testsuite/Makefile +++ b/testsuite/Makefile @@ -1,4 +1,3 @@ -#!/bin/bash # # BLIS # 
An object-based framework for developing high-performance BLAS-like From ec5c0c0448275280dca0991f6f33afeb73650450 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 17 May 2017 12:29:44 -0500 Subject: [PATCH 24/64] Change to /bin/sh. All scripts checked with Debian's checkbashisms. Also check for clang first in auto-detect.sh. --- build/auto-detect/auto-detect.sh | 14 ++++++-------- build/bump-version.sh | 2 +- build/check-test.sh | 2 +- build/gen-make-frags/gen-make-frag.sh | 6 +----- build/mirror-tree.sh | 2 +- build/templates/license.sh | 2 +- build/update-version-file.sh | 2 +- 7 files changed, 12 insertions(+), 18 deletions(-) diff --git a/build/auto-detect/auto-detect.sh b/build/auto-detect/auto-detect.sh index 345fc2f3a..5185fd8af 100755 --- a/build/auto-detect/auto-detect.sh +++ b/build/auto-detect/auto-detect.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -41,7 +41,11 @@ main() { - CC=gcc + if [ clang -v > /dev/null 2>&1 ]; then + CC=clang + else + CC=gcc + fi CPUID_SRC=cpuid_x86.c CPUID_BIN=blis_cpu_detect ARCH=reference @@ -59,12 +63,6 @@ main() # of the distribution and the directory in which we are building. cur_dirpath="." 
- - OSNAME=`uname` - if [ $OSNAME = "Darwin" ]; then - CC=clang - fi - # # Detect architecture by predefined macros # diff --git a/build/bump-version.sh b/build/bump-version.sh index 6df894152..53cbe1825 100755 --- a/build/bump-version.sh +++ b/build/bump-version.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/check-test.sh b/build/check-test.sh index fa7b4779a..6fb082a4c 100755 --- a/build/check-test.sh +++ b/build/check-test.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh index 77e6dd5c4..19fdc5bd0 100755 --- a/build/gen-make-frags/gen-make-frag.sh +++ b/build/gen-make-frags/gen-make-frag.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -374,10 +374,6 @@ gen_mkfiles() read_mkfile_config() { - local index lname - declare -i count - - # Read the file describing file suffixes. 
src_file_suffixes=$(cat "${suffix_file}") diff --git a/build/mirror-tree.sh b/build/mirror-tree.sh index 3aae9ce35..813091fcf 100755 --- a/build/mirror-tree.sh +++ b/build/mirror-tree.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/templates/license.sh b/build/templates/license.sh index 06da737b6..6105c1f04 100644 --- a/build/templates/license.sh +++ b/build/templates/license.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/update-version-file.sh b/build/update-version-file.sh index 38e8d2088..23373022d 100755 --- a/build/update-version-file.sh +++ b/build/update-version-file.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like From 6e04f9df01d79c1b0e673943ca0d5d0a6095eb2e Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 17 May 2017 13:03:52 -0500 Subject: [PATCH 25/64] Restored deleted lines from makefile fragments. 
--- Makefile | 1 + build/config.mk.in | 1 + build/gen-make-frags/fragment.mk | 1 + common.mk | 1 + config/armv7a/make_defs.mk | 1 + config/armv8a/make_defs.mk | 1 + config/bgq/make_defs.mk | 1 + config/bulldozer/make_defs.mk | 1 + config/carrizo/make_defs.mk | 1 + config/cortex-a15/make_defs.mk | 1 + config/cortex-a9/make_defs.mk | 1 + config/dunnington/make_defs.mk | 1 + config/emscripten/make_defs.mk | 1 + config/haswell/make_defs.mk | 1 + config/knl/make_defs.mk | 1 + config/loongson3a/make_defs.mk | 1 + config/mic/make_defs.mk | 1 + config/piledriver/make_defs.mk | 1 + config/pnacl/make_defs.mk | 1 + config/power7/make_defs.mk | 1 + config/reference/make_defs.mk | 1 + config/sandybridge/make_defs.mk | 1 + config/template/make_defs.mk | 1 + mpi_test/Makefile | 1 + test/Makefile | 1 + testsuite/Makefile | 1 + 26 files changed, 26 insertions(+) diff --git a/Makefile b/Makefile index 0ad4d5c78..d74eba889 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/build/config.mk.in b/build/config.mk.in index fb4be778d..e7a3f3235 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/build/gen-make-frags/fragment.mk b/build/gen-make-frags/fragment.mk index 17394f40b..2a1eb6907 100644 --- a/build/gen-make-frags/fragment.mk +++ b/build/gen-make-frags/fragment.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/common.mk b/common.mk index 6f496c5da..08731d9aa 100644 --- a/common.mk +++ b/common.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 82e5c8c79..9d1b51d0a 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index a5a9b577e..6d09af5cc 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 879124ffa..57c9899a0 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index b6b47e13f..0546a474f 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index 63edee464..f52d1dd67 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 94f3a6e0c..053e11cbb 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 94f3a6e0c..053e11cbb 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 13c8c0ead..8d07f2177 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index bc99caee8..4353d65cf 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 185bd918c..8c739607a 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 1e35adda7..104abafe2 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 9e44684f2..8fd9fb65a 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 311936979..8e7738b44 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index 45c6393b1..b5c3f159c 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index 5375117b3..c6f629ef8 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index 9d51e9db4..765344f79 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 580d42d39..f75b9ec55 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index c0eed3b24..d91df8b68 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 8bd574d3b..d98452553 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/mpi_test/Makefile b/mpi_test/Makefile index 321b166d6..2d2df10b7 100644 --- a/mpi_test/Makefile +++ b/mpi_test/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
diff --git a/test/Makefile b/test/Makefile index 21bbf746d..1472ae4b5 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. diff --git a/testsuite/Makefile b/testsuite/Makefile index 4ebece103..acbdd7bf3 100644 --- a/testsuite/Makefile +++ b/testsuite/Makefile @@ -1,4 +1,5 @@ # +# # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. From 681eec913d7c2ebcff637cec5c1627ced9a92b99 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 12:28:09 -0500 Subject: [PATCH 26/64] Change PACKDIM_MR (double) for haswell to 8. --- config/haswell/bli_kernel.h | 1 + kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 40 ++++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index ce18dc266..9ed530d68 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -102,6 +102,7 @@ #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 +#define BLIS_PACKDIM_MR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bee1df996..3679b5773 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,22 +734,22 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + 
"vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -761,22 +761,22 @@ void bli_dgemm_asm_6x8 " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -786,28 +786,28 @@ void bli_dgemm_asm_6x8 "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 24 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 25 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, 
%%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 26 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 27 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 28 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 29 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) + "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr_packdim) "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" @@ -855,7 +855,7 @@ void bli_dgemm_asm_6x8 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) + "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr_packdim) "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" From d87614af3f3d9187be94d6e77984b282bf890928 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 14:47:36 -0400 Subject: [PATCH 27/64] Revert "Change PACKDIM_MR (double) for haswell to 8." This reverts commit 681eec913d7c2ebcff637cec5c1627ced9a92b99. 
--- config/haswell/bli_kernel.h | 1 - kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 40 ++++++++++---------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 9ed530d68..ce18dc266 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -102,7 +102,6 @@ #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 -#define BLIS_PACKDIM_MR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index 3679b5773..bee1df996 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,22 +734,22 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -761,22 +761,22 @@ void bli_dgemm_asm_6x8 " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" - 
"vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" @@ -786,28 +786,28 @@ void bli_dgemm_asm_6x8 "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 - "vbroadcastsd 24 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 25 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" - "vbroadcastsd 26 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 27 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" - "vbroadcastsd 28 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 29 * 8(%%rax), %%ymm3 \n\t" + "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rax), %%ymm3 
\n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr_packdim) + "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" @@ -855,7 +855,7 @@ void bli_dgemm_asm_6x8 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" - "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr_packdim) + "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" From 7f41bb0a0becde6a7de7df0f99668d7b4686c3b0 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 26 May 2017 14:49:31 -0400 Subject: [PATCH 28/64] PACKDIM_MR=8 didn't work out, but messing with the prefetching helps 2%. --- kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bee1df996..5bd2d92e5 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -734,6 +734,8 @@ void bli_dgemm_asm_6x8 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 + "prefetcht0 72 * 8(%%rax) \n\t" + " \n\t" "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" @@ -759,7 +761,7 @@ void bli_dgemm_asm_6x8 "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" + "prefetcht0 80 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" From be2c7eb85168937bd4318f4d05ded37620119310 Mon Sep 17 00:00:00 2001 From: prangana Date: Tue, 30 
May 2017 09:58:10 +0530 Subject: [PATCH 29/64] Update Licence File Change-Id: I4c5cf1690d0cef92a68400f9a89e454ab6856ad2 --- LICENSE | 14 +++------- build/templates/license.c | 50 ++++++++++++++++------------------ build/templates/license.h | 50 ++++++++++++++++------------------ build/templates/license.sh | 56 ++++++++++++++++++-------------------- 4 files changed, 79 insertions(+), 91 deletions(-) diff --git a/LICENSE b/LICENSE index 38017661d..5a5aa21d9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,11 +1,5 @@ -BLIS framework -License ---- - -The BLIS framework is licensed under the following license, typically -known as the "new" or "modified" or "3-clause" BSD license. - +Copyright (C) 2017, Advanced Micro Devices, Inc. Copyright (C) 2014, The University of Texas at Austin @@ -17,9 +11,9 @@ met: - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. + - Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT diff --git a/build/templates/license.c b/build/templates/license.c index c393608c4..956aee0fe 100644 --- a/build/templates/license.c +++ b/build/templates/license.c @@ -1,33 +1,31 @@ /* - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. +Copyright (C) 2017, Advanced Micro Devices, Inc. 
- Copyright (C) 2014, The University of Texas at Austin +Copyright (C) 2014, The University of Texas at Austin - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ diff --git a/build/templates/license.h b/build/templates/license.h index c393608c4..956aee0fe 100644 --- a/build/templates/license.h +++ b/build/templates/license.h @@ -1,33 +1,31 @@ /* - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. +Copyright (C) 2017, Advanced Micro Devices, Inc. 
- Copyright (C) 2014, The University of Texas at Austin +Copyright (C) 2014, The University of Texas at Austin - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ diff --git a/build/templates/license.sh b/build/templates/license.sh index 6105c1f04..3bf15c46c 100644 --- a/build/templates/license.sh +++ b/build/templates/license.sh @@ -1,33 +1,31 @@ #!/bin/sh # -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# - Neither the name of The University of Texas at Austin nor the names -# of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright (C) 2017, Advanced Micro Devices, Inc. + +# Copyright (C) 2014, The University of Texas at Austin + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # From 6d17e0120fe5c127b941136ad2c0c08e91439535 Mon Sep 17 00:00:00 2001 From: sthangar Date: Wed, 24 May 2017 11:48:16 +0530 Subject: [PATCH 30/64] Checked in the small matrix code to compute GEMM called with A transpose case Change-Id: I29f40046d43d7a4b037c1cb322503ee26495f462 --- frame/3/gemm/bli_gemm_front.c | 2 - kernels/x86_64/zen/3/bli_gemm_small_matrix.c | 900 +++++++++++++++++-- 2 files changed, 838 insertions(+), 64 deletions(-) diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 0c30b2f7b..d4b0bde6e 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -47,10 +47,8 @@ void bli_gemm_front ) { #ifdef BLIS_SMALL_MATRIX_ENABLE -#ifndef BLIS_ENABLE_MULTITHREADING gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl); if(BLIS_SUCCESS != status) -#endif #endif { obj_t a_local; diff --git a/kernels/x86_64/zen/3/bli_gemm_small_matrix.c b/kernels/x86_64/zen/3/bli_gemm_small_matrix.c index 29e8e6225..31431a51b 100644 
--- a/kernels/x86_64/zen/3/bli_gemm_small_matrix.c +++ b/kernels/x86_64/zen/3/bli_gemm_small_matrix.c @@ -40,16 +40,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define D_MR (MR >> 1) #define NR 3 -//The scratch buffer allocated for performing transpose. -#define F_SCRATCH_DIM 1*1024 #define BLIS_ENABLE_PREFETCH -static float temp_scratch_buf[F_SCRATCH_DIM] __attribute__((aligned(64))); -static float A_pack[BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES] __attribute__((aligned(64))); +#define F_SCRATCH_DIM (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES) +static float A_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); #define D_BLIS_SMALL_MATRIX_THRES (BLIS_SMALL_MATRIX_THRES / 2 ) #define D_BLIS_SMALL_M_RECT_MATRIX_THRES (BLIS_SMALL_M_RECT_MATRIX_THRES / 2) #define D_BLIS_SMALL_K_RECT_MATRIX_THRES (BLIS_SMALL_K_RECT_MATRIX_THRES / 2) -static double D_A_pack[D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES] __attribute__((aligned(64))); - +#define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES) +static double D_A_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); +#define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. +#define AT_MR 4 // The kernel dimension of the A transpose GEMM kernel.(AT_MR * NR). static gint_t bli_sgemm_small_matrix ( obj_t* alpha, @@ -71,6 +71,28 @@ static gint_t bli_dgemm_small_matrix cntx_t* cntx, cntl_t* cntl ); + +static gint_t bli_sgemm_small_matrix_atbn +( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl +); + +static gint_t bli_dgemm_small_matrix_atbn +( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl +); /* * The bli_gemm_small_matrix function will use the * custom MRxNR kernels, to perform the computation. 
@@ -85,20 +107,17 @@ gint_t bli_gemm_small_matrix obj_t* c, cntx_t* cntx, cntl_t* cntl - ) +) { - +#ifdef BLIS_ENABLE_MULTITHREADING + return BLIS_NOT_YET_IMPLEMENTED; +#endif // If alpha is zero, scale by beta and return. if (bli_obj_equals(alpha, &BLIS_ZERO)) { return BLIS_NOT_YET_IMPLEMENTED; } - if (bli_obj_has_trans(*a)) - { - return BLIS_NOT_YET_IMPLEMENTED; - } - // if row major format return. if ((bli_obj_row_stride(*a) != 1) || (bli_obj_row_stride(*b) != 1) || @@ -106,13 +125,29 @@ gint_t bli_gemm_small_matrix { return BLIS_INVALID_ROW_STRIDE; } - // The custom kernels are implemented only for float datatype. + num_t dt = ((*c).info & (0x7 << 0)); + if (bli_obj_has_trans(*a)) + { + if (bli_obj_has_notrans(*b)) + { + if (dt == BLIS_FLOAT) + { + return bli_sgemm_small_matrix_atbn(alpha, a, b, beta, c, cntx, cntl); + } + else if (dt == BLIS_DOUBLE) + { + return bli_dgemm_small_matrix_atbn(alpha, a, b, beta, c, cntx, cntl); + } + } + + return BLIS_NOT_YET_IMPLEMENTED; + } + if (dt == BLIS_DOUBLE) { return bli_dgemm_small_matrix(alpha, a, b, beta, c, cntx, cntl); - } if (dt == BLIS_FLOAT) @@ -134,14 +169,14 @@ gint_t bli_sgemm_small_matrix obj_t* c, cntx_t* cntx, cntl_t* cntl - ) +) { int M = bli_obj_length(*c); // number of rows of Matrix C int N = bli_obj_width(*c); // number of columns of Matrix C int K = bli_obj_width(*a); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) . 
- // printf("alpha_cast = %f beta_cast = %f [ Trans = %d %d], [stride = %d %d %d] [m,n,k = %d %d %d]\n",*alpha_cast,*beta_cast, bli_obj_has_trans(*a), bli_obj_has_trans(*b), lda, ldb,ldc, M,N,K); + // printf("alpha_cast = %f beta_cast = %f [ Trans = %d %d], [stride = %d %d %d] [m,n,k = %d %d %d]\n",*alpha_cast,*beta_cast, bli_obj_has_trans(*a), bli_obj_has_trans(*b), lda, ldb,ldc, M,N,K); if (((M * N) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES)) || ((M < BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < BLIS_SMALL_K_RECT_MATRIX_THRES))) { @@ -178,43 +213,16 @@ gint_t bli_sgemm_small_matrix if (N == 1) { bli_gemv - ( - alpha, - a, - b, - beta, - c - ); + ( + alpha, + a, + b, + beta, + c + ); return BLIS_SUCCESS; } - // When MAtrix A requires transpose perform it using - // scratch buffer also update the K dimension. - // This code needs optimization, and probably won't be - // used in the future. - if (bli_obj_has_trans(*a)) - { - K = bli_obj_length(*a); - - //if the scratch buffer cannot accomodate matrix A, return. - if ((M * K) > F_SCRATCH_DIM) - { - return BLIS_FAILURE; - } - - for (row_idx = 0; row_idx < M; row_idx += 1) - { - for (col_idx = 0; col_idx < K; col_idx += 1) - { - temp_scratch_buf[row_idx + col_idx * M] = - A[col_idx + row_idx * lda]; - } - } - A = temp_scratch_buf; - lda = M; - - } - //update the pointer math if matrix B needs to be transposed. if (bli_obj_has_trans(*b)) { @@ -222,7 +230,7 @@ gint_t bli_sgemm_small_matrix tb_inc_row = ldb; } - if (N <= 3) + if ((N <= 3) || ((MR * K) > F_SCRATCH_DIM)) { required_packing_A = 0; } @@ -290,7 +298,7 @@ gint_t bli_sgemm_small_matrix //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); _mm256_storeu_ps(tA_packed, ymm3); // the packing of matrix A - // ymm4 += ymm0 * ymm3; + // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); @@ -299,7 +307,7 @@ gint_t bli_sgemm_small_matrix ymm3 = _mm256_loadu_ps(tA + 8); _mm256_storeu_ps(tA_packed + 8, ymm3); // the packing of matrix A - // ymm5 += ymm0 * ymm3; + // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); @@ -308,7 +316,7 @@ gint_t bli_sgemm_small_matrix ymm3 = _mm256_loadu_ps(tA + 16); _mm256_storeu_ps(tA_packed + 16, ymm3); // the packing of matrix A - // ymm6 += ymm0 * ymm3; + // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); @@ -317,7 +325,7 @@ gint_t bli_sgemm_small_matrix ymm3 = _mm256_loadu_ps(tA + 24); _mm256_storeu_ps(tA_packed + 24, ymm3); // the packing of matrix A - // ymm7 += ymm0 * ymm3; + // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); @@ -1570,7 +1578,7 @@ static gint_t bli_dgemm_small_matrix int K = bli_obj_width(*a); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) . // If alpha is zero, scale by beta and return. 
- // printf("alpha_cast = %f beta_cast = %f [ Trans = %d %d], [stride = %d %d %d] [m,n,k = %d %d %d]\n",*alpha_cast,*beta_cast, bli_obj_has_trans(*a), bli_obj_has_trans(*b), lda, ldb,ldc, M,N,K); + // printf("alpha_cast = %f beta_cast = %f [ Trans = %d %d], [stride = %d %d %d] [m,n,k = %d %d %d]\n",*alpha_cast,*beta_cast, bli_obj_has_trans(*a), bli_obj_has_trans(*b), lda, ldb,ldc, M,N,K); if (((M * N) < (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES)) || ((M < D_BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < D_BLIS_SMALL_K_RECT_MATRIX_THRES))) { @@ -1624,7 +1632,7 @@ static gint_t bli_dgemm_small_matrix tb_inc_row = ldb; } - if (N <= 3) + if ((N <= 3) || ((D_MR * K) > D_SCRATCH_DIM)) { required_packing_A = 0; } @@ -1692,7 +1700,7 @@ static gint_t bli_dgemm_small_matrix //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); _mm256_storeu_pd(tA_packed, ymm3); // the packing of matrix A - // ymm4 += ymm0 * ymm3; + // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); @@ -1701,7 +1709,7 @@ static gint_t bli_dgemm_small_matrix ymm3 = _mm256_loadu_pd(tA + 4); _mm256_storeu_pd(tA_packed + 4, ymm3); // the packing of matrix A - // ymm5 += ymm0 * ymm3; + // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); @@ -1710,7 +1718,7 @@ static gint_t bli_dgemm_small_matrix ymm3 = _mm256_loadu_pd(tA + 8); _mm256_storeu_pd(tA_packed + 8, ymm3); // the packing of matrix A - // ymm6 += ymm0 * ymm3; + // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); @@ -1719,7 +1727,7 @@ static gint_t bli_dgemm_small_matrix ymm3 = _mm256_loadu_pd(tA + 12); _mm256_storeu_pd(tA_packed + 12, ymm3); // the packing of matrix A - // ymm7 += ymm0 * ymm3; + // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = 
_mm256_fmadd_pd(ymm1, ymm3, ymm11); @@ -2955,3 +2963,771 @@ static gint_t bli_dgemm_small_matrix }; + +gint_t bli_sgemm_small_matrix_atbn +( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl +) +{ + int M = bli_obj_length(*c); // number of rows of Matrix C + int N = bli_obj_width(*c); // number of columns of Matrix C + int K = bli_obj_length(*b); // number of rows of Matrix B + int lda = bli_obj_col_stride(*a); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. + int ldb = bli_obj_col_stride(*b); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. + int ldc = bli_obj_col_stride(*c); // column stride of matrix C + int row_idx = 0, col_idx = 0, k; + float *A = a->buffer; // pointer to matrix A elements, stored in row major format + float *B = b->buffer; // pointer to matrix B elements, stored in column major format + float *C = c->buffer; // pointer to matrix C elements, stored in column major format + + float *tA = A, *tB = B, *tC = C; + + __m256 ymm4, ymm5, ymm6, ymm7; + __m256 ymm8, ymm9, ymm10, ymm11; + __m256 ymm12, ymm13, ymm14, ymm15; + __m256 ymm0, ymm1, ymm2, ymm3; + + float result, scratch[8]; + float *alpha_cast, *beta_cast; // alpha, beta multiples + alpha_cast = (alpha->buffer); + beta_cast = (beta->buffer); + + // The non-copy version of the A^T GEMM gives better performance for the small M cases. + // The threshold is controlled by BLIS_ATBN_M_THRES + if (M <= BLIS_ATBN_M_THRES) + { + for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR) + { + for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear scratch registers. 
+ ymm4 = _mm256_setzero_ps(); + ymm5 = _mm256_setzero_ps(); + ymm6 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm8 = _mm256_setzero_ps(); + ymm9 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm11 = _mm256_setzero_ps(); + ymm12 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + ymm14 = _mm256_setzero_ps(); + ymm15 = _mm256_setzero_ps(); + + //The inner loop computes the 4x3 values of the matrix. + //The computation pattern is: + // ymm4 ymm5 ymm6 + // ymm7 ymm8 ymm9 + // ymm10 ymm11 ymm12 + // ymm13 ymm14 ymm15 + + //The Dot operation is performed in the inner loop, 8 float elements fit + //in the YMM register hence loop count incremented by 8 + for (k = 0; (k + 7) < K; k += 8) + { + ymm0 = _mm256_loadu_ps(tB + 0); + ymm1 = _mm256_loadu_ps(tB + ldb); + ymm2 = _mm256_loadu_ps(tB + 2 * ldb); + + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); + + ymm3 = _mm256_loadu_ps(tA + lda); + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); + + ymm3 = _mm256_loadu_ps(tA + 2 * lda); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); + ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_ps(tA + 3 * lda); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); + ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); + + tA += 8; + tB += 8; + + } + + // if K is not a multiple of 8, padding is done before load using temproary array. 
+ if (k < K) + { + int iter; + float data_feeder[8] = { 0.0 }; + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; + ymm0 = _mm256_loadu_ps(data_feeder); + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb]; + ymm1 = _mm256_loadu_ps(data_feeder); + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb]; + ymm2 = _mm256_loadu_ps(data_feeder); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); + ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); + ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); + ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); + + } + + //horizontal addition and storage of the data. 
+ //Results for 4x3 blocks of C is stored here + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + _mm256_storeu_ps(scratch, ymm4); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + ymm7 = _mm256_hadd_ps(ymm7, ymm7); + ymm7 = _mm256_hadd_ps(ymm7, ymm7); + _mm256_storeu_ps(scratch, ymm7); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm10 = _mm256_hadd_ps(ymm10, ymm10); + ymm10 = _mm256_hadd_ps(ymm10, ymm10); + _mm256_storeu_ps(scratch, ymm10); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm13 = _mm256_hadd_ps(ymm13, ymm13); + ymm13 = _mm256_hadd_ps(ymm13, ymm13); + _mm256_storeu_ps(scratch, ymm13); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + + tC += ldc; + ymm5 = _mm256_hadd_ps(ymm5, ymm5); + ymm5 = _mm256_hadd_ps(ymm5, ymm5); + _mm256_storeu_ps(scratch, ymm5); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + ymm8 = _mm256_hadd_ps(ymm8, ymm8); + ymm8 = _mm256_hadd_ps(ymm8, ymm8); + _mm256_storeu_ps(scratch, ymm8); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm11 = _mm256_hadd_ps(ymm11, ymm11); + ymm11 = _mm256_hadd_ps(ymm11, ymm11); + _mm256_storeu_ps(scratch, ymm11); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm14 = _mm256_hadd_ps(ymm14, ymm14); + ymm14 = _mm256_hadd_ps(ymm14, ymm14); + _mm256_storeu_ps(scratch, ymm14); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + + tC += ldc; + ymm6 = _mm256_hadd_ps(ymm6, ymm6); + ymm6 = _mm256_hadd_ps(ymm6, ymm6); + _mm256_storeu_ps(scratch, ymm6); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + 
tC[0] = result + tC[0] * (*beta_cast); + + ymm9 = _mm256_hadd_ps(ymm9, ymm9); + ymm9 = _mm256_hadd_ps(ymm9, ymm9); + _mm256_storeu_ps(scratch, ymm9); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm12 = _mm256_hadd_ps(ymm12, ymm12); + ymm12 = _mm256_hadd_ps(ymm12, ymm12); + _mm256_storeu_ps(scratch, ymm12); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm15 = _mm256_hadd_ps(ymm15, ymm15); + ymm15 = _mm256_hadd_ps(ymm15, ymm15); + _mm256_storeu_ps(scratch, ymm15); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + } + } + + int processed_col = col_idx; + int processed_row = row_idx; + + //The edge case handling where N is not a multiple of 3 + if (processed_col < N) + { + for (col_idx = processed_col; col_idx < N; col_idx += 1) + { + for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + ymm7 = _mm256_setzero_ps(); + ymm10 = _mm256_setzero_ps(); + ymm13 = _mm256_setzero_ps(); + + //The inner loop computes the 4x1 values of the matrix. + //The computation pattern is: + // ymm4 + // ymm7 + // ymm10 + // ymm13 + + for (k = 0; (k + 7) < K; k += 8) + { + ymm0 = _mm256_loadu_ps(tB + 0); + + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + + ymm3 = _mm256_loadu_ps(tA + lda); + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + + ymm3 = _mm256_loadu_ps(tA + 2 * lda); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + + ymm3 = _mm256_loadu_ps(tA + 3 * lda); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + + tA += 8; + tB += 8; + } + + // if K is not a multiple of 8, padding is done before load using temproary array. 
+ if (k < K) + { + int iter; + float data_feeder[8] = { 0.0 }; + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; + ymm0 = _mm256_loadu_ps(data_feeder); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); + + } + + //horizontal addition and storage of the data. + //Results for 4x1 blocks of C is stored here + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + _mm256_storeu_ps(scratch, ymm4); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + ymm7 = _mm256_hadd_ps(ymm7, ymm7); + ymm7 = _mm256_hadd_ps(ymm7, ymm7); + _mm256_storeu_ps(scratch, ymm7); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm10 = _mm256_hadd_ps(ymm10, ymm10); + ymm10 = _mm256_hadd_ps(ymm10, ymm10); + _mm256_storeu_ps(scratch, ymm10); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm13 = _mm256_hadd_ps(ymm13, ymm13); + ymm13 = _mm256_hadd_ps(ymm13, ymm13); + _mm256_storeu_ps(scratch, ymm13); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + + } + } + processed_row = row_idx; + } + + //The edge case handling where M is not a multiple of 4 + if (processed_row < M) + { + for (row_idx = processed_row; row_idx < M; row_idx += 1) + { + 
for (col_idx = 0; col_idx < N; col_idx += 1) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear scratch registers. + ymm4 = _mm256_setzero_ps(); + + for (k = 0; (k + 7) < K; k += 8) + { + ymm0 = _mm256_loadu_ps(tB + 0); + ymm3 = _mm256_loadu_ps(tA); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + + tA += 8; + tB += 8; + } + + // if K is not a multiple of 8, padding is done before load using temproary array. + if (k < K) + { + int iter; + float data_feeder[8] = { 0.0 }; + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; + ymm0 = _mm256_loadu_ps(data_feeder); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; + ymm3 = _mm256_loadu_ps(data_feeder); + ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); + + } + + //horizontal addition and storage of the data. + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + ymm4 = _mm256_hadd_ps(ymm4, ymm4); + _mm256_storeu_ps(scratch, ymm4); + result = scratch[0] + scratch[4]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + } + } + } + + return BLIS_SUCCESS; + } + else + return BLIS_NONCONFORMAL_DIMENSIONS; +} + +gint_t bli_dgemm_small_matrix_atbn +( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl +) +{ + int M = bli_obj_length(*c); // number of rows of Matrix C + int N = bli_obj_width(*c); // number of columns of Matrix C + int K = bli_obj_length(*b); // number of rows of Matrix B + int lda = bli_obj_col_stride(*a); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. + int ldb = bli_obj_col_stride(*b); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. 
+ int ldc = bli_obj_col_stride(*c); // column stride of matrix C + int row_idx = 0, col_idx = 0, k; + double *A = a->buffer; // pointer to matrix A elements, stored in row major format + double *B = b->buffer; // pointer to matrix B elements, stored in column major format + double *C = c->buffer; // pointer to matrix C elements, stored in column major format + + double *tA = A, *tB = B, *tC = C; + + __m256d ymm4, ymm5, ymm6, ymm7; + __m256d ymm8, ymm9, ymm10, ymm11; + __m256d ymm12, ymm13, ymm14, ymm15; + __m256d ymm0, ymm1, ymm2, ymm3; + + double result, scratch[8]; + double *alpha_cast, *beta_cast; // alpha, beta multiples + alpha_cast = (alpha->buffer); + beta_cast = (beta->buffer); + + // The non-copy version of the A^T GEMM gives better performance for the small M cases. + // The threshold is controlled by BLIS_ATBN_M_THRES + if (M <= BLIS_ATBN_M_THRES) + { + for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR) + { + for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm5 = _mm256_setzero_pd(); + ymm6 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + ymm8 = _mm256_setzero_pd(); + ymm9 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm11 = _mm256_setzero_pd(); + ymm12 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + ymm14 = _mm256_setzero_pd(); + ymm15 = _mm256_setzero_pd(); + + //The inner loop computes the 4x3 values of the matrix. 
+ //The computation pattern is: + // ymm4 ymm5 ymm6 + // ymm7 ymm8 ymm9 + // ymm10 ymm11 ymm12 + // ymm13 ymm14 ymm15 + + //The Dot operation is performed in the inner loop, 4 double elements fit + //in the YMM register hence loop count incremented by 4 + for (k = 0; (k + 3) < K; k += 4) + { + ymm0 = _mm256_loadu_pd(tB + 0); + ymm1 = _mm256_loadu_pd(tB + ldb); + ymm2 = _mm256_loadu_pd(tB + 2 * ldb); + + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); + + ymm3 = _mm256_loadu_pd(tA + lda); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); + + ymm3 = _mm256_loadu_pd(tA + 2 * lda); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); + + ymm3 = _mm256_loadu_pd(tA + 3 * lda); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); + ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); + + tA += 4; + tB += 4; + + } + + // if K is not a multiple of 4, padding is done before load using temproary array. 
+ if (k < K) + { + int iter; + double data_feeder[4] = { 0.0 }; + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; + ymm0 = _mm256_loadu_pd(data_feeder); + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb]; + ymm1 = _mm256_loadu_pd(data_feeder); + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb]; + ymm2 = _mm256_loadu_pd(data_feeder); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); + ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); + ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); + ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); + ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); + ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); + + } + + //horizontal addition and storage of the data. 
+ //Results for 4x3 blocks of C is stored here + ymm4 = _mm256_hadd_pd(ymm4, ymm4); + _mm256_storeu_pd(scratch, ymm4); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + ymm7 = _mm256_hadd_pd(ymm7, ymm7); + _mm256_storeu_pd(scratch, ymm7); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm10 = _mm256_hadd_pd(ymm10, ymm10); + _mm256_storeu_pd(scratch, ymm10); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm13 = _mm256_hadd_pd(ymm13, ymm13); + _mm256_storeu_pd(scratch, ymm13); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + + + tC += ldc; + ymm5 = _mm256_hadd_pd(ymm5, ymm5); + _mm256_storeu_pd(scratch, ymm5); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + ymm8 = _mm256_hadd_pd(ymm8, ymm8); + _mm256_storeu_pd(scratch, ymm8); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm11 = _mm256_hadd_pd(ymm11, ymm11); + _mm256_storeu_pd(scratch, ymm11); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm14 = _mm256_hadd_pd(ymm14, ymm14); + _mm256_storeu_pd(scratch, ymm14); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + + + tC += ldc; + ymm6 = _mm256_hadd_pd(ymm6, ymm6); + _mm256_storeu_pd(scratch, ymm6); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + ymm9 = _mm256_hadd_pd(ymm9, ymm9); + _mm256_storeu_pd(scratch, ymm9); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm12 = _mm256_hadd_pd(ymm12, ymm12); + _mm256_storeu_pd(scratch, ymm12); + result = scratch[0] + scratch[2]; + result *= 
(*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm15 = _mm256_hadd_pd(ymm15, ymm15); + _mm256_storeu_pd(scratch, ymm15); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + } + } + + int processed_col = col_idx; + int processed_row = row_idx; + + //The edge case handling where N is not a multiple of 3 + if (processed_col < N) + { + for (col_idx = processed_col; col_idx < N; col_idx += 1) + { + for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear scratch registers. + ymm4 = _mm256_setzero_pd(); + ymm7 = _mm256_setzero_pd(); + ymm10 = _mm256_setzero_pd(); + ymm13 = _mm256_setzero_pd(); + + //The inner loop computes the 4x1 values of the matrix. + //The computation pattern is: + // ymm4 + // ymm7 + // ymm10 + // ymm13 + + for (k = 0; (k + 3) < K; k += 4) + { + ymm0 = _mm256_loadu_pd(tB + 0); + + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + ymm3 = _mm256_loadu_pd(tA + lda); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + + ymm3 = _mm256_loadu_pd(tA + 2 * lda); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + + ymm3 = _mm256_loadu_pd(tA + 3 * lda); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); + + tA += 4; + tB += 4; + } + // if K is not a multiple of 4, padding is done before load using temproary array. 
+ if (k < K) + { + int iter; + double data_feeder[4] = { 0.0 }; + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; + ymm0 = _mm256_loadu_pd(data_feeder); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); + + } + + //horizontal addition and storage of the data. + //Results for 4x1 blocks of C is stored here + ymm4 = _mm256_hadd_pd(ymm4, ymm4); + _mm256_storeu_pd(scratch, ymm4); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + ymm7 = _mm256_hadd_pd(ymm7, ymm7); + _mm256_storeu_pd(scratch, ymm7); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[1] = result + tC[1] * (*beta_cast); + + ymm10 = _mm256_hadd_pd(ymm10, ymm10); + _mm256_storeu_pd(scratch, ymm10); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[2] = result + tC[2] * (*beta_cast); + + ymm13 = _mm256_hadd_pd(ymm13, ymm13); + _mm256_storeu_pd(scratch, ymm13); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[3] = result + tC[3] * (*beta_cast); + + } + } + processed_row = row_idx; + } + + // The edge case handling where M is not a multiple of 4 + if (processed_row < M) + { + for (row_idx = processed_row; row_idx < M; row_idx += 1) + { + for (col_idx = 0; col_idx < N; col_idx += 1) + { + tA = A + row_idx * lda; + tB = B + col_idx * ldb; + tC = C + col_idx * ldc + row_idx; + // clear 
scratch registers. + ymm4 = _mm256_setzero_pd(); + + for (k = 0; (k + 3) < K; k += 4) + { + ymm0 = _mm256_loadu_pd(tB + 0); + ymm3 = _mm256_loadu_pd(tA); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + tA += 4; + tB += 4; + } + + // if K is not a multiple of 4, padding is done before load using temproary array. + if (k < K) + { + int iter; + double data_feeder[4] = { 0.0 }; + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; + ymm0 = _mm256_loadu_pd(data_feeder); + + for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; + ymm3 = _mm256_loadu_pd(data_feeder); + ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); + + } + + //horizontal addition and storage of the data. + ymm4 = _mm256_hadd_pd(ymm4, ymm4); + _mm256_storeu_pd(scratch, ymm4); + result = scratch[0] + scratch[2]; + result *= (*alpha_cast); + tC[0] = result + tC[0] * (*beta_cast); + + } + } + } + + return BLIS_SUCCESS; + } + else + return BLIS_NONCONFORMAL_DIMENSIONS; +} From 897bfa0e92082c30bbb74229562d7d7327cbbac8 Mon Sep 17 00:00:00 2001 From: prangana Date: Thu, 1 Jun 2017 16:11:09 +0530 Subject: [PATCH 31/64] Update version number Change-Id: Ib6e52d1d34c0791367ab9152dfab31f94deedeb4 --- version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version b/version index ee1372d33..b63ba696b 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.2 +0.9 From cf54c77bc79a0f33a514be72c80a654c4e6e6f63 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 6 Jun 2017 20:23:17 -0500 Subject: [PATCH 32/64] Add new SSI acknowledgment --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c40005221..6b0389bae 100644 --- a/README.md +++ b/README.md @@ -312,7 +312,7 @@ This project and its associated research was partially sponsored by grants from [Microsoft](http://www.microsoft.com/), [Intel](http://www.intel.com/), [Texas Instruments](http://www.ti.com/), and [AMD](http://www.amd.com/), as well as grants from the 
[National Science Foundation](http://www.nsf.gov/) (Awards -CCF-0917167 ACI-1148125/1340293, and CCF-1320112). +CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493). _Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of From 70cc825b552dec05165b9d70f9e6eb33d8abb118 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 6 Jun 2017 21:58:21 -0500 Subject: [PATCH 33/64] Update LICENSE Remove totally unnecessary first 9 lines and hopefully get Github to recognize it as 3BSD [ci skip]. --- LICENSE | 9 --------- 1 file changed, 9 deletions(-) diff --git a/LICENSE b/LICENSE index 38017661d..e3d83cd04 100644 --- a/LICENSE +++ b/LICENSE @@ -1,12 +1,3 @@ - -BLIS framework -License ---- - -The BLIS framework is licensed under the following license, typically -known as the "new" or "modified" or "3-clause" BSD license. - - Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without From 969b67e8800fbd5d14a086606f3b5afbf66ed093 Mon Sep 17 00:00:00 2001 From: Kiran Varaganti Date: Tue, 4 Jul 2017 12:57:32 +0530 Subject: [PATCH 34/64] Improved efficiency of dGEMM for large matrices by reducing TLB load misses and majorly L3 cache misses. This is achieved by changing the packed block sizes of matrix A & B. Now the optimum values are MC_D = 510 and KC_D = 1024. 
Change-Id: I2d8bdd5f62f2d1f8782ae2997f3d7a26587d1ca4 --- config/zen/bli_kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/zen/bli_kernel.h b/config/zen/bli_kernel.h index dbfcc826a..705a6f363 100644 --- a/config/zen/bli_kernel.h +++ b/config/zen/bli_kernel.h @@ -105,8 +105,8 @@ #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 -#define BLIS_DEFAULT_MC_D 72 -#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_MC_D 510 // 72 /* Improves performance for large Matrices */ +#define BLIS_DEFAULT_KC_D 1024 // 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 From 25ead66fb78557f73af48bac305724d5d8aa3309 Mon Sep 17 00:00:00 2001 From: sthangar Date: Fri, 30 Jun 2017 12:23:19 +0530 Subject: [PATCH 35/64] Reducing the framework overhead of GEMV routines Change-Id: I83607ad767bff74e305e915b54b0ea34ec3e5684 --- frame/2/bli_l2_cntx.c | 52 +++++++++++++++++++++- frame/2/gemv/bli_gemv_unb_var1.c | 6 ++- frame/2/gemv/bli_gemv_unb_var2.c | 6 ++- frame/2/gemv/bli_gemv_unf_var1.c | 11 ++++- frame/2/gemv/bli_gemv_unf_var2.c | 11 ++++- kernels/x86_64/zen/1f/bli_axpyf_opt_var1.c | 4 +- 6 files changed, 80 insertions(+), 10 deletions(-) diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index fdfe27a85..2207a0aec 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,6 +39,55 @@ // Define context initialization functions. // +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ +{ \ + /* Perform basic setup on the context. */ \ + bli_cntx_obj_create( cntx ); \ +\ + /* Initialize the context with kernels employed by the current + operation. 
*/ \ + /*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \ + /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \ + /* function pointers for fused kernels are queried from global context \ + hence it is not required to initialize in local context*/ \ + /*bli_axpyf_cntx_init( dt, cntx ); \ + bli_dotxf_cntx_init( dt, cntx ); */\ +\ + /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ + /*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \ + /*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \ + /*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \ + bli_axpyv_cntx_init( dt, cntx ); \ + bli_dotxv_cntx_init( dt, cntx ); \ + bli_scalv_cntx_init( dt, cntx ); \ + bli_setv_cntx_init( dt, cntx ); \ +\ + /* packm-related kernels are not required for GEMV. */ \ + /* Initialize the context with packm-related kernels. */ \ + /*bli_packm_cntx_init( dt, cntx ); */\ +\ + /* block params for fused kernels are queried from global context \ + hence it is not required to initialize in local context*/ \ + /* Set the register and cache blocksizes and multiples, as well + as the execution method. */ \ + /*bli_gks_cntx_set_blkszs( BLIS_NAT, 4, \ + BLIS_N2, BLIS_N2, \ + BLIS_M2, BLIS_M2, \ + BLIS_AF, BLIS_AF, \ + BLIS_DF, BLIS_DF, \ + cntx ); */\ +} \ +void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ +{ \ + /* Free the context and all memory allocated to it. */ \ + bli_cntx_obj_free( cntx ); \ +} + +GENFRONT( gemv ) + #undef GENFRONT #define GENFRONT( opname ) \ \ @@ -79,8 +129,6 @@ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ /* Free the context and all memory allocated to it. */ \ bli_cntx_obj_free( cntx ); \ } - -GENFRONT( gemv ) GENFRONT( trmv ) GENFRONT( trsv ) diff --git a/frame/2/gemv/bli_gemv_unb_var1.c b/frame/2/gemv/bli_gemv_unb_var1.c index 4b0c85a21..e3e9ab027 100644 --- a/frame/2/gemv/bli_gemv_unb_var1.c +++ b/frame/2/gemv/bli_gemv_unb_var1.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -70,7 +71,10 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + func_t func;\ + bli_gks_get_l1v_ker(BLIS_DOTXV_KER,&func);\ + kfp_dv = func.ptr[dt];\ + /*kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx );*/ \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c index f14fc1bd6..a12340821 100644 --- a/frame/2/gemv/bli_gemv_unb_var2.c +++ b/frame/2/gemv/bli_gemv_unb_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -98,7 +99,10 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + func_t func;\ + bli_gks_get_l1v_ker(BLIS_AXPYV_KER,&func);\ + kfp_av = func.ptr[dt];\ + /*kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx );*/ \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index 87481ad3c..b378fbb1b 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -71,8 +72,14 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ + func_t func;\ + bli_gks_get_l1f_ker(BLIS_DOTXF_KER,&func);\ + kfp_df = func.ptr[dt];\ + blksz_t blksz;\ + bli_gks_get_blksz( BLIS_DF, &blksz );\ + b_fuse = blksz.v[dt];\ + /*kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx );*/\ + /*b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx );*/\ \ for ( i = 0; i < n_iter; i += f ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index 9228aabaa..2a8b1af52 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -98,8 +99,14 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ + func_t func;\ + bli_gks_get_l1f_ker(BLIS_AXPYF_KER,&func);\ + kfp_af = func.ptr[dt];\ + blksz_t blksz;\ + bli_gks_get_blksz( BLIS_AF, &blksz );\ + b_fuse = blksz.v[dt];\ + /*kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); */\ + /*b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );*/ \ \ for ( i = 0; i < n_iter; i += f ) \ { \ diff --git a/kernels/x86_64/zen/1f/bli_axpyf_opt_var1.c b/kernels/x86_64/zen/1f/bli_axpyf_opt_var1.c index 3432e536b..a03ad84a0 100644 --- a/kernels/x86_64/zen/1f/bli_axpyf_opt_var1.c +++ b/kernels/x86_64/zen/1f/bli_axpyf_opt_var1.c @@ -115,7 +115,7 @@ void bli_saxpyf_int_var1 bool_t use_ref = FALSE; - if ( ( b_n != fusefac) || inca != 1 || incx != 1 || incy != 1 ) + if ( ( b_n != fusefac) || inca != 1 || incy != 1 ) use_ref = TRUE; // Call the reference implementation if needed. if ( use_ref == TRUE ) @@ -307,7 +307,7 @@ void bli_daxpyf_int_var1 bool_t use_ref = FALSE; - if ( ( b_n < fusefac) || inca != 1 || incx != 1 || incy != 1 ) + if ( ( b_n < fusefac) || inca != 1 || incy != 1 ) use_ref = TRUE; // Call the reference implementation if needed. 
//use_ref = TRUE; From ba7cada51a238d320528e3504ed0f0a17a6b022a Mon Sep 17 00:00:00 2001 From: Minh Quan HO Date: Fri, 7 Jul 2017 10:52:05 +0200 Subject: [PATCH 36/64] set missing free_fp in bli_membrk_init for free-ing GEN_USE buffers The membrk's free_fp is called when releasing GEN_USE buffers, but this free_fp is not set in bli_membrk_init --- frame/base/bli_membrk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/frame/base/bli_membrk.c b/frame/base/bli_membrk.c index 33a998de1..210c04be1 100644 --- a/frame/base/bli_membrk.c +++ b/frame/base/bli_membrk.c @@ -44,6 +44,7 @@ void bli_membrk_init bli_mutex_init( bli_membrk_mutex( membrk ) ); bli_membrk_init_pools( cntx, membrk ); bli_membrk_set_malloc_fp( bli_malloc_pool, membrk ); + bli_membrk_set_free_fp( bli_free_pool, membrk ); } void bli_membrk_finalize From 8772a0b33a90154c80d88b381dcdd66f824e041f Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 13 Jul 2017 21:39:24 -0700 Subject: [PATCH 37/64] Fix Emscripten builds --- config/emscripten/make_defs.mk | 1 + frame/include/bli_system.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 4353d65cf..8797f9332 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -58,6 +58,7 @@ CVECFLAGS := # --- Determine the archiver and related flags --- AR := emar +RANLIB := emranlib ARFLAGS := cru # --- Determine the linker and related flags --- diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 05139136b..99a63d550 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -66,6 +66,8 @@ #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) #define BLIS_OS_BSD 1 +#elif defined(EMSCRIPTEN) +#define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif From 0e58ba1b3aa84700ca51a96f1c0eed6067562fba Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Mon, 17 Jul 2017 19:03:22 -0500 Subject: [PATCH 38/64] Added API to set mt environment variables. Details: - Renamed bli_env_get_nway() -> bli_thread_get_env(). - Added bli_thread_set_env() to allow setting environment variables pertaining to multithreading, such as BLIS_JC_NT or BLIS_NUM_THREADS. - Added the following convenience wrapper routines: bli_thread_get_jc_nt() bli_thread_get_ic_nt() bli_thread_get_jr_nt() bli_thread_get_ir_nt() bli_thread_get_num_threads() bli_thread_set_jc_nt() bli_thread_set_ic_nt() bli_thread_set_jr_nt() bli_thread_set_ir_nt() bli_thread_set_num_threads() - Added #include "errno.h" to bli_system.h. - This commit addresses issue #140. - Thanks to Chris Goodyer for inspiring these updates. --- frame/base/bli_cntx.c | 12 ++-- frame/include/bli_system.h | 1 + frame/thread/bli_thread.c | 109 ++++++++++++++++++++++++++++++++++--- frame/thread/bli_thread.h | 24 +++++++- 4 files changed, 130 insertions(+), 16 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 673987bfd..29529924c 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -756,10 +756,10 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, #ifdef BLIS_ENABLE_MULTITHREADING - int nthread = bli_env_read_nway( "BLIS_NUM_THREADS", -1 ); + int nthread = bli_thread_get_env( "BLIS_NUM_THREADS", -1 ); if ( nthread == -1 ) - nthread = bli_env_read_nway( "OMP_NUM_THREADS", -1 ); + nthread = bli_thread_get_env( "OMP_NUM_THREADS", -1 ); if ( nthread < 1 ) nthread = 1; @@ -786,10 +786,10 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, pc = 1; - dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 ); - dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 ); - dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 ); - dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 ); + dim_t jc_env = bli_thread_get_env( "BLIS_JC_NT", -1 ); + dim_t ic_env = bli_thread_get_env( "BLIS_IC_NT", -1 ); + dim_t 
jr_env = bli_thread_get_env( "BLIS_JR_NT", -1 ); + dim_t ir_env = bli_thread_get_env( "BLIS_IR_NT", -1 ); if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1) { diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 05139136b..b841ff447 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -41,6 +41,7 @@ #include #include #include +#include // Determine if we are on a 64-bit or 32-bit architecture #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 37ec94292..1dde88206 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -1156,19 +1156,112 @@ void bli_partition_2x2( dim_t nthread, dim_t work1, dim_t work2, // ----------------------------------------------------------------------------- -// Some utilities -dim_t bli_env_read_nway( const char* env, dim_t fallback ) +dim_t bli_thread_get_env( const char* env, dim_t fallback ) { - dim_t num = fallback; - char* str = getenv( env ); + dim_t r_val; + char* str; + // Query the environment variable and store the result in str. + str = getenv( env ); + + // Set the return value based on the string obtained from getenv(). if ( str != NULL ) - { - num = strtol( str, NULL, 10 ); - } - return num; + { + // If there was no error, convert the string to an integer and + // prepare to return that integer. + r_val = strtol( str, NULL, 10 ); + } + else + { + // If there was an error, use the "fallback" as the return value. 
+ r_val = fallback; + } + + return r_val; } +dim_t bli_thread_get_jc_nt( void ) +{ + return bli_thread_get_env( "BLIS_JC_NT", 1 ); +} + +dim_t bli_thread_get_ic_nt( void ) +{ + return bli_thread_get_env( "BLIS_IC_NT", 1 ); +} + +dim_t bli_thread_get_jr_nt( void ) +{ + return bli_thread_get_env( "BLIS_JR_NT", 1 ); +} + +dim_t bli_thread_get_ir_nt( void ) +{ + return bli_thread_get_env( "BLIS_IR_NT", 1 ); +} + +dim_t bli_thread_get_num_threads( void ) +{ + return bli_thread_get_env( "BLIS_NUM_THREADS", 1 ); +} + +void bli_thread_set_env( const char* env, dim_t value ) +{ + dim_t r_val; + char value_str[32]; + const char* fs_32 = "%u"; + const char* fs_64 = "%lu"; + + // Convert the string to an integer, but vary the format specifier + // depending on the integer type size. + if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value ); + else sprintf( value_str, fs_64, value ); + + // Set the environment variable using the string we just wrote to via + // sprintf(). (The 'TRUE' argument means we want to overwrite the current + // value if the environment variable already exists.) + r_val = setenv( env, value_str, TRUE ); + + // Check the return value in case something went horribly wrong. + if ( r_val == -1 ) + { + char err_str[128]; + + // Query the human-readable error string corresponding to errno. + strerror_r( errno, err_str, 128 ); + + // Print the error message. 
+ bli_print_msg( err_str, __FILE__, __LINE__ ); + } +} + +void bli_thread_set_jc_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_JC_NT", value ); +} + +void bli_thread_set_ic_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_IC_NT", value ); +} + +void bli_thread_set_jr_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_JR_NT", value ); +} + +void bli_thread_set_ir_nt( dim_t value ) +{ + bli_thread_set_env( "BLIS_IR_NT", value ); +} + +void bli_thread_set_num_threads( dim_t value ) +{ + bli_thread_set_env( "BLIS_NUM_THREADS", value ); +} + +// ----------------------------------------------------------------------------- + dim_t bli_gcd( dim_t x, dim_t y ) { while ( y != 0 ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 1998253cf..9092bc84d 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -164,6 +164,8 @@ void bli_l3_thread_decorator cntl_t* cntl ); +// ----------------------------------------------------------------------------- + // Factorization and partitioning prototypes typedef struct { @@ -178,8 +180,26 @@ dim_t bli_next_prime_factor(bli_prime_factors_t* factors); void bli_partition_2x2(dim_t nthread, dim_t work1, dim_t work2, dim_t* nt1, dim_t* nt2); -// Miscellaneous prototypes -dim_t bli_env_read_nway( const char* env, dim_t fallback ); +// ----------------------------------------------------------------------------- + +dim_t bli_thread_get_env( const char* env, dim_t fallback ); + +dim_t bli_thread_get_jc_nt( void ); +dim_t bli_thread_get_ic_nt( void ); +dim_t bli_thread_get_jr_nt( void ); +dim_t bli_thread_get_ir_nt( void ); +dim_t bli_thread_get_num_threads( void ); + +void bli_thread_set_env( const char* env, dim_t value ); + +void bli_thread_set_jc_nt( dim_t value ); +void bli_thread_set_ic_nt( dim_t value ); +void bli_thread_set_jr_nt( dim_t value ); +void bli_thread_set_ir_nt( dim_t value ); +void bli_thread_set_num_threads( dim_t value ); + +// 
----------------------------------------------------------------------------- + dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); From 13175c5fb70fb6a378d5fff6ecede62e5ea6a1f6 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 18 Jul 2017 17:56:00 -0500 Subject: [PATCH 39/64] Updated openmp/pthread barriers with GNU atomics. Details: - Updated the non-tree openmp and pthreads barriers defined in bli_thrcomm_openmp.c and bli_thrcomm_pthreads.c to instead call a common implementation in bli_thrcomm.c, bli_thrcomm_barrier_atomic(). This new implementation goes through the same motions as the previous codes, but protects its loads and increments with GNU atomic built-ins. These atomic statements take memory ordering parameters that allow us to specify just enough constraints for the barrier to work as intended on weakly-ordered hardware. The prior implementation was only guaranteed to work on systems with strongly-ordered memory. (Thanks to Devin Matthews for suggesting this change and his crash-course in atomics and memory ordering.) - Removed 'volatile' from structs' barrier field declarations in bli_thrcomm_*.h. - Updated bli_thrcomm_pthreads.? files to use renamed struct barrier fields consistent with that of the _openmp.? files. - Updated other bli_thrcomm_* files to rename "communicator" variables to simply "comm".
--- frame/thread/bli_thrcomm.c | 56 +++++++++++++++++--- frame/thread/bli_thrcomm.h | 12 +++-- frame/thread/bli_thrcomm_openmp.c | 65 ++++++++++++----------- frame/thread/bli_thrcomm_openmp.h | 9 ++-- frame/thread/bli_thrcomm_pthreads.c | 81 +++++++++++++++-------------- frame/thread/bli_thrcomm_pthreads.h | 11 ++-- frame/thread/bli_thrcomm_single.c | 26 ++++----- 7 files changed, 157 insertions(+), 103 deletions(-) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index f45827efd..dac705cfa 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -36,19 +36,63 @@ void* bli_thrcomm_bcast ( - thrcomm_t* communicator, + thrcomm_t* comm, dim_t id, void* to_send ) { - if ( communicator == NULL || communicator->n_threads == 1 ) return to_send; + if ( comm == NULL || comm->n_threads == 1 ) return to_send; - if ( id == 0 ) communicator->sent_object = to_send; + if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( communicator, id ); - void* object = communicator->sent_object; - bli_thrcomm_barrier( communicator, id ); + bli_thrcomm_barrier( comm, id ); + void* object = comm->sent_object; + bli_thrcomm_barrier( comm, id ); return object; } +void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) +{ + // Return early if the comm is NULL or if there is only one + // thread participating. + if ( comm == NULL || comm->n_threads == 1 ) return; + + // Read the "sense" variable. This variable is akin to a unique ID for + // the current barrier. The first n-1 threads will spin on this variable + // until it changes. The sense variable gets incremented by the last + // thread to enter the barrier, just before it exits. But it turns out + // that you don't need many unique IDs before you can wrap around. In + // fact, if everything else is working, a binary variable is sufficient, + // which is what we do here (i.e., 0 is incremented to 1, which is then + // decremented back to 0, and so forth). 
+ bool_t orig_sense = __atomic_load_n( &comm->barrier_sense, __ATOMIC_RELAXED ); + + // Register ourselves (the current thread) as having arrived by + // incrementing the barrier_threads_arrived variable. We must perform + // this increment (and a subsequent read) atomically. + dim_t my_threads_arrived = + __atomic_add_fetch( &comm->barrier_threads_arrived, 1, __ATOMIC_ACQ_REL ); + + // If the current thread was the last thread to have arrived, then + // it will take actions that effectively end and reset the barrier. + if ( my_threads_arrived == comm->n_threads ) + { + // Reset the variable tracking the number of threads that have arrived + // to zero (which returns the barrier to the "empty" state). Then + // atomically toggle the barrier sense variable. This will signal to + // the other threads (which are spinning in the branch below) that it + // is now safe to exit the barrier. + comm->barrier_threads_arrived = 0; + __atomic_fetch_xor( &comm->barrier_sense, 1, __ATOMIC_RELEASE ); + } + else + { + // If the current thread is NOT the last thread to have arrived, then + // it spins on the sense variable until that sense variable changes, at + // which time these threads will exit the barrier. + while ( __atomic_load_n( &comm->barrier_sense, __ATOMIC_ACQUIRE ) == orig_sense ) + ; // Empty loop body. + } +} + diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 593f8d7fa..59fbc6576 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -49,11 +49,13 @@ // Thread communicator prototypes.
thrcomm_t* bli_thrcomm_create( dim_t n_threads ); -void bli_thrcomm_free( thrcomm_t* communicator ); -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads ); -void bli_thrcomm_cleanup( thrcomm_t* communicator ); -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t thread_id ); -void* bli_thrcomm_bcast( thrcomm_t* communicator, dim_t inside_id, void* to_send ); +void bli_thrcomm_free( thrcomm_t* comm ); +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ); +void bli_thrcomm_cleanup( thrcomm_t* comm ); +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t thread_id ); +void* bli_thrcomm_bcast( thrcomm_t* comm, dim_t inside_id, void* to_send ); + +void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ); #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 0882d1659..5777c5b6d 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -44,63 +44,66 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } #ifndef BLIS_TREE_BARRIER -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barrier_sense = 0; - communicator->barrier_threads_arrived = 0; + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; } //'Normal' 
barrier for openmp //barrier routine taken from art of multicore programming -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - if( communicator == NULL || communicator->n_threads == 1 ) +#if 0 + if ( comm == NULL || comm->n_threads == 1 ) return; - bool_t my_sense = communicator->barrier_sense; + bool_t my_sense = comm->barrier_sense; dim_t my_threads_arrived; _Pragma( "omp atomic capture" ) - my_threads_arrived = ++(communicator->barrier_threads_arrived); + my_threads_arrived = ++(comm->barrier_threads_arrived); - if ( my_threads_arrived == communicator->n_threads ) + if ( my_threads_arrived == comm->n_threads ) { - communicator->barrier_threads_arrived = 0; - communicator->barrier_sense = !communicator->barrier_sense; + comm->barrier_threads_arrived = 0; + comm->barrier_sense = !comm->barrier_sense; } else { - volatile bool_t* listener = &communicator->barrier_sense; + volatile bool_t* listener = &comm->barrier_sense; while ( *listener == my_sense ) {} } +#endif + bli_thrcomm_barrier_atomic( comm, t_id ); } #else -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads ); - bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, communicator->barriers, 0 ); + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads ); + bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 ); } //Tree barrier used for Intel Xeon Phi @@ -145,14 +148,14 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_ return me; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void 
bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - for ( dim_t i = 0; i < communicator->n_threads; i++ ) + if ( comm == NULL ) return; + for ( dim_t i = 0; i < comm->n_threads; i++ ) { - bli_thrcomm_tree_barrier_free( communicator->barriers[i] ); + bli_thrcomm_tree_barrier_free( comm->barriers[i] ); } - bli_free_intl( communicator->barriers ); + bli_free_intl( comm->barriers ); } void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) diff --git a/frame/thread/bli_thrcomm_openmp.h b/frame/thread/bli_thrcomm_openmp.h index 6808b9772..435845b16 100644 --- a/frame/thread/bli_thrcomm_openmp.h +++ b/frame/thread/bli_thrcomm_openmp.h @@ -60,11 +60,12 @@ struct thrcomm_s #else struct thrcomm_s { - void* sent_object; - dim_t n_threads; + void* sent_object; + dim_t n_threads; - volatile bool_t barrier_sense; - dim_t barrier_threads_arrived; + //volatile bool_t barrier_sense; + bool_t barrier_sense; + dim_t barrier_threads_arrived; }; #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 230b63905..27fb37e6a 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -43,81 +43,84 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } #ifdef BLIS_USE_PTHREAD_BARRIER -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - pthread_barrier_init( &communicator->barrier, NULL, n_threads ); + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + 
pthread_barrier_init( &comm->barrier, NULL, n_threads ); } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - pthread_barrier_destroy( &communicator->barrier ); + if ( comm == NULL ) return; + pthread_barrier_destroy( &comm->barrier ); } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - pthread_barrier_wait( &communicator->barrier ); + pthread_barrier_wait( &comm->barrier ); } #else -void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) { - if ( communicator == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->sense = 0; - communicator->threads_arrived = 0; + if ( comm == NULL ) return; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; -#ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_init( &communicator->mutex, NULL ); -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// pthread_mutex_init( &comm->mutex, NULL ); +//#endif } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { -#ifdef BLIS_USE_PTHREAD_MUTEX - if ( communicator == NULL ) return; - pthread_mutex_destroy( &communicator->mutex ); -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// if ( comm == NULL ) return; +// pthread_mutex_destroy( &comm->mutex ); +//#endif } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { - if ( communicator == NULL || communicator->n_threads == 1 ) return; - bool_t my_sense = communicator->sense; +#if 0 + if ( comm == NULL || comm->n_threads == 1 ) return; + bool_t my_sense = comm->sense; dim_t my_threads_arrived; #ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_lock( &communicator->mutex ); - my_threads_arrived = 
++(communicator->threads_arrived); - pthread_mutex_unlock( &communicator->mutex ); + pthread_mutex_lock( &comm->mutex ); + my_threads_arrived = ++(comm->threads_arrived); + pthread_mutex_unlock( &comm->mutex ); #else - my_threads_arrived = __sync_add_and_fetch(&(communicator->threads_arrived), 1); + my_threads_arrived = __sync_add_and_fetch(&(comm->threads_arrived), 1); #endif - if ( my_threads_arrived == communicator->n_threads ) + if ( my_threads_arrived == comm->n_threads ) { - communicator->threads_arrived = 0; - communicator->sense = !communicator->sense; + comm->threads_arrived = 0; + comm->sense = !comm->sense; } else { - volatile bool_t* listener = &communicator->sense; + volatile bool_t* listener = &comm->sense; while( *listener == my_sense ) {} } +#endif + bli_thrcomm_barrier_atomic( comm, t_id ); } #endif diff --git a/frame/thread/bli_thrcomm_pthreads.h b/frame/thread/bli_thrcomm_pthreads.h index 1c807772d..286387bcf 100644 --- a/frame/thread/bli_thrcomm_pthreads.h +++ b/frame/thread/bli_thrcomm_pthreads.h @@ -54,12 +54,13 @@ struct thrcomm_s void* sent_object; dim_t n_threads; -#ifdef BLIS_USE_PTHREAD_MUTEX - pthread_mutex_t mutex; -#endif +//#ifdef BLIS_USE_PTHREAD_MUTEX +// pthread_mutex_t mutex; +//#endif - volatile bool_t sense; - volatile dim_t threads_arrived; + //volatile bool_t barrier_sense; + bool_t barrier_sense; + dim_t barrier_threads_arrived; }; #endif diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index c038f59a0..76b48ca95 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -44,29 +44,29 @@ thrcomm_t* bli_thrcomm_create( dim_t n_threads ) return comm; } -void bli_thrcomm_free( thrcomm_t* communicator ) +void bli_thrcomm_free( thrcomm_t* comm ) { - if ( communicator == NULL ) return; - bli_thrcomm_cleanup( communicator ); - bli_free_intl( communicator ); + if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); + bli_free_intl( comm ); } -void bli_thrcomm_init( 
thrcomm_t* communicator, dim_t n_threads ) +void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; - communicator->sent_object = NULL; - communicator->n_threads = n_threads; - communicator->barrier_sense = 0; - communicator->barrier_threads_arrived = 0; + comm->sent_object = NULL; + comm->n_threads = n_threads; + comm->barrier_sense = 0; + comm->barrier_threads_arrived = 0; } -void bli_thrcomm_cleanup( thrcomm_t* communicator ) +void bli_thrcomm_cleanup( thrcomm_t* comm ) { - if ( communicator == NULL ) return; + if ( comm == NULL ) return; } -void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) +void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) { return; } From 5caaba2d61cbbc36d63102a0786ece28ff797f72 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 19 Jul 2017 13:51:53 -0500 Subject: [PATCH 40/64] Added --force-version=STRING option to configure. Details: - Added an option to configure that allows the user to force an arbitrary version string at configure-time. The help text also now describes the usage information. - Changed the way the version string is communicated to the Makefile. Previously, it was read into the VERSION variable from the 'version' file via $(shell cat ...). Now, the VERSION variable is instead set in config.mk (via a configure-substituted anchor from config.mk.in). --- Makefile | 4 ---- build/config.mk.in | 4 ++++ configure | 36 ++++++++++++++++++++++++++++-------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index d74eba889..6a1bab97f 100644 --- a/Makefile +++ b/Makefile @@ -85,9 +85,6 @@ TESTSUITE_CONF_GEN := input.general TESTSUITE_CONF_OPS := input.operations TESTSUITE_OUT_FILE := output.testsuite -# The name of the file where the version string is stored. -VERSION_FILE := version - # The name of the "special" directories, which contain source code that # use non-standard compiler flags. 
NOOPT_DIR := noopt @@ -141,7 +138,6 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME) # Construct the architecture-version string, which will be used to name the # library upon installation. -VERSION := $(shell cat $(DIST_PATH)/$(VERSION_FILE)) VERS_CONF := $(VERSION)-$(CONFIG_NAME) # --- Library names --- diff --git a/build/config.mk.in b/build/config.mk.in index e7a3f3235..ef2ccfc70 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -36,6 +36,10 @@ ifndef CONFIG_MK_INCLUDED CONFIG_MK_INCLUDED := yes +# The version string. This could be the official string or a custom +# string forced at configure-time. +VERSION := @version@ + # The name of the configuration sub-directory. CONFIG_NAME := @config_name@ diff --git a/configure b/configure index 7aabc5b78..9edfaa98b 100755 --- a/configure +++ b/configure @@ -123,6 +123,12 @@ print_usage() echo " compatibility layer. This automatically enables the" echo " BLAS compatibility layer as well." echo " " + echo " --force-version=STRING" + echo " " + echo " Force configure to use an arbitrary version string" + echo " STRING. This option may be useful when repackaging" + echo " custom versions of BLIS by outside organizations." + echo " " echo " -h, --help Output this information and quit." echo " " echo " Environment Variables:" @@ -232,6 +238,7 @@ main() blas2blis_int_type_size=32 enable_blas2blis='yes' enable_cblas='no' + force_version='no' # The path to the auto-detection script. auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh" @@ -247,14 +254,6 @@ main() dummy_file='_blis_dir_detect.tmp' - # Check whether we need to update the version file. - ${update_version_file_sh} -o "${script_name}" "${version_filepath}" - - - # Query which version of BLIS this is. - version=$(cat ${version_filepath}) - - # Process our command line options. 
while getopts ":hp:d:t:qi:b:-:" opt; do case $opt in @@ -323,6 +322,9 @@ main() disable-cblas) enable_cblas='no' ;; + force-version=*) + force_version=${OPTARG#*=} + ;; *) print_usage ;; @@ -375,10 +377,27 @@ main() done + # Check whether we need to update the version file. + ${update_version_file_sh} -o "${script_name}" "${version_filepath}" + + + # Query which version of BLIS this is. + version=$(cat ${version_filepath}) + + # Initial message. echo "${script_name}: starting configuration of BLIS ${version}." + # Check if the user requested a custom version string. + if [ "x${force_version}" = "xno" ]; then + echo "${script_name}: configuring with official version string." + else + echo "${script_name}: configuring with custom version string '${force_version}'." + version="${force_version}" + fi + + # Set config_name based on the number of arguments leftover (after command # line option processing). if [ $# = "0" ]; then @@ -574,6 +593,7 @@ main() # to config_mk_out. echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}" cat "${config_mk_in_path}" \ + | sed "s/@version@/${version}/g" \ | sed "s/@config_name@/${config_name}/g" \ | sed "s/@dist_path@/${dist_path_esc}/g" \ | sed "s/@CC@/${cc_esc}/g" \ From 1f1ec0db9380b87679d5c771c4594daa1cfc5f0d Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 19 Jul 2017 15:40:48 -0500 Subject: [PATCH 41/64] Updated ar option list used by all configurations. Details: - Dropped 'u' from the list of modifiers passed into the library archiver ar. Previously, "cru" was used, while now we employ only "cr". This change was prompted by a warning observed on Ubuntu 16.04: ar: `u' modifier ignored since `D' is the default (see `U') This caused me to realize that the default mode causes timestamps to be zero, and thus the 'u' option, which causes only changed object files to be inserted, is not applicable. 
--- config/armv7a/make_defs.mk | 2 +- config/armv8a/make_defs.mk | 2 +- config/bgq/make_defs.mk | 2 +- config/bulldozer/make_defs.mk | 2 +- config/carrizo/make_defs.mk | 2 +- config/cortex-a15/make_defs.mk | 2 +- config/cortex-a9/make_defs.mk | 2 +- config/dunnington/make_defs.mk | 2 +- config/emscripten/make_defs.mk | 2 +- config/haswell/make_defs.mk | 2 +- config/knl/make_defs.mk | 2 +- config/loongson3a/make_defs.mk | 2 +- config/mic/make_defs.mk | 2 +- config/piledriver/make_defs.mk | 2 +- config/power7/make_defs.mk | 2 +- config/reference/make_defs.mk | 2 +- config/sandybridge/make_defs.mk | 2 +- config/template/make_defs.mk | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 9d1b51d0a..8539e1d29 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 6d09af5cc..56dd3074e 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 57c9899a0..07f6792db 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -59,7 +59,7 @@ CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunr # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 0546a474f..582354e96 100644 --- a/config/bulldozer/make_defs.mk +++ 
b/config/bulldozer/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index f52d1dd67..94808d466 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index 053e11cbb..c4c47467e 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index 053e11cbb..c4c47467e 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 8d07f2177..eec2f5a56 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 4353d65cf..814603e0b 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -58,7 +58,7 @@ CVECFLAGS := # --- Determine the archiver and related 
flags --- AR := emar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 8c739607a..4a4e6e494 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index 104abafe2..a3db40981 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -94,7 +94,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index 8fd9fb65a..89ca32929 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 8e7738b44..e82811357 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index b5c3f159c..93cd1f2c8 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -76,7 +76,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/power7/make_defs.mk 
b/config/power7/make_defs.mk index 765344f79..f35ffdfff 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index f75b9ec55..89bcca269 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -81,7 +81,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index d91df8b68..7bf48d2a4 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -83,7 +83,7 @@ endif # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index d98452553..e563d9308 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -72,7 +72,7 @@ CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- AR := ar -ARFLAGS := cru +ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) From 8823f91a14638ce6f4e45e67df03212bb61609d6 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 20 Jul 2017 10:04:34 -0500 Subject: [PATCH 42/64] Add fallbacks to __sync_* or __c11_atomic_* builtins when __atomic_* is not supported. Fixes #143. 
--- frame/thread/bli_thrcomm.c | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index dac705cfa..5e7f21f42 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -52,6 +52,46 @@ void* bli_thrcomm_bcast return object; } +// Swap out __atomic_* builtins for __sync_* builtins for: +// - BG/Q +// - gcc <4.7 (including icc through gcc compatibility layer) +// - clang without c11 atomic builtins +#if defined(__bgq__) || \ + (defined(__GNUC__) && (__GNUC__ < 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ < 7))) || \ + (defined(__clang__) && !__has_extension(c_atomic)) + +#define __ATOMIC_RELAXED +#define __ATOMIC_ACQUIRE +#define __ATOMIC_RELEASE +#define __ATOMIC_ACQ_REL + +#define __atomic_load_n(ptr, constraint) \ + __sync_fetch_and_add(ptr, 0) +#define __atomic_add_fetch(ptr, value, constraint) \ + __sync_add_and_fetch(ptr, value) +#define __atomic_fetch_add(ptr, value, constraint) \ + __sync_fetch_and_add(ptr, value) +#define __atomic_fetch_xor(ptr, value, constraint) \ + __sync_fetch_and_xor(ptr, value) + +#endif + +// Swap out __atomic_* builtins for _c11_atomic_* builtins for +// - clang with c11 atomic builtins +#if defined(__clang__) && __has_extension(c_atomic) + +#define __atomic_load_n(ptr, constraint) \ + __c11_atomic_load(ptr, constraint) +#define __atomic_add_fetch(ptr, value, constraint) \ + (__c11_fetch_add(ptr, value, constraint) + value) +#define __atomic_fetch_add(ptr, value, constraint) \ + __c11_fetch_add(ptr, value, constraint) +#define __atomic_fetch_xor(ptr, value, constraint) \ + __c11_fetch_xor(ptr, value, constraint) + +#endif + void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) { // Return early if the comm is NULL or if there is only one From 7425d0744d9e9cd29a887120e57c2b43ba287040 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 20 Jul 2017 12:54:58 -0500 Subject: [PATCH 43/64] Add default #define for 
__has_extension. --- frame/thread/bli_thrcomm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index 5e7f21f42..b50218a77 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -52,6 +52,10 @@ void* bli_thrcomm_bcast return object; } +#ifndef __has_extension +#define __has_extension(x) 0 +#endif + // Swap out __atomic_* builtins for __sync_* builtins for: // - BG/Q // - gcc <4.7 (including icc through gcc compatibility layer) From 733faf848dcc54834fcdfbb0185dc644978d8864 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 20 Jul 2017 14:50:13 -0500 Subject: [PATCH 44/64] Clang can't make up its mind what to support. --- frame/thread/bli_thrcomm.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index b50218a77..a06f49523 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -52,18 +52,8 @@ void* bli_thrcomm_bcast return object; } -#ifndef __has_extension -#define __has_extension(x) 0 -#endif - -// Swap out __atomic_* builtins for __sync_* builtins for: -// - BG/Q -// - gcc <4.7 (including icc through gcc compatibility layer) -// - clang without c11 atomic builtins -#if defined(__bgq__) || \ - (defined(__GNUC__) && (__GNUC__ < 4 || \ - (__GNUC__ == 4 && __GNUC_MINOR__ < 7))) || \ - (defined(__clang__) && !__has_extension(c_atomic)) +// Use __sync_* builtins (assumed available) if __atomic_* ones are not present.
+#ifndef __ATOMIC_RELAXED #define __ATOMIC_RELAXED #define __ATOMIC_ACQUIRE @@ -81,21 +71,6 @@ void* bli_thrcomm_bcast #endif -// Swap out __atomic_* builtins for _c11_atomic_* builtins for -// - clang with c11 atomic builtins -#if defined(__clang__) && __has_extension(c_atomic) - -#define __atomic_load_n(ptr, constraint) \ - __c11_atomic_load(ptr, constraint) -#define __atomic_add_fetch(ptr, value, constraint) \ - (__c11_fetch_add(ptr, value, constraint) + value) -#define __atomic_fetch_add(ptr, value, constraint) \ - __c11_fetch_add(ptr, value, constraint) -#define __atomic_fetch_xor(ptr, value, constraint) \ - __c11_fetch_xor(ptr, value, constraint) - -#endif - void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) { // Return early if the comm is NULL or if there is only one From c63980f4ca750618f359031d0691289b1abf5146 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 29 Jul 2017 14:53:39 -0500 Subject: [PATCH 45/64] Moved 'family' field from cntx_t to cntl_t. Details: - Removed the family field inside the cntx_t struct and re-added it to the cntl_t struct. Updated all accessor functions/macros accordingly, as well as all consumers and intermediaries of the family parameter (such as bli_l3_thread_decorator(), bli_l3_direct(), and bli_l3_prune_*()). This change was motivated by the desire to keep the context limited, as much as possible, to information about the computing environment. (The family field, by contrast, is a descriptor about the operation being executed.) - Added additional functions to bli_blksz_*() API. - Added additional functions to bli_cntx_*() API. - Minor updates to bli_func.c, bli_mbool.c. - Removed 'obj' from bli_blksz_*() API names. - Removed 'obj' from bli_cntx_*() API names. - Removed 'obj' from bli_cntl_*(), bli_*_cntl_*() API names. Renamed routines that operate only on a single struct to contain the "_node" suffix to differentiate with those routines that operate on the entire tree. 
- Added enums for packm and unpackm kernels to bli_type_defs.h. - Removed BLIS_1F and BLIS_VF from bszid_t definition in bli_type_defs.h. They weren't being used and probably never will be. --- frame/1/bli_l1v_cntx.c | 16 +- frame/1/other/packv/bli_packv_cntl.c | 4 +- frame/1/other/scalv/bli_scalv_cntl.c | 2 +- frame/1/other/unpackv/bli_unpackv_cntl.c | 2 +- frame/1d/bli_l1d_cntx.c | 4 +- frame/1f/bli_l1f_cntx.c | 16 +- frame/1m/bli_l1m_cntx.c | 8 +- frame/1m/packm/bli_packm_cntl.c | 5 +- frame/1m/packm/bli_packm_cntl.h | 2 +- frame/1m/packm/bli_packm_cntx.c | 4 +- frame/1m/scalm/bli_scalm_cntl.c | 5 +- frame/1m/scalm/bli_scalm_cntl.h | 2 +- frame/1m/unpackm/bli_unpackm_cntl.c | 5 +- frame/1m/unpackm/bli_unpackm_cntl.h | 2 +- frame/2/bli_l2_cntx.c | 16 +- frame/2/gemv/other/bli_gemv_cntl.c | 16 +- frame/2/ger/other/bli_ger_cntl.c | 16 +- frame/2/hemv/other/bli_hemv_cntl.c | 8 +- frame/2/her/other/bli_her_cntl.c | 8 +- frame/2/her2/other/bli_her2_cntl.c | 8 +- frame/2/trmv/other/bli_trmv_cntl.c | 8 +- frame/2/trsv/other/bli_trsv_cntl.c | 8 +- frame/3/bli_l3_blocksize.c | 5 +- frame/3/bli_l3_blocksize.h | 14 +- frame/3/bli_l3_cntl.c | 11 +- frame/3/bli_l3_cntl.h | 3 +- frame/3/bli_l3_cntx.c | 4 +- frame/3/bli_l3_direct.c | 4 +- frame/3/bli_l3_direct.h | 2 +- frame/3/bli_l3_prune.c | 8 +- frame/3/bli_l3_prune.h | 2 +- frame/3/gemm/bli_gemm_blk_var1.c | 4 +- frame/3/gemm/bli_gemm_blk_var2.c | 4 +- frame/3/gemm/bli_gemm_blk_var3.c | 8 +- frame/3/gemm/bli_gemm_cntl.c | 43 +- frame/3/gemm/bli_gemm_cntl.h | 3 +- frame/3/gemm/bli_gemm_front.c | 4 +- frame/3/hemm/bli_hemm_front.c | 4 +- frame/3/her2k/bli_her2k_front.c | 5 +- frame/3/herk/bli_herk_front.c | 4 +- frame/3/symm/bli_symm_front.c | 4 +- frame/3/syr2k/bli_syr2k_front.c | 5 +- frame/3/syrk/bli_syrk_front.c | 4 +- frame/3/trmm/bli_trmm_front.c | 4 +- frame/3/trmm3/bli_trmm3_front.c | 4 +- frame/3/trsm/bli_trsm_blk_var1.c | 4 +- frame/3/trsm/bli_trsm_blk_var2.c | 4 +- frame/3/trsm/bli_trsm_blk_var3.c | 4 +- 
frame/3/trsm/bli_trsm_cntl.c | 47 +- frame/3/trsm/bli_trsm_cntl.h | 3 +- frame/3/trsm/bli_trsm_front.c | 4 +- frame/3/trsm/old/bli_trsm_cntl.c | 46 +- frame/3/trsm/old/bli_trsm_cntl.h | 2 +- frame/base/bli_blksz.c | 77 +++- frame/base/bli_blksz.h | 41 +- frame/base/bli_cntl.c | 37 +- frame/base/bli_cntl.h | 23 +- frame/base/bli_cntx.c | 557 +++++++++++++++++------ frame/base/bli_cntx.h | 78 ++-- frame/base/bli_func.c | 62 ++- frame/base/bli_func.h | 31 +- frame/base/bli_gks.c | 6 - frame/base/bli_mbool.c | 39 +- frame/base/bli_mbool.h | 26 +- frame/include/bli_type_defs.h | 85 +++- frame/ind/cntx/bli_gemmind_cntx.c | 16 +- frame/ind/cntx/bli_trsmind_cntx.c | 6 +- frame/thread/bli_thrcomm_openmp.c | 5 +- frame/thread/bli_thrcomm_pthreads.c | 9 +- frame/thread/bli_thrcomm_single.c | 5 +- frame/thread/bli_thread.c | 28 +- frame/thread/bli_thread.h | 1 + testsuite/src/test_libblis.c | 2 +- 73 files changed, 1065 insertions(+), 501 deletions(-) diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index 149c20320..243a3d062 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addv, BLIS_ADDV_KER ) @@ -70,7 +70,7 @@ GENFRONT( swapv, BLIS_SWAPV_KER ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ @@ -84,7 +84,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) @@ -95,7 +95,7 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -106,7 +106,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpyv, BLIS_AXPYV_KER, addv ) @@ -118,7 +118,7 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ @@ -130,7 +130,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( scal2v, BLIS_SCAL2V_KER, setv, copyv ) diff --git a/frame/1/other/packv/bli_packv_cntl.c b/frame/1/other/packv/bli_packv_cntl.c index 13f90a429..b81a6e5d1 100644 --- a/frame/1/other/packv/bli_packv_cntl.c +++ b/frame/1/other/packv/bli_packv_cntl.c @@ -47,7 +47,7 @@ void bli_packv_cntl_init( void ) void bli_packv_cntl_finalize( void ) { - bli_cntl_obj_free( packv_cntl ); + bli_cntl_free_node( packv_cntl ); } packv_t* bli_packv_cntl_obj_create( impl_t impl_type, @@ -105,7 +105,7 @@ cntl_t* bli_packv_cntl_obj_create // that no blocksize partitioning is performed. 
bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( BLIS_NO_PART, var_func, diff --git a/frame/1/other/scalv/bli_scalv_cntl.c b/frame/1/other/scalv/bli_scalv_cntl.c index 9edb6162c..c75977fa1 100644 --- a/frame/1/other/scalv/bli_scalv_cntl.c +++ b/frame/1/other/scalv/bli_scalv_cntl.c @@ -44,7 +44,7 @@ void bli_scalv_cntl_init() void bli_scalv_cntl_finalize() { - bli_cntl_obj_free( scalv_cntl ); + bli_cntl_free_node( scalv_cntl ); } diff --git a/frame/1/other/unpackv/bli_unpackv_cntl.c b/frame/1/other/unpackv/bli_unpackv_cntl.c index 1e1ab93fb..52858fc0b 100644 --- a/frame/1/other/unpackv/bli_unpackv_cntl.c +++ b/frame/1/other/unpackv/bli_unpackv_cntl.c @@ -44,7 +44,7 @@ void bli_unpackv_cntl_init() void bli_unpackv_cntl_finalize() { - bli_cntl_obj_free( unpackv_cntl ); + bli_cntl_free_node( unpackv_cntl ); } unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type, diff --git a/frame/1d/bli_l1d_cntx.c b/frame/1d/bli_l1d_cntx.c index 443dc20f7..f22631a5d 100644 --- a/frame/1d/bli_l1d_cntx.c +++ b/frame/1d/bli_l1d_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addd, addv ) diff --git a/frame/1f/bli_l1f_cntx.c b/frame/1f/bli_l1f_cntx.c index 58ca4a07c..8e786f2ed 100644 --- a/frame/1f/bli_l1f_cntx.c +++ b/frame/1f/bli_l1f_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -54,7 +54,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) @@ -65,7 +65,7 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -77,7 +77,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) @@ -88,7 +88,7 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -105,7 +105,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) @@ -116,7 +116,7 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -135,7 +135,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ } \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( dotxf, BLIS_DOTXF_KER, dotv, dotxv ) diff --git a/frame/1m/bli_l1m_cntx.c b/frame/1m/bli_l1m_cntx.c index 7eb3dcd4c..d7ede7c91 100644 --- a/frame/1m/bli_l1m_cntx.c +++ b/frame/1m/bli_l1m_cntx.c @@ -43,7 +43,7 @@ \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ PASTEMAC(depname,_cntx_init)( dt, cntx ); \ @@ -51,7 +51,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( addm, addv ) @@ -66,7 +66,7 @@ GENFRONT( subm, subv ) \ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernel dependencies. 
*/ \ PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ @@ -75,7 +75,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( copym, copyv, setv ) diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 67b01fffb..6effbb522 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_packm_cntl_obj_create +cntl_t* bli_packm_cntl_create_node ( void* var_func, void* packm_var_func, @@ -69,8 +69,9 @@ cntl_t* bli_packm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, params, diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 057a512ed..ab22e8621 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -80,7 +80,7 @@ typedef struct packm_params_s packm_params_t; // ----------------------------------------------------------------------------- -cntl_t* bli_packm_cntl_obj_create +cntl_t* bli_packm_cntl_create_node ( void* var_func, void* packm_var_func, diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 2f4e0b030..75fa24d67 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -41,7 +41,7 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) { - bli_cntx_obj_create( cntx ); + bli_cntx_create( cntx ); // Initialize the context with kernels that may be needed for the // current operation. 
@@ -57,5 +57,5 @@ void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) void bli_packm_cntx_finalize( cntx_t* cntx ) { - bli_cntx_obj_free( cntx ); + bli_cntx_free( cntx ); } diff --git a/frame/1m/scalm/bli_scalm_cntl.c b/frame/1m/scalm/bli_scalm_cntl.c index f6008a9a3..24c12bc9e 100644 --- a/frame/1m/scalm/bli_scalm_cntl.c +++ b/frame/1m/scalm/bli_scalm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_scalm_cntl_obj_create +cntl_t* bli_scalm_cntl_create_node ( void* var_func, cntl_t* sub_node @@ -46,8 +46,9 @@ cntl_t* bli_scalm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. - cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, NULL, diff --git a/frame/1m/scalm/bli_scalm_cntl.h b/frame/1m/scalm/bli_scalm_cntl.h index 4029a4f10..d6160dca8 100644 --- a/frame/1m/scalm/bli_scalm_cntl.h +++ b/frame/1m/scalm/bli_scalm_cntl.h @@ -33,7 +33,7 @@ */ -cntl_t* bli_scalm_cntl_obj_create +cntl_t* bli_scalm_cntl_create_node ( void* var_func, cntl_t* sub_node diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 2900cb3b8..852b0c81e 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -34,7 +34,7 @@ #include "blis.h" -cntl_t* bli_unpackm_cntl_obj_create +cntl_t* bli_unpackm_cntl_create_node ( void* var_func, void* unpackm_var_func, @@ -55,8 +55,9 @@ cntl_t* bli_unpackm_cntl_obj_create // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. 
- cntl = bli_cntl_obj_create + cntl = bli_cntl_create_node ( + BLIS_NOID, BLIS_NO_PART, var_func, params, diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 82d9727fc..96278d406 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -45,7 +45,7 @@ typedef struct unpackm_params_s unpackm_params_t; // ----------------------------------------------------------------------------- -cntl_t* bli_unpackm_cntl_obj_create +cntl_t* bli_unpackm_cntl_create_node ( void* var_func, void* unpackm_var_func, diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index fdfe27a85..df6e9441f 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -44,7 +44,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -77,7 +77,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( gemv ) @@ -91,7 +91,7 @@ GENFRONT( trsv ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -111,7 +111,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( ger ) @@ -125,7 +125,7 @@ GENFRONT( syr ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. 
*/ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -163,7 +163,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( hemv ) @@ -176,7 +176,7 @@ GENFRONT( symv ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ @@ -198,7 +198,7 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. */ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } GENFRONT( her2 ) diff --git a/frame/2/gemv/other/bli_gemv_cntl.c b/frame/2/gemv/other/bli_gemv_cntl.c index ecedeaca4..4ccba4ff0 100644 --- a/frame/2/gemv/other/bli_gemv_cntl.c +++ b/frame/2/gemv/other/bli_gemv_cntl.c @@ -152,17 +152,17 @@ void bli_gemv_cntl_init() void bli_gemv_cntl_finalize() { - bli_cntl_obj_free( gemv_cntl_bs_ke_dot ); - bli_cntl_obj_free( gemv_cntl_bs_ke_axpy ); + bli_cntl_free_node( gemv_cntl_bs_ke_dot ); + bli_cntl_free_node( gemv_cntl_bs_ke_axpy ); - bli_cntl_obj_free( gemv_cntl_rp_bs_dot ); - bli_cntl_obj_free( gemv_cntl_rp_bs_axpy ); + bli_cntl_free_node( gemv_cntl_rp_bs_dot ); + bli_cntl_free_node( gemv_cntl_rp_bs_axpy ); - bli_cntl_obj_free( gemv_cntl_cp_bs_dot ); - bli_cntl_obj_free( gemv_cntl_cp_bs_axpy ); + bli_cntl_free_node( gemv_cntl_cp_bs_dot ); + bli_cntl_free_node( gemv_cntl_cp_bs_axpy ); - bli_cntl_obj_free( gemv_cntl_ge_dot ); - bli_cntl_obj_free( gemv_cntl_ge_axpy ); + bli_cntl_free_node( gemv_cntl_ge_dot ); + bli_cntl_free_node( gemv_cntl_ge_axpy ); } diff --git 
a/frame/2/ger/other/bli_ger_cntl.c b/frame/2/ger/other/bli_ger_cntl.c index 16565ef02..6e35b5f6f 100644 --- a/frame/2/ger/other/bli_ger_cntl.c +++ b/frame/2/ger/other/bli_ger_cntl.c @@ -145,17 +145,17 @@ void bli_ger_cntl_init() void bli_ger_cntl_finalize() { - bli_cntl_obj_free( ger_cntl_bs_ke_row ); - bli_cntl_obj_free( ger_cntl_bs_ke_col ); + bli_cntl_free_node( ger_cntl_bs_ke_row ); + bli_cntl_free_node( ger_cntl_bs_ke_col ); - bli_cntl_obj_free( ger_cntl_rp_bs_row ); - bli_cntl_obj_free( ger_cntl_rp_bs_col ); + bli_cntl_free_node( ger_cntl_rp_bs_row ); + bli_cntl_free_node( ger_cntl_rp_bs_col ); - bli_cntl_obj_free( ger_cntl_cp_bs_row ); - bli_cntl_obj_free( ger_cntl_cp_bs_col ); + bli_cntl_free_node( ger_cntl_cp_bs_row ); + bli_cntl_free_node( ger_cntl_cp_bs_col ); - bli_cntl_obj_free( ger_cntl_ge_row ); - bli_cntl_obj_free( ger_cntl_ge_col ); + bli_cntl_free_node( ger_cntl_ge_row ); + bli_cntl_free_node( ger_cntl_ge_col ); } diff --git a/frame/2/hemv/other/bli_hemv_cntl.c b/frame/2/hemv/other/bli_hemv_cntl.c index 8505f615c..4bed7b012 100644 --- a/frame/2/hemv/other/bli_hemv_cntl.c +++ b/frame/2/hemv/other/bli_hemv_cntl.c @@ -108,10 +108,10 @@ void bli_hemv_cntl_init() void bli_hemv_cntl_finalize() { - bli_cntl_obj_free( hemv_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( hemv_cntl_bs_ke_lcol_urow ); - bli_cntl_obj_free( hemv_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( hemv_cntl_ge_lcol_urow ); + bli_cntl_free_node( hemv_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( hemv_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( hemv_cntl_ge_lrow_ucol ); + bli_cntl_free_node( hemv_cntl_ge_lcol_urow ); } diff --git a/frame/2/her/other/bli_her_cntl.c b/frame/2/her/other/bli_her_cntl.c index 932306c21..28ed63f12 100644 --- a/frame/2/her/other/bli_her_cntl.c +++ b/frame/2/her/other/bli_her_cntl.c @@ -97,10 +97,10 @@ void bli_her_cntl_init() void bli_her_cntl_finalize() { - bli_cntl_obj_free( her_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( her_cntl_bs_ke_lcol_urow ); - 
bli_cntl_obj_free( her_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( her_cntl_ge_lcol_urow ); + bli_cntl_free_node( her_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( her_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( her_cntl_ge_lrow_ucol ); + bli_cntl_free_node( her_cntl_ge_lcol_urow ); } diff --git a/frame/2/her2/other/bli_her2_cntl.c b/frame/2/her2/other/bli_her2_cntl.c index 4a0f5d0f8..199e74c3c 100644 --- a/frame/2/her2/other/bli_her2_cntl.c +++ b/frame/2/her2/other/bli_her2_cntl.c @@ -101,10 +101,10 @@ void bli_her2_cntl_init() void bli_her2_cntl_finalize() { - bli_cntl_obj_free( her2_cntl_bs_ke_lrow_ucol ); - bli_cntl_obj_free( her2_cntl_bs_ke_lcol_urow ); - bli_cntl_obj_free( her2_cntl_ge_lrow_ucol ); - bli_cntl_obj_free( her2_cntl_ge_lcol_urow ); + bli_cntl_free_node( her2_cntl_bs_ke_lrow_ucol ); + bli_cntl_free_node( her2_cntl_bs_ke_lcol_urow ); + bli_cntl_free_node( her2_cntl_ge_lrow_ucol ); + bli_cntl_free_node( her2_cntl_ge_lcol_urow ); } diff --git a/frame/2/trmv/other/bli_trmv_cntl.c b/frame/2/trmv/other/bli_trmv_cntl.c index 5fbf872aa..fff406365 100644 --- a/frame/2/trmv/other/bli_trmv_cntl.c +++ b/frame/2/trmv/other/bli_trmv_cntl.c @@ -98,10 +98,10 @@ void bli_trmv_cntl_init() void bli_trmv_cntl_finalize() { - bli_cntl_obj_free( trmv_cntl_bs_ke_nrow_tcol ); - bli_cntl_obj_free( trmv_cntl_bs_ke_ncol_trow ); - bli_cntl_obj_free( trmv_cntl_ge_nrow_tcol ); - bli_cntl_obj_free( trmv_cntl_ge_ncol_trow ); + bli_cntl_free_node( trmv_cntl_bs_ke_nrow_tcol ); + bli_cntl_free_node( trmv_cntl_bs_ke_ncol_trow ); + bli_cntl_free_node( trmv_cntl_ge_nrow_tcol ); + bli_cntl_free_node( trmv_cntl_ge_ncol_trow ); } diff --git a/frame/2/trsv/other/bli_trsv_cntl.c b/frame/2/trsv/other/bli_trsv_cntl.c index 71de48d3c..9eedb5a9f 100644 --- a/frame/2/trsv/other/bli_trsv_cntl.c +++ b/frame/2/trsv/other/bli_trsv_cntl.c @@ -101,10 +101,10 @@ void bli_trsv_cntl_init() void bli_trsv_cntl_finalize() { - bli_cntl_obj_free( trsv_cntl_bs_ke_nrow_tcol ); - bli_cntl_obj_free( 
trsv_cntl_bs_ke_ncol_trow ); - bli_cntl_obj_free( trsv_cntl_ge_nrow_tcol ); - bli_cntl_obj_free( trsv_cntl_ge_ncol_trow ); + bli_cntl_free_node( trsv_cntl_bs_ke_nrow_tcol ); + bli_cntl_free_node( trsv_cntl_bs_ke_ncol_trow ); + bli_cntl_free_node( trsv_cntl_ge_nrow_tcol ); + bli_cntl_free_node( trsv_cntl_ge_ncol_trow ); } diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 630cf03a5..d25f5f924 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -43,10 +43,11 @@ dim_t bli_l3_determine_kc obj_t* a, obj_t* b, bszid_t bszid, - cntx_t* cntx + cntx_t* cntx, + cntl_t* cntl ) { - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 8f9f7ad80..02250efc0 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -32,6 +32,18 @@ */ +dim_t bli_l3_determine_kc + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* a, + obj_t* b, + bszid_t bszid, + cntx_t* cntx, + cntl_t* cntl + ); + #undef GENPROT #define GENPROT( opname ) \ @@ -47,8 +59,6 @@ dim_t PASTEMAC0(opname) \ cntx_t* cntx \ ); -GENPROT( l3_determine_kc ) - GENPROT( gemm_determine_kc ) GENPROT( herk_determine_kc ) GENPROT( trmm_determine_kc ) diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 4fe3fe7f5..db821b811 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -37,10 +37,10 @@ void bli_l3_cntl_create_if ( + opid_t family, obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t** cntl_use ) @@ -49,8 +49,6 @@ void bli_l3_cntl_create_if // tree as a function of the operation family. 
if ( cntl_orig == NULL ) { - opid_t family = bli_cntx_get_family( cntx ); - if ( family == BLIS_GEMM || family == BLIS_HERK || family == BLIS_TRMM ) @@ -73,6 +71,10 @@ void bli_l3_cntl_create_if // instead (so that threads can use its local tree as a place to // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( cntl_orig ); + + // Recursively set the family fields of the newly copied control tree + // nodes. + bli_cntl_mark_family( family, *cntl_use ); } } @@ -81,7 +83,6 @@ void bli_l3_cntl_free_if obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t* cntl_use, thrinfo_t* thread @@ -91,7 +92,7 @@ void bli_l3_cntl_free_if // been created, so we now must free it. if ( cntl_orig == NULL ) { - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl_use ); if ( family == BLIS_GEMM || family == BLIS_HERK || diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index dc0aeb869..3bdd8b43f 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -39,10 +39,10 @@ void bli_l3_cntl_create_if ( + opid_t family, obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t** cntl_use ); @@ -52,7 +52,6 @@ void bli_l3_cntl_free_if obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx, cntl_t* cntl_orig, cntl_t* cntl_use, thrinfo_t* thread diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 161e68160..a8441fa79 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -41,7 +41,7 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -76,7 +76,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx ) void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. 
- bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 993501541..f1c661007 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -39,11 +39,11 @@ dir_t bli_l3_direct obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ) { // Query the operation family. - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 7b88ba51f..021dfde74 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -37,7 +37,7 @@ dir_t bli_l3_direct obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ); // ----------------------------------------------------------------------------- diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index f908bbb64..a14c543d8 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -40,11 +40,11 @@ void bli_l3_prune_unref_mparts_m obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + cntl_t* cntl ) { // Query the operation family. - opid_t family = bli_cntx_family( cntx ); + opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); @@ -61,11 +61,11 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntx_t* cntx \ + cntl_t* cntl \ ) \ { \ /* Query the operation family. */ \ - opid_t family = bli_cntx_family( cntx ); \ + opid_t family = bli_cntl_family( cntl ); \ \ if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. 
*/ \ else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index 13d661ff1..6905e21f4 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -41,7 +41,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntx_t* cntx \ + cntl_t* cntl \ ); GENPROT( m ) diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 1a5693d8c..8fc062da2 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var1 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_mdim diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index a65f8a20a..ff2a570db 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var2 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. 
bli_thread_get_range_ndim diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 0148428df..64ab573da 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -53,10 +53,10 @@ void bli_gemm_blk_var3 dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. k_trans = bli_obj_width_after_trans( *a ); @@ -66,7 +66,7 @@ void bli_gemm_blk_var3 { // Determine the current algorithmic blocksize. b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_bszid( cntl ), cntx, cntl ); // Acquire partitions for A1 and B1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, @@ -109,7 +109,7 @@ void bli_gemm_blk_var3 // row-panel of C, and thus beta is applied to all of C exactly once. // Thus, for neither trmm nor trmm3 should we reset the scalar on C // after the first iteration. - if ( bli_cntx_get_family( cntx ) != BLIS_TRMM ) + if ( bli_cntl_family( cntl ) != BLIS_TRMM ) if ( i == 0 ) bli_obj_scalar_reset( c ); } } diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 775ca2544..b17ce10ac 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -56,22 +56,24 @@ cntl_t* bli_gemmbp_cntl_create else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. - cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. 
); - cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_bu_ke ); // Create a node for packing matrix A. - cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, @@ -86,15 +88,16 @@ cntl_t* bli_gemmbp_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node ( + family, BLIS_MC, bli_gemm_blk_var1, gemm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, @@ -109,16 +112,18 @@ cntl_t* bli_gemmbp_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( + family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packb ); // Create a node for partitioning the n dimension by NC. - cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( + family, BLIS_NC, bli_gemm_blk_var2, gemm_cntl_mm_op @@ -141,15 +146,17 @@ cntl_t* bli_gemmpb_cntl_create //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. - cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. 
); - cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_ub_ke @@ -157,7 +164,7 @@ cntl_t* bli_gemmpb_cntl_create // Create a node for packing matrix A (which is really the right-hand // operand "B"). - cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, @@ -172,8 +179,9 @@ cntl_t* bli_gemmpb_cntl_create ); // Create a node for partitioning the n dimension by MC. - cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node ( + family, BLIS_MC, bli_gemm_blk_var2, gemm_cntl_packb @@ -181,7 +189,7 @@ cntl_t* bli_gemmpb_cntl_create // Create a node for packing matrix B (which is really the left-hand // operand "A"). - cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, @@ -196,16 +204,18 @@ cntl_t* bli_gemmpb_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( + family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packa ); // Create a node for partitioning the m dimension by NC. 
- cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( + family, BLIS_NC, bli_gemm_blk_var1, gemm_cntl_mm_op @@ -227,13 +237,14 @@ void bli_gemm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_gemm_cntl_obj_create +cntl_t* bli_gemm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 6da6cd768..3b643e1fc 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -59,8 +59,9 @@ void bli_gemm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_gemm_cntl_obj_create +cntl_t* bli_gemm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index acceabbe8..f737edf81 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -82,9 +82,6 @@ void bli_gemm_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -95,6 +92,7 @@ void bli_gemm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 340aa7edc..8d7f8d635 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -89,9 +89,6 @@ void bli_hemm_front bli_obj_swap( a_local, b_local ); } - // Set the operation family id in the context. 
- bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -102,6 +99,7 @@ void bli_hemm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index c6851d2a4..e203d59ba 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -107,9 +107,6 @@ void bli_her2k_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -122,6 +119,7 @@ void bli_her2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &bh_local, @@ -134,6 +132,7 @@ void bli_her2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id &alpha_conj, &b_local, &ah_local, diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 642be0d99..227b97d5d 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -87,9 +87,6 @@ void bli_herk_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -100,6 +97,7 @@ void bli_herk_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &ah_local, diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 57aa11f73..a01ed15cf 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -88,9 +88,6 @@ void bli_symm_front bli_obj_swap( a_local, b_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -101,6 +98,7 @@ void bli_symm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index f64a765e5..459cdbdd0 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -88,9 +88,6 @@ void bli_syr2k_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -103,6 +100,7 @@ void bli_syr2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &bt_local, @@ -115,6 +113,7 @@ void bli_syr2k_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &b_local, &at_local, diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 42d135659..eba91cfd9 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -81,9 +81,6 @@ void bli_syrk_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. 
- bli_cntx_set_family( BLIS_HERK, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -94,6 +91,7 @@ void bli_syrk_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_HERK, // operation family id alpha, &a_local, &at_local, diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index b44ddfcff..75549e2d0 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -131,9 +131,6 @@ void bli_trmm_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx, bli_obj_length( c_local ), @@ -144,6 +141,7 @@ void bli_trmm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_TRMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index e672f7af3..f89b6ad96 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -130,9 +130,6 @@ void bli_trmm3_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRMM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx, bli_obj_length( c_local ), @@ -143,6 +140,7 @@ void bli_trmm3_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_TRMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index a731d8265..67b046952 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var1 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). 
- direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_mdim diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index a133f0bb0..48e4b4f1c 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var2 dim_t my_start, my_end; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_get_range_ndim diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 7b428c8ef..d4e809c50 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -53,10 +53,10 @@ void bli_trsm_blk_var3 dim_t k_trans; // Determine the direction in which to partition (forwards or backwards). - direct = bli_l3_direct( a, b, c, cntx ); + direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. 
k_trans = bli_obj_width_after_trans( *a ); diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 78bd5eeb9..e05fc3d20 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -50,23 +50,27 @@ cntl_t* bli_trsm_l_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; + const opid_t family = BLIS_TRSM; + // Create two nodes for the macro-kernel. - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. - cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, bli_packm_blk_var1, @@ -81,15 +85,16 @@ cntl_t* bli_trsm_l_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + family, BLIS_MC, bli_trsm_blk_var1, trsm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, bli_packm_blk_var1, @@ -104,16 +109,18 @@ cntl_t* bli_trsm_l_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. 
- cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op @@ -129,23 +136,27 @@ cntl_t* bli_trsm_r_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; + const opid_t family = BLIS_TRSM; + // Create two nodes for the macro-kernel. - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + family, BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. - cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( bli_trsm_packa, bli_packm_blk_var1, @@ -160,15 +171,16 @@ cntl_t* bli_trsm_r_cntl_create ); // Create a node for partitioning the m dimension by MC. - cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + family, BLIS_MC, bli_trsm_blk_var1, trsm_cntl_packa ); // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( bli_trsm_packb, bli_packm_blk_var1, @@ -183,16 +195,18 @@ cntl_t* bli_trsm_r_cntl_create ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. 
- cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op @@ -212,13 +226,14 @@ void bli_trsm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_trsm_cntl_obj_create +cntl_t* bli_trsm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 6dbe9adce..cfd20cad3 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -55,8 +55,9 @@ void bli_trsm_cntl_free // ----------------------------------------------------------------------------- -cntl_t* bli_trsm_cntl_obj_create +cntl_t* bli_trsm_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 42bda8a51..47cff8b48 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -122,9 +122,6 @@ void bli_trsm_front bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_TRSM, cntx ); - // Record the threading for each level within the context. bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx, bli_obj_length( c_local ), @@ -135,6 +132,7 @@ void bli_trsm_front bli_l3_thread_decorator ( bli_trsm_int, + BLIS_TRSM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/trsm/old/bli_trsm_cntl.c b/frame/3/trsm/old/bli_trsm_cntl.c index 3a83faafd..de018d64a 100644 --- a/frame/3/trsm/old/bli_trsm_cntl.c +++ b/frame/3/trsm/old/bli_trsm_cntl.c @@ -64,7 +64,7 @@ void bli_trsm_cntl_init() // Create control tree objects for packm operations (left side). 
trsm_l_packa_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases @@ -78,7 +78,7 @@ void bli_trsm_cntl_init() trsm_l_packb_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // IMPORTANT: m dim multiple must be mr since // B_pack is updated (ie: serves as C) in trsm @@ -93,7 +93,7 @@ void bli_trsm_cntl_init() // Create control tree objects for packm operations (right side). trsm_r_packa_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_NR, BLIS_MR, @@ -105,7 +105,7 @@ void bli_trsm_cntl_init() trsm_r_packb_cntl = - bli_packm_cntl_obj_create( BLIS_BLOCKED, + bli_packm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, // pack panels of B compactly BLIS_MR, BLIS_MR, @@ -119,7 +119,7 @@ void bli_trsm_cntl_init() // Create control tree object for lowest-level block-panel kernel. trsm_cntl_bp_ke = - bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + bli_trsm_cntl_create_node( BLIS_UNB_OPT, BLIS_VARIANT2, 0, // bszid_t not used by macro-kernel NULL, NULL, NULL, NULL, @@ -129,7 +129,7 @@ void bli_trsm_cntl_init() // problem (left side). trsm_l_cntl_op_bp = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_MC, NULL, @@ -144,7 +144,7 @@ void bli_trsm_cntl_init() // rank-k (outer panel) updates (left side). trsm_l_cntl_mm_op = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT3, BLIS_KC, NULL, @@ -159,7 +159,7 @@ void bli_trsm_cntl_init() // general problems (left side). trsm_l_cntl_vl_mm = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_NC, NULL, @@ -174,7 +174,7 @@ void bli_trsm_cntl_init() // problem (right side). 
trsm_r_cntl_op_bp = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_MC, NULL, @@ -189,7 +189,7 @@ void bli_trsm_cntl_init() // rank-k (outer panel) updates (right side). trsm_r_cntl_mm_op = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT3, BLIS_KC, NULL, @@ -204,7 +204,7 @@ void bli_trsm_cntl_init() // general problems (right side). trsm_r_cntl_vl_mm = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, + bli_trsm_cntl_create_node( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_NC, NULL, @@ -222,22 +222,22 @@ void bli_trsm_cntl_init() void bli_trsm_cntl_finalize() { - bli_cntl_obj_free( trsm_l_packa_cntl ); - bli_cntl_obj_free( trsm_l_packb_cntl ); - bli_cntl_obj_free( trsm_r_packa_cntl ); - bli_cntl_obj_free( trsm_r_packb_cntl ); + bli_cntl_free_node( trsm_l_packa_cntl ); + bli_cntl_free_node( trsm_l_packb_cntl ); + bli_cntl_free_node( trsm_r_packa_cntl ); + bli_cntl_free_node( trsm_r_packb_cntl ); - bli_cntl_obj_free( trsm_cntl_bp_ke ); + bli_cntl_free_node( trsm_cntl_bp_ke ); - bli_cntl_obj_free( trsm_l_cntl_op_bp ); - bli_cntl_obj_free( trsm_l_cntl_mm_op ); - bli_cntl_obj_free( trsm_l_cntl_vl_mm ); - bli_cntl_obj_free( trsm_r_cntl_op_bp ); - bli_cntl_obj_free( trsm_r_cntl_mm_op ); - bli_cntl_obj_free( trsm_r_cntl_vl_mm ); + bli_cntl_free_node( trsm_l_cntl_op_bp ); + bli_cntl_free_node( trsm_l_cntl_mm_op ); + bli_cntl_free_node( trsm_l_cntl_vl_mm ); + bli_cntl_free_node( trsm_r_cntl_op_bp ); + bli_cntl_free_node( trsm_r_cntl_mm_op ); + bli_cntl_free_node( trsm_r_cntl_vl_mm ); } -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, +trsm_t* bli_trsm_cntl_create_node( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalm_t* sub_scalm, diff --git a/frame/3/trsm/old/bli_trsm_cntl.h b/frame/3/trsm/old/bli_trsm_cntl.h index 651cc8599..bcdd1dfc7 100644 --- a/frame/3/trsm/old/bli_trsm_cntl.h +++ b/frame/3/trsm/old/bli_trsm_cntl.h @@ -51,7 +51,7 @@ typedef struct trsm_s trsm_t; void 
bli_trsm_cntl_init( void ); void bli_trsm_cntl_finalize( void ); -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, +trsm_t* bli_trsm_cntl_create_node( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalm_t* sub_scalm, diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 0f8e38688..63fc81711 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -35,7 +35,7 @@ #include "blis.h" -blksz_t* bli_blksz_obj_create +blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, @@ -47,16 +47,39 @@ blksz_t* bli_blksz_obj_create b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) ); - bli_blksz_obj_init( b, - b_s, be_s, - b_d, be_d, - b_c, be_c, - b_z, be_z ); + bli_blksz_init_ed + ( + b, + b_s, be_s, + b_d, be_d, + b_c, be_c, + b_z, be_z + ); return b; } -void bli_blksz_obj_init +blksz_t* bli_blksz_create + ( + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ) +{ + blksz_t* b; + + b = ( blksz_t* ) bli_malloc_intl( sizeof(blksz_t) ); + + bli_blksz_init + ( + b, + b_s, b_d, b_c, b_z, + be_s, be_d, be_c, be_z + ); + + return b; +} + +void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, @@ -75,7 +98,45 @@ void bli_blksz_obj_init b->e[BLIS_DCOMPLEX] = be_z; } -void bli_blksz_obj_free +void bli_blksz_init + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ) +{ + b->v[BLIS_FLOAT] = b_s; + b->v[BLIS_DOUBLE] = b_d; + b->v[BLIS_SCOMPLEX] = b_c; + b->v[BLIS_DCOMPLEX] = b_z; + + // Interpret a zero as a request for the default value. + b->e[BLIS_FLOAT] = ( be_s == 0 ? b_s : be_s ); + b->e[BLIS_DOUBLE] = ( be_d == 0 ? b_d : be_d ); + b->e[BLIS_SCOMPLEX] = ( be_c == 0 ? b_c : be_c ); + b->e[BLIS_DCOMPLEX] = ( be_z == 0 ? 
b_z : be_z ); +} + +void bli_blksz_init_easy + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z + ) +{ + b->v[BLIS_FLOAT] = b_s; + b->v[BLIS_DOUBLE] = b_d; + b->v[BLIS_SCOMPLEX] = b_c; + b->v[BLIS_DCOMPLEX] = b_z; + + // Here we assume the maximum blocksize values can be the same as the + // default values. + b->e[BLIS_FLOAT] = b_s; + b->e[BLIS_DOUBLE] = b_d; + b->e[BLIS_SCOMPLEX] = b_c; + b->e[BLIS_DCOMPLEX] = b_z; +} + +void bli_blksz_free ( blksz_t* b ) diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index cfe2023e1..abd066f88 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -50,15 +50,6 @@ *(max) = bli_blksz_get_max( dt, b ); \ } -#define bli_blksz_get_def_for_obj( obj, b ) \ -\ - bli_blksz_get_def( bli_obj_datatype( *(obj) ), b ) - -#define bli_blksz_get_max_for_obj( obj, b ) \ -\ - bli_blksz_get_max( bli_obj_datatype( *(obj) ), b ) - - // blksz_t modification #define bli_blksz_set_def( val, dt, b ) \ @@ -85,8 +76,11 @@ #define bli_blksz_copy_dt( dt_src, b_src, \ dt_dst, b_dst ) \ { \ - (b_dst)->v[ dt_dst ] = (b_src)->v[ dt_src ]; \ - (b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \ + const dim_t v_src = bli_blksz_get_def( dt_src, b_src ); \ + const dim_t e_src = bli_blksz_get_max( dt_src, b_src ); \ +\ + bli_blksz_set_def( v_src, dt_dst, b_dst ); \ + bli_blksz_set_max( e_src, dt_dst, b_dst ); \ } #define bli_blksz_scale_def( num, den, dt, b ) \ @@ -109,7 +103,7 @@ // ----------------------------------------------------------------------------- -blksz_t* bli_blksz_obj_create +blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, @@ -117,7 +111,13 @@ blksz_t* bli_blksz_obj_create dim_t b_z, dim_t be_z ); -void bli_blksz_obj_init +blksz_t* bli_blksz_create + ( + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ); + +void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, @@ -126,7 +126,20 @@ void bli_blksz_obj_init dim_t b_z, dim_t be_z ); -void 
bli_blksz_obj_free +void bli_blksz_init + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, + dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z + ); + +void bli_blksz_init_easy + ( + blksz_t* b, + dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z + ); + +void bli_blksz_free ( blksz_t* b ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index cac290da9..90b2634a5 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -34,8 +34,9 @@ #include "blis.h" -cntl_t* bli_cntl_obj_create +cntl_t* bli_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, void* params, @@ -48,6 +49,7 @@ cntl_t* bli_cntl_obj_create // Allocate the cntl_t struct. cntl = bli_malloc_intl( sizeof( cntl_t ) ); + bli_cntl_set_family( family, cntl ); bli_cntl_set_bszid( bszid, cntl ); bli_cntl_set_var_func( var_func, cntl ); bli_cntl_set_params( params, cntl ); @@ -63,7 +65,7 @@ cntl_t* bli_cntl_obj_create return cntl; } -void bli_cntl_obj_free +void bli_cntl_free_node ( cntl_t* cntl ) @@ -71,7 +73,7 @@ void bli_cntl_obj_free bli_free_intl( cntl ); } -void bli_cntl_obj_clear +void bli_cntl_clear_node ( cntl_t* cntl ) @@ -141,7 +143,7 @@ void bli_cntl_free_w_thrinfo } // Free the current node. - bli_cntl_obj_free( cntl ); + bli_cntl_free_node( cntl ); } void bli_cntl_free_wo_thrinfo @@ -177,7 +179,7 @@ void bli_cntl_free_wo_thrinfo } // Free the current node. - bli_cntl_obj_free( cntl ); + bli_cntl_free_node( cntl ); } // ----------------------------------------------------------------------------- @@ -189,10 +191,11 @@ cntl_t* bli_cntl_copy { // Make a copy of the current node. Notice that the source node // should NOT have any allocated/cached mem_t entries, and that - // bli_cntl_obj_create() creates a node with a cleared mem_t + // bli_cntl_create_node() creates a node with a cleared mem_t // field. 
- cntl_t* cntl_copy = bli_cntl_obj_create + cntl_t* cntl_copy = bli_cntl_create_node ( + bli_cntl_family( cntl ), bli_cntl_bszid( cntl ), bli_cntl_var_func( cntl ), NULL, NULL @@ -234,3 +237,23 @@ cntl_t* bli_cntl_copy return cntl_copy; } +void bli_cntl_mark_family + ( + opid_t family, + cntl_t* cntl + ) +{ + // Set the family of the root node. + bli_cntl_set_family( family, cntl ); + + // Continue as long as the current node has a valid child. + while ( bli_cntl_sub_node( cntl ) != NULL ) + { + // Move down the tree to the child node. + cntl = bli_cntl_sub_node( cntl ); + + // Set the family of the current node. + bli_cntl_set_family( family, cntl ); + } +} + diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index fd0413f4f..332a6cd70 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -39,6 +39,7 @@ struct cntl_s { // Basic fields (usually required). + opid_t family; bszid_t bszid; void* var_func; struct cntl_s* sub_node; @@ -57,20 +58,21 @@ typedef struct cntl_s cntl_t; // -- Control tree prototypes -- -cntl_t* bli_cntl_obj_create +cntl_t* bli_cntl_create_node ( + opid_t family, bszid_t bszid, void* var_func, void* params, cntl_t* sub_node ); -void bli_cntl_obj_free +void bli_cntl_free_node ( cntl_t* cntl ); -void bli_cntl_obj_clear +void bli_cntl_clear_node ( cntl_t* cntl ); @@ -99,10 +101,20 @@ cntl_t* bli_cntl_copy cntl_t* cntl ); +void bli_cntl_mark_family + ( + opid_t family, + cntl_t* cntl + ); + // ----------------------------------------------------------------------------- // cntl_t query (fields only) +#define bli_cntl_family( cntl ) \ +\ + ( cntl->family ) + #define bli_cntl_bszid( cntl ) \ \ ( cntl->bszid ) @@ -139,6 +151,11 @@ cntl_t* bli_cntl_copy // cntl_t modification +#define bli_cntl_set_family( family0, cntl ) \ +{ \ + cntl->family = family0; \ +} + #define bli_cntl_set_bszid( bszid0, cntl ) \ { \ cntl->bszid = bszid0; \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 29529924c..d4c4487ed 100644 --- 
a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -39,14 +39,14 @@ // NOTE: Since these functions currently do nothing, they are defined // as empty macros in bli_cntx. // -void bli_cntx_obj_create( cntx_t* cntx ) +void bli_cntx_create( cntx_t* cntx ) { // Since cntx_t objects contain statically-allocated arrays, // we don't need to do anything in order to create the cntx_t // instance. } -void bli_cntx_obj_free( cntx_t* cntx ) +void bli_cntx_free( cntx_t* cntx ) { // Just as we don't need to do anything in order to create a // cntx_t instance, we don't need to do anything to destory @@ -54,7 +54,7 @@ void bli_cntx_obj_free( cntx_t* cntx ) } #endif -void bli_cntx_obj_clear( cntx_t* cntx ) +void bli_cntx_clear( cntx_t* cntx ) { // Fill the entire cntx_t structure with zeros. memset( ( void* )cntx, 0, sizeof( cntx ) ); @@ -108,8 +108,11 @@ void bli_cntx_init( cntx_t* cntx ) // ----------------------------------------------------------------------------- -blksz_t* bli_cntx_get_blksz( bszid_t bs_id, - cntx_t* cntx ) +blksz_t* bli_cntx_get_blksz + ( + bszid_t bs_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; @@ -142,8 +145,11 @@ dim_t bli_cntx_get_blksz_max_dt( num_t dt, } #endif -blksz_t* bli_cntx_get_bmult( bszid_t bs_id, - cntx_t* cntx ) +blksz_t* bli_cntx_get_bmult + ( + bszid_t bs_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); @@ -166,8 +172,11 @@ dim_t bli_cntx_get_bmult_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_ukr + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); @@ -210,8 +219,11 @@ void* bli_cntx_get_l3_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_vir_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_vir_ukr + ( + l3ukr_t ukr_id, + 
cntx_t* cntx + ) { func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* l3_vir_ukr = &l3_vir_ukrs[ ukr_id ]; @@ -235,8 +247,11 @@ void* bli_cntx_get_l3_vir_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l3_nat_ukr( l3ukr_t ukr_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l3_nat_ukr + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* l3_nat_ukr = &l3_nat_ukrs[ ukr_id ]; @@ -260,8 +275,11 @@ void* bli_cntx_get_l3_nat_ukr_dt( num_t dt, } #endif -func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l1f_ker + ( + l1fkr_t ker_id, + cntx_t* cntx + ) { func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx ); func_t* l1f_ker = &l1f_kers[ ker_id ]; @@ -283,8 +301,11 @@ void* bli_cntx_get_l1f_ker_dt( num_t dt, } #endif -func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id, - cntx_t* cntx ) +func_t* bli_cntx_get_l1v_ker + ( + l1vkr_t ker_id, + cntx_t* cntx + ) { func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx ); func_t* l1v_ker = &l1v_kers[ ker_id ]; @@ -306,8 +327,11 @@ void* bli_cntx_get_l1v_ker_dt( num_t dt, } #endif -mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, - cntx_t* cntx ) +mbool_t* bli_cntx_get_l3_nat_ukr_prefs + ( + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* l3_nat_ukrs_pref = &l3_nat_ukrs_prefs[ ukr_id ]; @@ -316,12 +340,30 @@ mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, return l3_nat_ukrs_pref; } -func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ) +func_t* bli_cntx_get_packm_ker + ( + l1mkr_t ker_id, + cntx_t* cntx + ) { - func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx ); + func_t* packm_kers = bli_cntx_packm_kers_buf( cntx ); + func_t* packm_ker = &packm_kers[ ker_id ]; // Return the address of the func_t that contains the packm ukernels. 
- return packm_ukrs; + return packm_ker; +} + +func_t* bli_cntx_get_unpackm_ker + ( + l1mkr_t ker_id, + cntx_t* cntx + ) +{ + func_t* unpackm_kers = bli_cntx_unpackm_kers_buf( cntx ); + func_t* unpackm_ker = &unpackm_kers[ ker_id ]; + + // Return the address of the func_t that contains the unpackm ukernels. + return unpackm_ker; } #if 0 @@ -360,7 +402,11 @@ dim_t bli_cntx_get_num_threads( cntx_t* cntx ) bli_cntx_ir_way( cntx ); } -dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) +dim_t bli_cntx_get_num_threads_in + ( + cntx_t* cntx, + cntl_t* cntl + ) { dim_t n_threads_in = 1; @@ -384,14 +430,6 @@ dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) // ----------------------------------------------------------------------------- -#if 1 -// -// NOTE: This function is disabled because: -// - we currently do not have any need to set a context direclty with -// blksz_t objects -// - it may be broken; it needs to be synced up with the corresponding -// function in bli_gks.c. -// void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { /* Example prototypes: @@ -454,8 +492,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process, // - the address of the blksz_t object, and - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. + // - the bszid_t of the multiple + // that we need to associate with the blksz_t object. bszid_t bs_id = va_arg( args, bszid_t ); blksz_t* blksz = va_arg( args, blksz_t* ); bszid_t bm_id = va_arg( args, bszid_t ); @@ -473,9 +511,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, and - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. 
+ // - the address of the blksz_t object, + // - the bszid_t of the multiple, and // - the scalars we wish to apply to the real blocksizes to // come up with the induced complex blocksizes (for default // and maximum blocksizes). @@ -536,6 +573,7 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // location within the context's blksz_t array. Do the same // for the blocksize multiple id. //cntx_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy_smart( blksz, cntx_blksz ); bli_blksz_copy( blksz, cntx_blksz ); // Copy the blocksize multiple id into the context. @@ -624,14 +662,16 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_free_intl( dsclrs ); bli_free_intl( msclrs ); } -#endif // ----------------------------------------------------------------------------- -void bli_cntx_set_blksz( bszid_t bs_id, - blksz_t* blksz, - bszid_t mult_id, - cntx_t* cntx ) +void bli_cntx_set_blksz + ( + bszid_t bs_id, + blksz_t* blksz, + bszid_t mult_id, + cntx_t* cntx + ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); @@ -645,20 +685,111 @@ void bli_cntx_set_blksz( bszid_t bs_id, bmults[ bs_id ] = mult_id; } -void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ) -{ - func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); +// ----------------------------------------------------------------------------- - // Copy the function object into the specified location within - // the context's virtual level-3 ukernel array. - l3_vir_ukrs[ ukr_id ] = *func; +void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) +{ + /* Example prototypes: + + void bli_cntx_set_l3_nat_ukrs + ( + dim_t n_ukrs, + l3ukr_t ukr0_id, num_t dt0, void* ukr0_fp, bool_t pref0, + l3ukr_t ukr1_id, num_t dt1, void* ukr1_fp, bool_t pref1, + l3ukr_t ukr2_id, num_t dt2, void* ukr2_fp, bool_t pref2, + ... + cntx_t* cntx + ); + */ + va_list args; + dim_t i; + + // Allocate some temporary local arrays. 
+ l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) ); + num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ) ); + bool_t* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool_t ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_ukrs ); + + // Process n_ukrs tuples. + for ( i = 0; i < n_ukrs; ++i ) + { + // Here, we query the variable argument list for: + // - the l3ukr_t of the kernel we're about to process, + // - the datatype of the kernel, + // - the kernel function pointer, and + // - the kernel function storage preference + // that we need to store to the context. + const l3ukr_t ukr_id = va_arg( args, l3ukr_t ); + const num_t ukr_dt = va_arg( args, num_t ); + void* ukr_fp = va_arg( args, void* ); + const bool_t ukr_pref = va_arg( args, bool_t ); + + // Store the values in our temporary arrays. + ukr_ids[ i ] = ukr_id; + ukr_dts[ i ] = ukr_dt; + ukr_fps[ i ] = ukr_fp; + ukr_prefs[ i ] = ukr_pref; + } + + // The last argument should be the context pointer. + cntx_t* cntx = va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the addresses of: + // - the l3 native ukernel func_t array + // - the l3 native ukernel preferences array + func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); + mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_ukrs; ++i ) + { + // Read the current blocksize id, blksz_t* pointer, blocksize + // multiple id, and blocksize scalar. 
+ const l3ukr_t ukr_id = ukr_ids[ i ]; + const num_t ukr_dt = ukr_dts[ i ]; + void* ukr_fp = ukr_fps[ i ]; + const bool_t ukr_pref = ukr_prefs[ i ]; + + // Index into the func_t and mbool_t for the current kernel id + // being processed. + func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ]; + mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ]; + + // Store the ukernel function pointer and preference values into + // the context. + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); + bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); + } + + // Free the temporary local arrays. + bli_free_intl( ukr_ids ); + bli_free_intl( ukr_dts ); + bli_free_intl( ukr_fps ); + bli_free_intl( ukr_prefs ); } -void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ) +// ----------------------------------------------------------------------------- + +void bli_cntx_set_l3_nat_ukr + ( + l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx + ) { func_t* l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); @@ -667,9 +798,12 @@ void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, l3_nat_ukrs[ ukr_id ] = *func; } -void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, - mbool_t* prefs, - cntx_t* cntx ) +void bli_cntx_set_l3_nat_ukr_prefs + ( + l3ukr_t ukr_id, + mbool_t* prefs, + cntx_t* cntx + ) { mbool_t* l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); @@ -678,9 +812,26 @@ void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, l3_nat_ukrs_prefs[ ukr_id ] = *prefs; } -void bli_cntx_set_l1f_ker( l1fkr_t ker_id, - func_t* func, - cntx_t* cntx ) +void bli_cntx_set_l3_vir_ukr + ( + l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx + ) +{ + func_t* l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); + + // Copy the function object into the specified location within + // the context's virtual level-3 ukernel array. 
+ l3_vir_ukrs[ ukr_id ] = *func; +} + +void bli_cntx_set_l1f_ker + ( + l1fkr_t ker_id, + func_t* func, + cntx_t* cntx + ) { func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx ); @@ -689,9 +840,12 @@ void bli_cntx_set_l1f_ker( l1fkr_t ker_id, l1f_kers[ ker_id ] = *func; } -void bli_cntx_set_l1v_ker( l1vkr_t ker_id, - func_t* func, - cntx_t* cntx ) +void bli_cntx_set_l1v_ker + ( + l1vkr_t ker_id, + func_t* func, + cntx_t* cntx + ) { func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx ); @@ -700,43 +854,154 @@ void bli_cntx_set_l1v_ker( l1vkr_t ker_id, l1v_kers[ ker_id ] = *func; } -void bli_cntx_set_packm_ukr( func_t* func, - cntx_t* cntx ) -{ - func_t* packm_ukrs = bli_cntx_packm_ukrs( cntx ); +// ----------------------------------------------------------------------------- - // Copy the function object into the context's packm ukernel object. - *packm_ukrs = *func; +void bli_cntx_set_packm_kers( dim_t n_kers, ... ) +{ + /* Example prototypes: + + void bli_cntx_set_packm_kers + ( + dim_t n_ukrs, + l1mkr_t ker0_id, num_t ker0_dt, void* ker0_fp, + l1mkr_t ker1_id, num_t ker1_dt, void* ker1_fp, + l1mkr_t ker2_id, num_t ker2_dt, void* ker2_fp, + ... + cntx_t* cntx + ); + */ + va_list args; + dim_t i; + + // Allocate some temporary local arrays. + l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ) ); + num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + void** ker_fps = bli_malloc_intl( n_kers * sizeof( void* ) ); + + // -- Begin variable argument section -- + + // Initialize variable argument environment. + va_start( args, n_kers ); + + // Process n_kers tuples. + for ( i = 0; i < n_kers; ++i ) + { + // Here, we query the variable argument list for: + // - the l1mkr_t of the kernel we're about to process, + // - the datatype of the kernel, and + // - the kernel function pointer + // that we need to store to the context. 
+ const l1mkr_t ker_id = va_arg( args, l1mkr_t ); + const num_t ker_dt = va_arg( args, num_t ); + void* ker_fp = va_arg( args, void* ); + + // Store the values in our temporary arrays. + ker_ids[ i ] = ker_id; + ker_dts[ i ] = ker_dt; + ker_fps[ i ] = ker_fp; + } + + // The last argument should be the context pointer. + cntx_t* cntx = va_arg( args, cntx_t* ); + + // Shutdown variable argument environment and clean up stack. + va_end( args ); + + // -- End variable argument section -- + + // Query the context for the address of: + // - the packm kernels func_t array + func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx ); + + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. + + // Process each blocksize id tuple provided. + for ( i = 0; i < n_kers; ++i ) + { + // Read the current blocksize id, blksz_t* pointer, blocksize + // multiple id, and blocksize scalar. + const l1mkr_t ker_id = ker_ids[ i ]; + const num_t ker_dt = ker_dts[ i ]; + void* ker_fp = ker_fps[ i ]; + + // Index into the func_t and mbool_t for the current kernel id + // being processed. + func_t* kers = &cntx_packm_kers[ ker_id ]; + + // Store the ukernel function pointer and preference values into + // the context. + bli_func_set_dt( ker_fp, ker_dt, kers ); + } + + // Free the temporary local arrays. + bli_free_intl( ker_ids ); + bli_free_intl( ker_dts ); + bli_free_intl( ker_fps ); } -void bli_cntx_set_ind_method( ind_t method, - cntx_t* cntx ) +// ----------------------------------------------------------------------------- + +void bli_cntx_set_packm_ker + ( + l1mkr_t ker_id, + func_t* func, + cntx_t* cntx + ) +{ + func_t* packm_kers = bli_cntx_packm_kers_buf( cntx ); + + // Copy the function object into the specified location within + // the context's packm kernel array. 
+ packm_kers[ ker_id ] = *func; +} + +// ----------------------------------------------------------------------------- + +void bli_cntx_set_ind_method + ( + ind_t method, + cntx_t* cntx + ) { bli_cntx_set_method( method, cntx ); } -void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_ab_blockpanel + ( + pack_t schema_a, + pack_t schema_b, + cntx_t* cntx + ) { bli_cntx_set_schema_a_block( schema_a, cntx ); bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_a_block( pack_t schema_a, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_a_block + ( + pack_t schema_a, + cntx_t* cntx + ) { bli_cntx_set_schema_a_block( schema_a, cntx ); } -void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_b_panel + ( + pack_t schema_b, + cntx_t* cntx + ) { bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_c_panel + ( + pack_t schema_c, + cntx_t* cntx + ) { bli_cntx_set_schema_c_panel( schema_c, cntx ); } @@ -749,8 +1014,15 @@ void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, } #endif -void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, - dim_t m, dim_t n, dim_t k ) +void bli_cntx_set_thrloop_from_env + ( + opid_t l3_op, + side_t side, + cntx_t* cntx, + dim_t m, + dim_t n, + dim_t k + ) { dim_t jc, pc, ic, jr, ir; @@ -882,9 +1154,12 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, // ----------------------------------------------------------------------------- -bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs ); @@ -894,9 
+1169,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, return ukr_prefs == TRUE; } -bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { mbool_t* ukrs_prefs = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); bool_t ukr_prefs = bli_mbool_get_dt( dt, ukrs_prefs ); @@ -906,16 +1184,22 @@ bool_t bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, return ukr_prefs == FALSE; } -bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { return !bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); } -bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { const num_t dt = bli_obj_datatype( *obj ); const bool_t ukr_prefers_rows @@ -930,9 +1214,12 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); @@ -942,9 +1229,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); @@ -956,9 +1246,12 @@ bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, // ----------------------------------------------------------------------------- 
-bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_rows_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { // Reference the ukr storage preferences of the corresponding real // micro-kernel for induced methods. @@ -968,9 +1261,12 @@ bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_cols_dt + ( + num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx + ) { // Reference the ukr storage preferences of the corresponding real // micro-kernel for induced methods. @@ -980,16 +1276,22 @@ bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { return !bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); } -bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { num_t dt = bli_obj_datatype( *obj ); @@ -1005,9 +1307,12 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) { bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); @@ -1017,9 +1322,12 @@ bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, return r_val; } -bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, - l3ukr_t ukr_id, - cntx_t* cntx ) +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of + ( + obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx + ) 
{ bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); @@ -1108,23 +1416,6 @@ void bli_cntx_print( cntx_t* cntx ) ); } - { - func_t* ukr = bli_cntx_get_packm_ukr( cntx ); - - printf( "packm ker : %16p %16p %16p %16p\n", - bli_func_get_dt( BLIS_FLOAT, ukr ), - bli_func_get_dt( BLIS_DOUBLE, ukr ), - bli_func_get_dt( BLIS_SCOMPLEX, ukr ), - bli_func_get_dt( BLIS_DCOMPLEX, ukr ) - ); - } - - { - ind_t family = bli_cntx_get_family( cntx ); - - printf( "oper family : %lu\n", ( guint_t )family ); - } - { ind_t method = bli_cntx_get_ind_method( cntx ); diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index a76cdd329..3167d1bf4 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -36,6 +36,9 @@ #ifndef BLIS_CNTX_H #define BLIS_CNTX_H +//#include "bli_cntx_init.h" + + // Context object type (defined in bli_type_defs.h) /* @@ -51,9 +54,9 @@ typedef struct cntx_s func_t* l1f_kers; func_t* l1v_kers; - func_t packm_ukrs; + func_t* packm_kers; + func_t* unpackm_kers; - opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; @@ -99,17 +102,13 @@ typedef struct cntx_s \ ( (cntx)->l1v_kers ) -#define bli_cntx_packm_ukrs_buf( cntx ) \ +#define bli_cntx_packm_kers_buf( cntx ) \ \ - (&((cntx)->packm_ukrs) ) + ( (cntx)->packm_kers ) -#define bli_cntx_packm_ukrs( cntx ) \ +#define bli_cntx_unpackm_kers_buf( cntx ) \ \ - (&((cntx)->packm_ukrs) ) - -#define bli_cntx_family( cntx ) \ -\ - ( (cntx)->family ) + ( (cntx)->unpackm_kers ) #define bli_cntx_method( cntx ) \ \ @@ -202,16 +201,6 @@ typedef struct cntx_s (cntx_p)->l1v_kers = _l1v_kers; \ } -#define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \ -{ \ - (cntx_p)->packm_ukrs = _packm_ukrs; \ -} - -#define bli_cntx_set_family( _family, cntx_p ) \ -{ \ - (cntx_p)->family = _family; \ -} - #define bli_cntx_set_method( _method, cntx_p ) \ { \ (cntx_p)->method = _method; \ @@ -285,7 +274,8 @@ typedef struct cntx_s ( \ (dt), \ &(( \ - bli_cntx_method( (cntx) ) != BLIS_NAT \ + 
bli_cntx_method( (cntx) ) != BLIS_NAT && \ + bli_is_complex( dt ) \ ? bli_cntx_l3_vir_ukrs_buf( (cntx) ) \ : bli_cntx_l3_nat_ukrs_buf( (cntx) ) \ )[ ukr_id ]) \ @@ -326,10 +316,6 @@ typedef struct cntx_s (dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) ))[ ukr_id ]) \ ) -#define bli_cntx_get_family( cntx ) \ -\ - bli_cntx_family( cntx ) - #define bli_cntx_get_ind_method( cntx ) \ \ bli_cntx_method( cntx ) @@ -357,9 +343,9 @@ typedef struct cntx_s // create/free -//void bli_cntx_obj_create( cntx_t* cntx ); -//void bli_cntx_obj_free( cntx_t* cntx ); -void bli_cntx_obj_clear( cntx_t* cntx ); +//void bli_cntx_create( cntx_t* cntx ); +//void bli_cntx_free( cntx_t* cntx ); +void bli_cntx_clear( cntx_t* cntx ); void bli_cntx_init( cntx_t* cntx ); // get functions @@ -380,7 +366,7 @@ func_t* bli_cntx_get_l1f_ker( l1fkr_t ker_id, cntx_t* cntx ); func_t* bli_cntx_get_l1v_ker( l1vkr_t ker_id, cntx_t* cntx ); -func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); +//func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //dim_t bli_cntx_get_blksz_def_dt( num_t dt, // bszid_t bs_id, @@ -409,6 +395,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //void* bli_cntx_get_l1v_ker_dt( num_t dt, // l1vkr_t ker_id, // cntx_t* cntx ); +func_t* bli_cntx_get_packm_ker( l1mkr_t ker_id, + cntx_t* cntx ); +func_t* bli_cntx_get_unpackm_ker( l1mkr_t ker_id, + cntx_t* cntx ); //ind_t bli_cntx_get_ind_method( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ); @@ -425,18 +415,34 @@ void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ); -void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, - func_t* func, - cntx_t* cntx ); + +void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... 
); + void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ); +void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, + mbool_t* prefs, + cntx_t* cntx ); + +void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, + func_t* func, + cntx_t* cntx ); + void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ); + void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ); + +void bli_cntx_set_packm_kers( dim_t n_kers, ... ); + +void bli_cntx_set_packm_ker( l1mkr_t ker_id, + func_t* func, + cntx_t* cntx ); + void bli_cntx_set_packm_ukr( func_t* func, cntx_t* cntx ); void bli_cntx_set_ind_method( ind_t method, @@ -507,11 +513,11 @@ void bli_cntx_print( cntx_t* cntx ); // Preprocess out these calls entirely, since they are currently just empty // functions that do nothing. #if 0 - #define bli_cntx_obj_create( cntx ) { bli_cntx_obj_clear( cntx ); } - #define bli_cntx_obj_free( cntx ) { bli_cntx_obj_clear( cntx ); } + #define bli_cntx_create( cntx ) { bli_cntx_clear( cntx ); } + #define bli_cntx_free( cntx ) { bli_cntx_clear( cntx ); } #else - #define bli_cntx_obj_create( cntx ) { ; } - #define bli_cntx_obj_free( cntx ) { ; } + #define bli_cntx_create( cntx ) { ; } + #define bli_cntx_free( cntx ) { ; } #endif // These macros initialize/finalize a local context if the given context diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index 75be26085..d098b4c9d 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -35,37 +35,57 @@ #include "blis.h" -func_t* bli_func_obj_create( void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ) +func_t* bli_func_create + ( + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ) { func_t* f; f = ( func_t* ) bli_malloc_intl( sizeof(func_t) ); - bli_func_obj_init( f, - ptr_s, - ptr_d, - ptr_c, - ptr_z ); + bli_func_init + ( + f, + ptr_s, + ptr_d, + ptr_c, + ptr_z + ); return f; } -void bli_func_obj_init( func_t* f, - void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ) +void 
bli_func_init + ( + func_t* f, + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ) { - f->ptr[BLIS_BITVAL_FLOAT_TYPE] = ptr_s; - f->ptr[BLIS_BITVAL_DOUBLE_TYPE] = ptr_d; - f->ptr[BLIS_BITVAL_SCOMPLEX_TYPE] = ptr_c; - f->ptr[BLIS_BITVAL_DCOMPLEX_TYPE] = ptr_z; + bli_func_set_dt( ptr_s, BLIS_FLOAT, f ); + bli_func_set_dt( ptr_d, BLIS_DOUBLE, f ); + bli_func_set_dt( ptr_c, BLIS_SCOMPLEX, f ); + bli_func_set_dt( ptr_z, BLIS_DCOMPLEX, f ); } -void bli_func_obj_free( func_t* f ) +void bli_func_init_null + ( + func_t* f + ) +{ + bli_func_set_dt( NULL, BLIS_FLOAT, f ); + bli_func_set_dt( NULL, BLIS_DOUBLE, f ); + bli_func_set_dt( NULL, BLIS_SCOMPLEX, f ); + bli_func_set_dt( NULL, BLIS_DCOMPLEX, f ); +} + +void bli_func_free( func_t* f ) { bli_free_intl( f ); } @@ -75,7 +95,7 @@ void bli_func_obj_free( func_t* f ) bool_t bli_func_is_null_dt( num_t dt, func_t* f ) { - return ( f->ptr[ dt ] == NULL ); + return ( bli_func_get_dt( dt, f ) == NULL ); } bool_t bli_func_is_null( func_t* f ) @@ -87,7 +107,7 @@ bool_t bli_func_is_null( func_t* f ) // return FALSE. Otherwise, if they are all null, return TRUE. 
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { - if ( f->ptr[ dt ] != NULL ) + if ( bli_func_get_dt( dt, f ) != NULL ) { r_val = FALSE; break; diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h index 56b221be9..2bfc2ad20 100644 --- a/frame/base/bli_func.h +++ b/frame/base/bli_func.h @@ -49,18 +49,29 @@ // ----------------------------------------------------------------------------- -func_t* bli_func_obj_create( void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ); +func_t* bli_func_create + ( + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ); -void bli_func_obj_init( func_t* f, - void* ptr_s, - void* ptr_d, - void* ptr_c, - void* ptr_z ); +void bli_func_init + ( + func_t* f, + void* ptr_s, + void* ptr_d, + void* ptr_c, + void* ptr_z + ); -void bli_func_obj_free( func_t* f ); +void bli_func_init_null + ( + func_t* f + ); + +void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 2ada1556e..4d819babe 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -74,12 +74,6 @@ static blksz_t bli_gks_blkszs[BLIS_NUM_BLKSZS] = /* df */ { { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, }, { BLIS_DEFAULT_DF_S, BLIS_DEFAULT_DF_C, BLIS_DEFAULT_DF_D, BLIS_DEFAULT_DF_Z, } }, -/* xf */ { { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, }, - { BLIS_DEFAULT_XF_S, BLIS_DEFAULT_XF_C, BLIS_DEFAULT_XF_D, BLIS_DEFAULT_XF_Z, } - }, -/* vf */ { { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, }, - { BLIS_DEFAULT_VF_S, BLIS_DEFAULT_VF_C, BLIS_DEFAULT_VF_D, BLIS_DEFAULT_VF_Z, } - }, }; // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_mbool.c b/frame/base/bli_mbool.c index 46ba531bc..6906622d1 100644 --- a/frame/base/bli_mbool.c +++ b/frame/base/bli_mbool.c @@ -35,29 +35,38 @@ #include 
"blis.h" -mbool_t* bli_mbool_obj_create( bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ) +mbool_t* bli_mbool_create + ( + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ) { mbool_t* b; b = ( mbool_t* ) bli_malloc_intl( sizeof(mbool_t) ); - bli_mbool_obj_init( b, - b_s, - b_d, - b_c, - b_z ); + bli_mbool_init + ( + b, + b_s, + b_d, + b_c, + b_z + ); return b; } -void bli_mbool_obj_init( mbool_t* b, - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ) +void bli_mbool_init + ( + mbool_t* b, + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ) { bli_mbool_set_dt( b_s, BLIS_FLOAT, b ); bli_mbool_set_dt( b_d, BLIS_DOUBLE, b ); @@ -65,7 +74,7 @@ void bli_mbool_obj_init( mbool_t* b, bli_mbool_set_dt( b_z, BLIS_DCOMPLEX, b ); } -void bli_mbool_obj_free( mbool_t* b ) +void bli_mbool_free( mbool_t* b ) { bli_free_intl( b ); } diff --git a/frame/base/bli_mbool.h b/frame/base/bli_mbool.h index 5d5f47828..181543413 100644 --- a/frame/base/bli_mbool.h +++ b/frame/base/bli_mbool.h @@ -49,16 +49,22 @@ // ----------------------------------------------------------------------------- -mbool_t* bli_mbool_obj_create( bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ); +mbool_t* bli_mbool_create + ( + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ); -void bli_mbool_obj_init( mbool_t* b, - bool_t b_s, - bool_t b_d, - bool_t b_c, - bool_t b_z ); +void bli_mbool_init + ( + mbool_t* b, + bool_t b_s, + bool_t b_d, + bool_t b_c, + bool_t b_z + ); -void bli_mbool_obj_free( mbool_t* b ); +void bli_mbool_free( mbool_t* b ); diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 1a120d5da..d71d84f31 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -631,6 +631,80 @@ typedef enum #define BLIS_NUM_LEVEL1F_KERS 5 +typedef enum +{ + BLIS_PACKM_0XK_KER = 0, + BLIS_PACKM_1XK_KER = 1, + BLIS_PACKM_2XK_KER = 2, + BLIS_PACKM_3XK_KER = 3, + BLIS_PACKM_4XK_KER = 4, + BLIS_PACKM_5XK_KER = 5, + BLIS_PACKM_6XK_KER = 
6, + BLIS_PACKM_7XK_KER = 7, + BLIS_PACKM_8XK_KER = 8, + BLIS_PACKM_9XK_KER = 9, + BLIS_PACKM_10XK_KER = 10, + BLIS_PACKM_11XK_KER = 11, + BLIS_PACKM_12XK_KER = 12, + BLIS_PACKM_13XK_KER = 13, + BLIS_PACKM_14XK_KER = 14, + BLIS_PACKM_15XK_KER = 15, + BLIS_PACKM_16XK_KER = 16, + BLIS_PACKM_17XK_KER = 17, + BLIS_PACKM_18XK_KER = 18, + BLIS_PACKM_19XK_KER = 19, + BLIS_PACKM_20XK_KER = 20, + BLIS_PACKM_21XK_KER = 21, + BLIS_PACKM_22XK_KER = 22, + BLIS_PACKM_23XK_KER = 23, + BLIS_PACKM_24XK_KER = 24, + BLIS_PACKM_25XK_KER = 25, + BLIS_PACKM_26XK_KER = 26, + BLIS_PACKM_27XK_KER = 27, + BLIS_PACKM_28XK_KER = 28, + BLIS_PACKM_29XK_KER = 29, + BLIS_PACKM_30XK_KER = 30, + BLIS_PACKM_31XK_KER = 31, + + BLIS_UNPACKM_0XK_KER = 0, + BLIS_UNPACKM_1XK_KER = 1, + BLIS_UNPACKM_2XK_KER = 2, + BLIS_UNPACKM_3XK_KER = 3, + BLIS_UNPACKM_4XK_KER = 4, + BLIS_UNPACKM_5XK_KER = 5, + BLIS_UNPACKM_6XK_KER = 6, + BLIS_UNPACKM_7XK_KER = 7, + BLIS_UNPACKM_8XK_KER = 8, + BLIS_UNPACKM_9XK_KER = 9, + BLIS_UNPACKM_10XK_KER = 10, + BLIS_UNPACKM_11XK_KER = 11, + BLIS_UNPACKM_12XK_KER = 12, + BLIS_UNPACKM_13XK_KER = 13, + BLIS_UNPACKM_14XK_KER = 14, + BLIS_UNPACKM_15XK_KER = 15, + BLIS_UNPACKM_16XK_KER = 16, + BLIS_UNPACKM_17XK_KER = 17, + BLIS_UNPACKM_18XK_KER = 18, + BLIS_UNPACKM_19XK_KER = 19, + BLIS_UNPACKM_20XK_KER = 20, + BLIS_UNPACKM_21XK_KER = 21, + BLIS_UNPACKM_22XK_KER = 22, + BLIS_UNPACKM_23XK_KER = 23, + BLIS_UNPACKM_24XK_KER = 24, + BLIS_UNPACKM_25XK_KER = 25, + BLIS_UNPACKM_26XK_KER = 26, + BLIS_UNPACKM_27XK_KER = 27, + BLIS_UNPACKM_28XK_KER = 28, + BLIS_UNPACKM_29XK_KER = 29, + BLIS_UNPACKM_30XK_KER = 30, + BLIS_UNPACKM_31XK_KER = 31, + +} l1mkr_t; + +#define BLIS_NUM_PACKM_KERS 32 +#define BLIS_NUM_UNPACKM_KERS 32 + + typedef enum { BLIS_GEMM_UKR = 0, @@ -683,7 +757,7 @@ typedef enum // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in -// bli_ind_query.c to index into arrays. 
+// bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_HEMM, @@ -714,16 +788,14 @@ typedef enum BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension - BLIS_1F, // level-1f global fusing factor BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_VF, // level-1v vector fusing factor BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. } bszid_t; -#define BLIS_NUM_BLKSZS 13 +#define BLIS_NUM_BLKSZS 11 // @@ -784,6 +856,7 @@ typedef struct mem_s struct cntl_s { // Basic fields (usually required). + opid_t family; bszid_t bszid; void* var_func; struct cntl_s* sub_node; @@ -971,9 +1044,9 @@ typedef struct cntx_s func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; - func_t packm_ukrs; + func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; + func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; - opid_t family; ind_t method; pack_t schema_a_block; pack_t schema_b_panel; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index 5b7a70c3c..03a4d4d91 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -122,7 +122,7 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M1; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -170,7 +170,7 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M2; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -218,7 +218,7 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M3; // Clear the context fields. 
- bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -279,7 +279,7 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3MH; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -343,7 +343,7 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1A; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -391,7 +391,7 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1B; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -439,7 +439,7 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4MH; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -524,7 +524,7 @@ void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ) const ind_t method = BLIS_1M; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. 
diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 96f9add60..d3127b81f 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -41,7 +41,7 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_3M1; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -89,7 +89,7 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_4M1A; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. @@ -137,7 +137,7 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) const ind_t method = BLIS_1M; // Clear the context fields. - bli_cntx_obj_clear( cntx ); + bli_cntx_clear( cntx ); // Initialize the context with the current architecture's native // level-3 gemm micro-kernel, and its output preferences. diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 5777c5b6d..131f70973 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -207,6 +207,7 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -234,7 +235,7 @@ void bli_l3_thread_decorator thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -252,7 +253,7 @@ void bli_l3_thread_decorator ); // Free the control tree, if one was created locally. 
- bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); #ifdef PRINT_THRINFO threads[id] = thread; diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 27fb37e6a..16ef5a157 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -132,6 +132,7 @@ void* bli_l3_thread_entry( void* data_void ); typedef struct thread_data { l3int_t func; + opid_t family; obj_t* alpha; obj_t* a; obj_t* b; @@ -148,6 +149,7 @@ void* bli_l3_thread_entry( void* data_void ) { thread_data_t* data = data_void; + opid_t family = data->family; obj_t* alpha = data->alpha; obj_t* a = data->a; obj_t* b = data->b; @@ -162,13 +164,14 @@ void* bli_l3_thread_entry( void* data_void ) thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); data->func ( + family, alpha, a, b, @@ -180,7 +183,7 @@ void* bli_l3_thread_entry( void* data_void ) ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); @@ -191,6 +194,7 @@ void* bli_l3_thread_entry( void* data_void ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -217,6 +221,7 @@ void bli_l3_thread_decorator { // Set up thread data for additional threads (beyond thread 0). 
datas[id].func = func; + datas[id].family = family; datas[id].alpha = alpha; datas[id].a = a; datas[id].b = b; diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index 76b48ca95..cb0bc2ae4 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -74,6 +74,7 @@ void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, @@ -94,7 +95,7 @@ void bli_l3_thread_decorator thrinfo_t* thread; // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); // Create the root node of the thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -112,7 +113,7 @@ void bli_l3_thread_decorator ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. 
bli_l3_thrinfo_free( thread ); diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 1dde88206..2d150c656 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -218,9 +218,10 @@ siz_t bli_thread_get_range_l2r dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, n, bf, FALSE, start, end ); @@ -237,9 +238,10 @@ siz_t bli_thread_get_range_r2l dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, n, bf, TRUE, start, end ); @@ -256,9 +258,10 @@ siz_t bli_thread_get_range_t2b dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, m, bf, FALSE, start, end ); @@ -275,9 +278,10 @@ siz_t bli_thread_get_range_b2t dim_t* end ) { + num_t dt = bli_obj_datatype( *a ); dim_t m = bli_obj_length_after_trans( *a ); dim_t n = bli_obj_width_after_trans( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_get_range_sub( thr, m, bf, TRUE, start, end ); @@ -649,7 +653,7 @@ siz_t bli_thread_get_range_mdim ) { bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -708,7 +712,7 @@ siz_t bli_thread_get_range_ndim ) { bszid_t bszid = bli_cntl_bszid( cntl 
); - opid_t family = bli_cntx_get_family( cntx ); + opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires @@ -771,11 +775,12 @@ siz_t bli_thread_get_range_weighted_l2r if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) @@ -820,11 +825,12 @@ siz_t bli_thread_get_range_weighted_r2l if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) @@ -871,11 +877,12 @@ siz_t bli_thread_get_range_weighted_t2b if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. 
if ( bli_obj_has_trans( *a ) ) @@ -922,11 +929,12 @@ siz_t bli_thread_get_range_weighted_b2t if ( bli_obj_intersects_diag( *a ) && bli_obj_is_upper_or_lower( *a ) ) { + num_t dt = bli_obj_datatype( *a ); doff_t diagoff = bli_obj_diag_offset( *a ); uplo_t uplo = bli_obj_uplo( *a ); dim_t m = bli_obj_length( *a ); dim_t n = bli_obj_width( *a ); - dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); + dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( *a ) ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 9092bc84d..a88d24bc0 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -155,6 +155,7 @@ typedef void (*l3int_t) void bli_l3_thread_decorator ( l3int_t func, + opid_t family, obj_t* alpha, obj_t* a, obj_t* b, diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 993c134b4..84552b569 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -1903,7 +1903,7 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia else does_inv_diag = TRUE; // Create a control tree node for the packing operation. - cntl_t* cntl = bli_packm_cntl_obj_create + cntl_t* cntl = bli_packm_cntl_create_node ( NULL, // func ptr is not referenced b/c we don't call via l3 _int(). bli_packm_blk_var1, From 803bbef0a386dd0571ad389f69d55154dbfe3c50 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 29 Jul 2017 20:17:05 -0500 Subject: [PATCH 46/64] Fixed pthreads compile bug with previous commit. Details: - Erroneously passed family parameter into l3int_t function despite that function not taking the parameter. Oops. 
--- frame/thread/bli_thrcomm_pthreads.c | 1 - 1 file changed, 1 deletion(-) diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 16ef5a157..540e161c8 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -171,7 +171,6 @@ void* bli_l3_thread_entry( void* data_void ) data->func ( - family, alpha, a, b, From cecdc05d2834786a84ff85775d3f99a958c0765a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 31 Jul 2017 15:19:51 -0500 Subject: [PATCH 47/64] Change lsame_ signature to match lapacke. --- frame/compat/f2c/bla_lsame.c | 11 ++++++++--- frame/compat/f2c/bla_lsame.h | 6 +++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/frame/compat/f2c/bla_lsame.c b/frame/compat/f2c/bla_lsame.c index 04f8caad0..7b109ab43 100644 --- a/frame/compat/f2c/bla_lsame.c +++ b/frame/compat/f2c/bla_lsame.c @@ -41,7 +41,12 @@ -lf2c -lm (in that order) */ -bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len) + +#ifdef LAPACK_ILP64 +long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len) +#else +int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len) +#endif { /* System generated locals */ bla_logical ret_val; @@ -115,11 +120,11 @@ bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, f /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ /* upper case 'Z'. 
*/ - if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta + if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) { inta += 64; } - if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb + if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) { intb += 64; } diff --git a/frame/compat/f2c/bla_lsame.h b/frame/compat/f2c/bla_lsame.h index 7e2f92389..e8f63f488 100644 --- a/frame/compat/f2c/bla_lsame.h +++ b/frame/compat/f2c/bla_lsame.h @@ -34,6 +34,10 @@ #ifdef BLIS_ENABLE_BLAS2BLIS -bla_logical PASTEF770(lsame)(const bla_character *ca, const bla_character *cb, ftnlen ca_len, ftnlen cb_len); +#ifdef LAPACK_ILP64 +long PASTEF770(lsame)(char *ca, char *cb, long ca_len, long cb_len); +#else +int PASTEF770(lsame)(char *ca, char *cb, int ca_len, int cb_len); +#endif #endif From b01c80829907d50ec79977fba8e7b53cfe7db80a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 4 Aug 2017 14:17:44 -0500 Subject: [PATCH 48/64] Fixed a minor bug in level-3 packm management. Details: - Fixed a bug in bli_l3_packm() that caused cntl_t-cached packed mem_t entries to be released and then re-acquired unnecessarily. (In essence, the "<" operands in the conditional that guards the release-and-reacquire code block simply needed to be swapped.) The bug should have only affected performance (rather than the computed result). Thanks to Minh Quan for identifying and reporting the bug. --- frame/3/bli_l3_packm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index 28fb1f857..82383f93a 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -115,12 +115,13 @@ void bli_l3_packm // buffer, then a block has already been acquired from the memory // broker and cached in the control tree. 
- // BUT, we need to make sure that the mem_t object is not associated - // with a block that is too small given the size of the packed matrix - // that we need, according to the return value from packm_init(). + // As a sanity check, we should make sure that the mem_t object isn't + // associated with a block that is too small compared to the size of + // the packed matrix buffer that is needed, according to the return + // value from packm_init(). siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); - if ( size_needed < cntl_mem_size ) + if ( cntl_mem_size < size_needed ) { if ( bli_thread_am_ochief( thread ) ) { From 60a1eeb2317939d732b9eb6ff1e0d6d668c9a1e5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 5 Aug 2017 13:04:31 -0500 Subject: [PATCH 49/64] Added edge handling to _determine_blocksize_b(). Details: - Added explicit handling of situations where i == dim to bli_determine_blocksize_b_sub(). This isn't actually needed by any current use case within BLIS, but handling the situation is nonetheless prudent. Thanks to Minh Quan for reporting this issue and requesting the fix. --- frame/base/bli_blksz.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 63fc81711..6d27c52d5 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -363,6 +363,11 @@ dim_t bli_determine_blocksize_b_sub // chunk that will correspond to the blocksize we are computing now. dim_left_now = dim - i; + // Sanity check: if dim_left_now is zero, then we can return zero + // without going any further. + if ( dim_left_now == 0 ) + return 0; + dim_at_edge = dim_left_now % b_alg; // If dim_left_now is a multiple of b_alg, we can safely return b_alg From f86ce54d6f315006984534fe29e47a2deaacc9f5 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 10 Aug 2017 16:24:28 -0500 Subject: [PATCH 50/64] Removed trailing enum commas from bli_type_defs.h. 
Details: - Removed trailing commas from enums in bli_type_defs.h. Thanks to Erling Andersen for pointing out this inconsistency and suggesting the change. --- frame/include/bli_type_defs.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index d71d84f31..517a17b13 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -438,7 +438,7 @@ typedef enum BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, - BLIS_DT_HI = BLIS_DCOMPLEX, + BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum @@ -482,7 +482,7 @@ typedef enum BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R, + BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start @@ -511,7 +511,7 @@ typedef enum BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, - BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE, + BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; @@ -590,7 +590,7 @@ typedef enum BLIS_4M1B, BLIS_4M1A, BLIS_1M, - BLIS_NAT, + BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) @@ -613,7 +613,7 @@ typedef enum BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, - BLIS_XPBYV_KER, + BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 @@ -625,7 +625,7 @@ typedef enum BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER, + BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 @@ -697,7 +697,7 @@ typedef enum BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, 
BLIS_UNPACKM_30XK_KER = 30, - BLIS_UNPACKM_31XK_KER = 31, + BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; @@ -711,7 +711,7 @@ typedef enum BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR, + BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 @@ -722,7 +722,7 @@ typedef enum BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, - BLIS_NOTAPPLIC_UKERNEL, + BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 @@ -736,7 +736,7 @@ typedef enum BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, - BLIS_PR_IDX, + BLIS_PR_IDX } thridx_t; #endif @@ -770,7 +770,7 @@ typedef enum BLIS_TRMM, BLIS_TRSM, - BLIS_NOID, + BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 10 @@ -792,7 +792,7 @@ typedef enum BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. + BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 @@ -1065,7 +1065,7 @@ typedef struct cntx_s typedef enum { BLIS_NO_ERROR_CHECKING = 0, - BLIS_FULL_ERROR_CHECKING, + BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum From 7dc78b49f97e6b3cd6d72fcdc588ace534d0e700 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 15 Aug 2017 10:02:25 -0500 Subject: [PATCH 51/64] Add vzeroupper to Intel AVX kernels. 
--- kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c | 4 ++++ kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c | 4 ++++ kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c | 8 ++++++++ kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c | 8 ++++++++ kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c | 10 +++++++++- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c index 5bc2dd4ba..2088e030a 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d12x4.c @@ -634,6 +634,8 @@ void bli_sgemm_asm_24x4 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1253,6 +1255,8 @@ void bli_dgemm_asm_12x4 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c index c92612b07..5eb0f0732 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d4x12.c @@ -600,6 +600,8 @@ void bli_sgemm_asm_4x24 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1186,6 +1188,8 @@ void bli_dgemm_asm_4x12 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index 5bd2d92e5..78b294053 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -595,6 +595,8 @@ void bli_sgemm_asm_6x16 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1175,6 +1177,8 @@ void bli_dgemm_asm_6x8 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1702,6 +1706,8 @@ void bli_cgemm_asm_3x8 " \n\t" " \n\t" ".CDONE: 
\n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2228,6 +2234,8 @@ void bli_zgemm_asm_3x4 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c index f173947c3..9796e27ef 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d8x6.c @@ -596,6 +596,8 @@ void bli_sgemm_asm_16x6 " \n\t" " \n\t" ".SDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1176,6 +1178,8 @@ void bli_dgemm_asm_8x6 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -1703,6 +1707,8 @@ void bli_cgemm_asm_8x3 " \n\t" " \n\t" ".CDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2229,6 +2235,8 @@ void bli_zgemm_asm_4x3 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) diff --git a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c index f8db398ca..f19f053fc 100644 --- a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c +++ b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c @@ -991,7 +991,9 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" ".SDONE: \n\t" - " \n\t" + " \n\t" + "vzeroupper \n\t" + " \n\t" : // output operands (none) : // input operands @@ -1658,6 +1660,8 @@ void bli_dgemm_asm_8x4 " \n\t" " \n\t" ".DDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -2611,6 +2615,8 @@ void bli_cgemm_asm_8x4 " \n\t" " \n\t" ".CDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) @@ -3453,6 +3459,8 @@ void bli_zgemm_asm_4x4 " \n\t" " \n\t" ".ZDONE: \n\t" + " \n\t" + "vzeroupper \n\t" " \n\t" : // output operands (none) From d1ee776202b26874333af7a91b6d2686342c4c81 Mon Sep 17 00:00:00 2001 From: sthangar Date: Wed, 23 Aug 2017 13:01:14 
+0530 Subject: [PATCH 52/64] Adding auto hardware detection for Zen Change-Id: I40ce6705dd66b35000c4ccddffad1c5b65998caf --- build/auto-detect/cpuid_x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/build/auto-detect/cpuid_x86.c b/build/auto-detect/cpuid_x86.c index 07ca004f0..72630f471 100644 --- a/build/auto-detect/cpuid_x86.c +++ b/build/auto-detect/cpuid_x86.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2015, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -47,6 +48,7 @@ #define CPUNAME_KNC 5 #define CPUNAME_BULLDOZER 6 #define CPUNAME_PILEDRIVER 7 +#define CPUNAME_ZEN 8 static char *cpuname[] = { "reference", @@ -57,6 +59,7 @@ static char *cpuname[] = { "mic", "bulldozer", "piledriver", + "zen", }; #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -227,6 +230,14 @@ int cpu_detect() else return CPUNAME_REFERENCE; //OS don't support AVX. } + case 8: + switch (model){ + case 1: + if(support_avx()) + return CPUNAME_ZEN; + else + return CPUNAME_REFERENCE; //OS don't support AVX. + } } break; } From e056d810d16621891ead032603de0c2105cfc0f7 Mon Sep 17 00:00:00 2001 From: sthangar Date: Mon, 28 Aug 2017 16:44:42 +0530 Subject: [PATCH 53/64] Bug fix for the testsuite build failing Change-Id: I7cd8c9d187387c48b2564e45cbfb8df985e93d77 --- frame/2/bli_l2_cntx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index dc5020c8d..fc3624495 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -83,9 +83,8 @@ void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ { \ /* Free the context and all memory allocated to it. 
*/ \ - bli_cntx_obj_free( cntx ); \ + bli_cntx_free( cntx ); \ } - GENFRONT( gemv ) #undef GENFRONT @@ -94,7 +93,7 @@ GENFRONT( gemv ) void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ - bli_cntx_obj_create( cntx ); \ + bli_cntx_create( cntx ); \ \ /* Initialize the context with kernels employed by the current operation. */ \ From 8e917b256ca2d4bcdc059fe98d86be8775c69561 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 9 Sep 2017 14:10:15 -0500 Subject: [PATCH 54/64] Updated bibtex info for BLIS5 (3m4m) article. --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6b0389bae..01dd2958a 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![Build Status](https://travis-ci.org/flame/blis.svg?branch=master)](https://travis-ci.org/flame/blis) + Introduction ------------ @@ -285,10 +286,15 @@ A fifth paper, submitted to ACM TOMS, begins the study of so-called ``` @article{BLIS5, author = {Field G. {V}an~{Z}ee and Tyler Smith}, - title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods}, + title = {Implementing High-performance Complex Matrix Multiplication via the 3m and 4m Methods}, journal = {ACM Transactions on Mathematical Software}, + volume = {44}, + number = {1}, + pages = {7:1--7:36}, + month = jul, year = {2017}, - note = {accepted} + issue_date = {July 2017}, + url = {http://doi.acm.org/10.1145/3086466}, } ``` @@ -298,7 +304,7 @@ article and derives a [superior induced method](http://www.cs.utexas.edu/users/f ``` @article{BLIS6, author = {Field G. 
{V}an~{Z}ee}, - title = {Implementing high-performance complex matrix multiplication via the 1m method}, + title = {Implementing High-Performance Complex Matrix Multiplication via the 1m Method}, journal = {ACM Transactions on Mathematical Software}, note = {submitted} } From f5962a1aae0fb3c9be104d0035c0d73210e7f670 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 26 Sep 2017 17:00:04 -0500 Subject: [PATCH 55/64] Fixed bugs in gemm/gemmtrsm ukr tests in testsuite. Details: - Fixed a bug in gemmtrsm test module that was due to improper partitioning into a k x k triangular matrix for the purposes of obtaining an mr x k micropanel of A with which to test. - Fixed a bug in gemm and gemmtrsm test modules that would only manifest for very large k (depending on the product of mr x kc on that architecture). The bug arose from the fact that the test module was triggering the allocation of blocks from the internal memory pools, which are limited in size. This allocation imposes an implicit assumption that the micropanel being tested with will fit inside, and this assumption is violated for large values of k. Arbitrarily large k may now be tested for both operation tests. - Added OpenMP/pthread critical sections around the setting or getting of statuses from the induced method operation lookup table in bli_l3_ind.c. - Added the 'static' keyword to all pthread_mutex_t global variables in BLIS. - Thanks to Nisanth Padinharepatt of AMD for reporting the first and third issues. 
--- frame/base/bli_init.c | 2 +- frame/base/bli_memsys.c | 2 +- frame/ind/bli_l3_ind.c | 43 +++++++++++++++++++-- testsuite/src/test_gemm_ukr.c | 35 +++++++++++++++++ testsuite/src/test_gemmtrsm_ukr.c | 63 +++++++++++++++++++++++++++---- 5 files changed, 133 insertions(+), 12 deletions(-) diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index db598cede..3a9fb55e9 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -35,7 +35,7 @@ #include "blis.h" #ifdef BLIS_ENABLE_PTHREADS -pthread_mutex_t initialize_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t initialize_mutex = PTHREAD_MUTEX_INITIALIZER; #endif static bool_t bli_is_init = FALSE; diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index 06cbae587..8bb8c7263 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -36,7 +36,7 @@ #include "blis.h" #ifdef BLIS_ENABLE_PTHREADS -pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; #endif static membrk_t global_membrk; diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index e694f5384..0bf624a1e 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -200,6 +200,10 @@ void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool_t status ) // ----------------------------------------------------------------------------- +#ifdef BLIS_ENABLE_PTHREADS +static pthread_mutex_t l3_ind_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool_t status ) { num_t idt; @@ -212,14 +216,47 @@ void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool_t sta idt = bli_ind_map_cdt_to_index( dt ); - bli_l3_ind_oper_st[ method ][ oper ][ idt ] = status; +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (l3_ind)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &l3_ind_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + 
bli_l3_ind_oper_st[ method ][ oper ][ idt ] = status; + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &l3_ind_mutex ); +#endif } bool_t bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ) { - num_t idt = bli_ind_map_cdt_to_index( dt ); + num_t idt = bli_ind_map_cdt_to_index( dt ); + bool_t r_val; - return bli_l3_ind_oper_st[ method ][ oper ][ idt ]; +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (l3_ind)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &l3_ind_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + r_val = bli_l3_ind_oper_st[ method ][ oper ][ idt ]; + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &l3_ind_mutex ); +#endif + + return r_val; } // ----------------------------------------------------------------------------- diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index f418ac6e5..d5061227a 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -162,6 +162,7 @@ void libblis_test_gemm_ukr_experiment double time; dim_t m, n, k; + inc_t ldap, ldbp; char sc_a = 'c'; char sc_b = 'r'; @@ -182,6 +183,11 @@ void libblis_test_gemm_ukr_experiment m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, &cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, &cntx ); + // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, + // respectively. + ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, &cntx ); + ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, &cntx ); + // Store the register blocksizes so that the driver can retrieve the // values later when printing results. op->dim_aux[0] = m; @@ -220,6 +226,7 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); +#if 0 // Create pack objects for a and b, and pack them to ap and bp, // respectively. 
cntl_t* cntl_a = libblis_test_pobj_create @@ -242,6 +249,32 @@ void libblis_test_gemm_ukr_experiment &b, &bp, &cntx ); +#endif + + // Create the packed objects. Use packmr and packnr as the leading + // dimensions of ap and bp, respectively. + bli_obj_create( datatype, m, k, 1, ldap, &ap ); + bli_obj_create( datatype, k, n, ldbp, 1, &bp ); + + // Set up the objects for packing. Calling packm_init_pack() does everything + // except checkout a memory pool block and save its address to the obj_t's. + // However, it does overwrite the buffer field of packed object with that of + // the source object. So, we have to save the buffer address that was + // allocated. + void* buf_ap = bli_obj_buffer( ap ); + void* buf_bp = bli_obj_buffer( bp ); + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, + BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, + BLIS_MR, BLIS_KR, &a, &ap, &cntx ); + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, + BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, + BLIS_KR, BLIS_NR, &b, &bp, &cntx ); + bli_obj_set_buffer( buf_ap, ap ); + bli_obj_set_buffer( buf_bp, bp ); + + // Pack the data from the source objects. + bli_packm_blk_var1( &a, &ap, &cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -267,10 +300,12 @@ void libblis_test_gemm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); +#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); +#endif // Free the test objects. 
bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 172ff053a..1232a7c12 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -181,6 +181,7 @@ void libblis_test_gemmtrsm_ukr_experiment double time; dim_t m, n, k; + inc_t ldap, ldbp; char sc_a = 'c'; char sc_b = 'r'; @@ -207,6 +208,11 @@ void libblis_test_gemmtrsm_ukr_experiment m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, &cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, &cntx ); + // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, + // respectively. + ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, &cntx ); + ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, &cntx ); + // Store the register blocksizes so that the driver can retrieve the // values later when printing results. op->dim_aux[0] = m; @@ -249,17 +255,24 @@ void libblis_test_gemmtrsm_ukr_experiment // Normalize B and save. libblis_test_mobj_randomize( params, TRUE, &b ); - // Use the last m rows of A_big as A. - bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &a_big, &a ); - - // Locate the B11 block of B, copy to C11, and save. - if ( bli_obj_is_lower( a ) ) + // Locate A1x/A11 (lower) or Ax1/A11 (upper), and then locate the + // corresponding B11 block of B. + if ( bli_obj_is_lower( a_big ) ) + { + bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &a_big, &a ); bli_acquire_mpart_t2b( BLIS_SUBPART1, k, m, &b, &b11 ); + } else + { + bli_acquire_mpart_t2b( BLIS_SUBPART1, 0, m, &a_big, &a ); bli_acquire_mpart_t2b( BLIS_SUBPART1, 0, m, &b, &b11 ); + } + + // Copy B11 to C11, and save. bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); +#if 0 // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -282,12 +295,42 @@ void libblis_test_gemmtrsm_ukr_experiment &b, &bp, &cntx ); +#endif + + // Create the packed objects. 
Use packmr and packnr as the leading + // dimensions of ap and bp, respectively. + bli_obj_create( datatype, m, k+m, 1, ldap, &ap ); + bli_obj_create( datatype, k+m, n, ldbp, 1, &bp ); + + // Set up the objects for packing. Calling packm_init_pack() does everything + // except checkout a memory pool block and save its address to the obj_t's. + // However, it does overwrite the buffer field of packed object with that of + // the source object. So, we have to save the buffer address that was + // allocated. + void* buf_ap = bli_obj_buffer( ap ); + void* buf_bp = bli_obj_buffer( bp ); + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, + BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, + BLIS_MR, BLIS_KR, &a, &ap, &cntx ); + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, + BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, + BLIS_KR, BLIS_NR, &b, &bp, &cntx ); + bli_obj_set_buffer( buf_ap, ap ); + bli_obj_set_buffer( buf_bp, bp ); + + // Set the diagonal offset of ap. + if ( bli_is_lower( uploa ) ) { bli_obj_set_diag_offset( k, ap ); } + else { bli_obj_set_diag_offset( 0, ap ); } // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _make_subparts() routine needs this information // to know how to initialize the subpartitions. bli_obj_set_uplo( uploa, ap ); + // Pack the data from the source objects. + bli_packm_blk_var1( &a, &ap, &cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + // Create subpartitions from the a and b panels. bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &a1xp, &a11p, &bx1p, &b11p ); @@ -297,13 +340,17 @@ void libblis_test_gemmtrsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, a11p ); +//bli_printm( "a", &a, "%4.1f", "" ); +//bli_printm( "ap", &ap, "%4.1f", "" ); + // Repeat the experiment n_repeats times and record results. 
for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); - // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + // Re-pack (restore) the contents of b to bp. + //bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); time = bli_clock(); @@ -325,10 +372,12 @@ void libblis_test_gemmtrsm_ukr_experiment // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c11, perf, resid ); +#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); +#endif // Free the test objects. bli_obj_free( &a_big ); From e02d3cb84190a345ebe9b32f53db03a1838976b1 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 26 Sep 2017 19:02:53 -0500 Subject: [PATCH 56/64] Fixed a pthread typo in previous commit. Details: - Misnamed 'pthread_mutex_t' type in bli_memsys.c as 'thread_mutex_t'. 
--- frame/base/bli_memsys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index 8bb8c7263..f60412d40 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -36,7 +36,7 @@ #include "blis.h" #ifdef BLIS_ENABLE_PTHREADS -static thread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; #endif static membrk_t global_membrk; From 06e0e6351acb9481225975ad9a4e0b8925336621 Mon Sep 17 00:00:00 2001 From: sthangar Date: Thu, 28 Sep 2017 12:15:36 +0530 Subject: [PATCH 57/64] The inner loop parallelization is turned off by default, the JR and IR loop parameters are set to 1 by default Change-Id: I8c3c2ecbbd636259f6ffb92768ec04148205c3e5 --- config/zen/bli_kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config/zen/bli_kernel.h b/config/zen/bli_kernel.h index 705a6f363..072d6104a 100644 --- a/config/zen/bli_kernel.h +++ b/config/zen/bli_kernel.h @@ -49,6 +49,13 @@ // (b) MR (for zero-padding purposes when MR and NR are "swapped") // +// threading related +// By default it is effective to paralleize the +// outerloops. Setting these macros to 1 will force +// JR and NR inner loops to be not paralleized. +#define BLIS_DEFAULT_MR_THREAD_MAX 1 +#define BLIS_DEFAULT_NR_THREAD_MAX 1 + // sgemm micro-kernel #if 0 From 0f5ce26fc597cda6e8ae93a7526f52eb8cba01e9 Mon Sep 17 00:00:00 2001 From: Nisanth M P Date: Mon, 16 Oct 2017 21:07:50 +0530 Subject: [PATCH 58/64] Thread safety: Make the global induced method status array local to thread BLIS retains a global status array for induced methods, and provides APIs to modify this state during runtime. So, one application thread can modify the state, before another starts the corresponding BLIS operation. This patch solves this issue by making the induced method status array local to threads.
Change-Id: Iff59b6f473771344054c010b4eda51b7aa4317fe --- frame/include/bli_macro_defs.h | 14 ++++++++++++++ frame/ind/bli_l3_ind.c | 6 +++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index d99be2345..a332554cc 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -64,6 +64,20 @@ #endif +// -- BLIS Thread Local Storage Keyword -- + +// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. +// There is a small risk here as __GNUC__ can also be defined by some other +// compiler (other than ICC and CLANG which we know define it) that +// doesn't support __thread, as __GNUC__ is not quite unique to GCC. +// But the possibility of someone using such non-main-stream compiler +// for building BLIS is low. +#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) + #define BLIS_THREAD_LOCAL __thread +#else + #define BLIS_THREAD_LOCAL +#endif + // -- Boolean values -- #ifndef TRUE diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index e694f5384..cedf40d10 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -60,7 +60,11 @@ static void* bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = // // NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2. // -static bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = +// BLIS provides APIs to modify this state during runtime. So, one application thread +// can modify the state, before another starts the corresponding BLIS operation. +// This is solved by making the induced method status array local to threads. 
+ +static BLIS_THREAD_LOCAL bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = { /* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */ /* c z */ From 4607aac297e55ad540cbe5fffbe02e6b1889c181 Mon Sep 17 00:00:00 2001 From: Nisanth M P Date: Mon, 16 Oct 2017 22:06:57 +0530 Subject: [PATCH 59/64] Thread Safety: Move bli_init() before and bli_finalize() after main() BLIS provides APIs to initialize and finalize its global context. One application thread can finalize BLIS, while other threads in the application are still using BLIS. This issue can be solved by removing bli_finalize() from API. One way to do this is by getting bli_finalize() to execute by default after application exits from main(). GCC supports this behaviour with the help of __attribute__((destructor)) added to the function that needs to be executed after main exits. Similarly bli_init() can be made to run before application enters main() so that application need not call it. Change-Id: I7ce6cfa28b384e92c0bdf772f3baea373fd9feac --- frame/base/bli_init.c | 11 ++++++++--- frame/include/bli_macro_defs.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index db598cede..90bc51fa0 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -40,8 +40,10 @@ pthread_mutex_t initialize_mutex = PTHREAD_MUTEX_INITIALIZER; static bool_t bli_is_init = FALSE; - -err_t bli_init( void ) +// If BLIS is built using a compiler that supports __attribute__((constructor)), +// then bli_init() will be executed before the application enters main(). +// In that case there is no need to call bli_init() in the application code.
+BLIS_ATTRIB_CTOR err_t bli_init( void ) { err_t r_val = BLIS_FAILURE; @@ -105,7 +107,10 @@ err_t bli_init( void ) return r_val; } -err_t bli_finalize( void ) +// If BLIS is built using a compiler that supports __attribute__((destrutor)), +// then bli_finalize() will be executed after the application exits main(). +// In that case there is no need to call bli_finalize() in the application code. +BLIS_ATTRIB_DTOR err_t bli_finalize( void ) { err_t r_val = BLIS_FAILURE; diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index a332554cc..f5abe7902 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -78,6 +78,34 @@ #define BLIS_THREAD_LOCAL #endif + +// -- BLIS constructor/destructor function attribute -- + +// __attribute__((constructor/destructor)) is supported by GCC only. +// There is a small risk here as __GNUC__ can also be defined by some other +// compiler (other than ICC and CLANG which we know define it) that +// doesn't support this, as __GNUC__ is not quite unique to GCC. +// But the possibility of someone using such non-main-stream compiler +// for building BLIS is low. + +#if defined(__ICC) || defined(__INTEL_COMPILER) + // ICC defines __GNUC__ but doesn't support this + #define BLIS_ATTRIB_CTOR + #define BLIS_ATTRIB_DTOR +#elif defined(__clang__) + // CLANG supports __attribute__, but doesn't mention support for + // constructor/destructor. If we can confirm that CLANG supports + // this attribute, modify it to proper definition + #define BLIS_ATTRIB_CTOR + #define BLIS_ATTRIB_DTOR +#elif defined(__GNUC__) + #define BLIS_ATTRIB_CTOR __attribute__((constructor)) + #define BLIS_ATTRIB_DTOR __attribute__((destructor)) +#else + #define BLIS_ATTRIB_CTOR + #define BLIS_ATTRIB_DTOR +#endif + // -- Boolean values -- #ifndef TRUE From 375342799cbae981c28d831793af588d7951f3f6 Mon Sep 17 00:00:00 2001 From: "Field G. 
Van Zee" Date: Wed, 18 Oct 2017 13:41:25 -0500 Subject: [PATCH 60/64] Removed a duplicate bli_avx512_macros.h header. Details: - Removed a duplicate header file that was causing problems during installation for the 'knl' configuration. Thanks to Victor Eijkhout for reporting this issue. --- kernels/x86_64/knl/1m/bli_avx512_macros.h | 1 - 1 file changed, 1 deletion(-) delete mode 120000 kernels/x86_64/knl/1m/bli_avx512_macros.h diff --git a/kernels/x86_64/knl/1m/bli_avx512_macros.h b/kernels/x86_64/knl/1m/bli_avx512_macros.h deleted file mode 120000 index 3b1d1779e..000000000 --- a/kernels/x86_64/knl/1m/bli_avx512_macros.h +++ /dev/null @@ -1 +0,0 @@ -../3/bli_avx512_macros.h \ No newline at end of file From 3eb44f67618b91ae5f5f0aaaba67e38f16042ee4 Mon Sep 17 00:00:00 2001 From: Nisanth M P Date: Tue, 24 Oct 2017 16:36:36 +0530 Subject: [PATCH 61/64] Adding __attribute__((constructor/destructor)) for CLANG case. CLANG supports __attribute__, but its documentation doesn't mention support for constructor/destructor. Compiling with clang and testing shows that it does support this. Change-Id: Ie115b20634c26bda475cc09c20960d687fb7050b --- frame/include/bli_macro_defs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index f5abe7902..1162a7e1e 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -93,11 +93,11 @@ #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) - // CLANG supports __attribute__, but doesn't mention support for - // constructor/destructor. If we can confirm that CLANG supports - // this attribute, modify it to proper definition - #define BLIS_ATTRIB_CTOR - #define BLIS_ATTRIB_DTOR + // CLANG supports __attribute__, but its documentation doesn't + // mention support for constructor/destructor. Compiling with + // clang and testing shows that it does support. 
+ #define BLIS_ATTRIB_CTOR __attribute__((constructor)) + #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) From f60c827ba95f452c8454fb914f5564f4895bf644 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 30 Oct 2017 10:04:42 -0500 Subject: [PATCH 62/64] Fix CVECFLAGS for bulldozer config. --- config/bulldozer/make_defs.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 582354e96..df472c292 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -71,7 +71,7 @@ else COPTFLAGS := -O2 -malign-double -funroll-all-loops endif -CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse +CVECFLAGS := -mavx -mfma4 -march=bdver1 -mfpmath=sse CKOPTFLAGS := $(COPTFLAGS) # --- Determine the archiver and related flags --- From ab57b979046479bcda7f83165838a80117c2ad95 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 1 Nov 2017 11:51:41 -0500 Subject: [PATCH 63/64] Revert to default SIMD alignment for bulldozer. Details: - Removed the default-overriding #define of BLIS_SIMD_ALIGN_SIZE set in config/bulldozer/bli_kernel.h. Not sure where this value came from, but it would seem to allow for insufficient starting address alignment for any matrices created via bli_malloc_user(), such as via bli_obj_create(). Thanks to Rene Sitt for reporting the behavior that led us to this bug. - This commit is a manual patch of the same fix made to the 'rt' branch in 8f150f2. 
--- config/bulldozer/bli_kernel.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/config/bulldozer/bli_kernel.h b/config/bulldozer/bli_kernel.h index d7baa59e8..b750b6da2 100644 --- a/config/bulldozer/bli_kernel.h +++ b/config/bulldozer/bli_kernel.h @@ -38,8 +38,6 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -#define BLIS_SIMD_ALIGN_SIZE 16 - // -- Cache blocksizes -- // From fe71c06e42b072407c83112779055b0afb67173d Mon Sep 17 00:00:00 2001 From: Nisanth M P Date: Wed, 15 Nov 2017 11:11:17 +0530 Subject: [PATCH 64/64] Added AMD copyright line to the changed files in last 3 commits Change-Id: I37d5dbbbe1b199e07529610a5e9cc9e49d067c66 --- frame/base/bli_init.c | 1 + frame/include/bli_macro_defs.h | 1 + frame/ind/bli_l3_ind.c | 1 + 3 files changed, 3 insertions(+) diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 90bc51fa0..b1a234dc3 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 1162a7e1e..f44c6294c 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index cedf40d10..7fd64c2bd 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2017, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are