diff --git a/CHANGELOG b/CHANGELOG index a361ceac3..c9a04cbde 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,18 +1,706 @@ -commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (HEAD -> master, tag: 0.2.1) +commit 940a707ac78de975110e17c95765e65b89aa5e10 (HEAD -> master, tag: 0.2.2) +Author: Field G. Van Zee +Date: Tue May 2 16:38:42 2017 -0500 + + Version file update (0.2.2) + +commit d5a5e003ea9b24bb6abf12e88862e8eb61ffb03d (origin/master, origin/HEAD, origin/1m, 1m) +Author: Field G. Van Zee +Date: Tue May 2 15:48:30 2017 -0500 + + Fixed a trsm1m bug that affected right-side cases. + + Details: + - Fixed a bug introduced in 1c732d3 that affected trsm1m_r. The result + was nondeterministic behavior (usually segmentation faults) for certain + problem sizes beyond the 1m instance of kc (e.g. 128 on haswell). The + cause of the bug was my commenting out lines in bli_gemm1m_ukr_ref.c + which explicitly directed the virtual gemm micro-kernel to use temporary + space if the storage preference of the [real domain] gemm ukernel did + not match the storage of the output matrix C. In the context of gemm, + this handling is not needed because agreement between the storage pref + and the matrix is guaranteed by a high-level optimization in BLIS. + However, this optimization is not applied to trsm because the storage + of C is not necessarily the same as the storage of the micro-panels of + B--both of which are updated by the micro-kernel during a trsm + operation. Thus, the guarantee of storage/preference agreement is not + in place for trsm, which means we must handle that case within the + virtual gemm micro-kernel. + - Comment updates and a minor macro change to bli_trsm*_cntx_init() for + 3m1, 4m1a, and 1m. + +commit e80993e71f4d571e9650a8e90ed386e32059eae5 +Merge: a509fbd5 ca3a7924 +Author: Field G. Van Zee +Date: Tue May 2 12:30:28 2017 -0500 + + Merge branch 'master' into 1m + +commit ca3a7924770d6cf203cce4ca9f5482e1d0d4e961 +Author: Field G. 
Van Zee +Date: Tue May 2 12:09:39 2017 -0500 + + README.md update. + + Details: + - Updated bibtex entries for 4th BLIS paper, and adds entries for 5th + and 6th BLIS papers. + +commit 6e7de6ef84babb273dc5528a9b9d01f0febe394b +Author: Field G. Van Zee +Date: Fri Mar 17 12:10:24 2017 -0500 + + Minor updates to test/3m4m. + + Details: + - Updated initial problem size and increment in Makefile. + - Updated code in test_gemm.c to correctly query kc from context. + +commit f484c6cd4389dc7ae5b972849e12e98ad5bbf9a4 +Author: Field G. Van Zee +Date: Fri Mar 17 12:07:27 2017 -0500 + + Whitespace reformatting to armv8a kernels file. + + Details: + - Updated formatting of function signature/header in + kernels/armv8a/3/bli_gemm_opt_4x4.c. + +commit a509fbd5ac04fafd4e51b43d2f59ca56432dc212 +Merge: 69b4846a 513944e4 +Author: Field G. Van Zee +Date: Tue Feb 21 17:06:16 2017 -0600 + + Merge branch 'master' into 1m + +commit 69b4846ae9adb157c4171b52e159684db2867853 +Author: Field G. Van Zee +Date: Tue Feb 21 15:33:39 2017 -0600 + + Disabled experiment-related 1m code. + + Details: + - Commented out code in frame/ind/oapi/bli_l3_3m4m1m_oapi.c that was + specifically inserted to facilitate the benchmarking of 1m block-panel + and panel-block algorithms. + - Updates to test/3m4m/Makefile, runme.sh script, and test_gemm.c to + reflect changes used/needed during benchmarking. + +commit 513944e4a951d8823b4de161b86ad7a965b4d99b +Merge: 8b462a0e 0e18f68c +Author: Devin Matthews +Date: Mon Feb 20 10:04:33 2017 -0500 + + Merge pull request #118 from devinamatthews/master + + Handle k=0 correctly in KNL dgemm ukernel. + +commit 0e18f68cf12eb9189ba901a20040b1cdae417670 +Author: Devin Matthews +Date: Mon Feb 20 09:03:21 2017 -0600 + + Handle k=0 correctly in KNL dgemm ukernel. 
+ +commit 8b462a0e8c3e9252f0401940849e53cc772256fa +Merge: c362afc5 7d42fc07 +Author: Devin Matthews +Date: Sun Feb 19 23:03:03 2017 -0500 + + Merge pull request #117 from devinamatthews/master + + Cast dim_t and inc_t parameters to 64-bit in KNL microkernels. + +commit 7d42fc0796ef0c010375fd8e59b1240ba41ce4d2 +Author: Devin Matthews +Date: Sun Feb 19 21:10:55 2017 -0500 + + Cast dim_t and inc_t parameters to 64-bit in KNL microkernels. + +commit c362afc525bab4050581d1b0fcea2fe4d582c608 +Author: Field G. Van Zee +Date: Thu Feb 9 11:54:59 2017 -0600 + + Added missing "level-0" BLAS [sd]cabs1_(). + + Details: + - Fixed issue #115 by adding implementations for scabs1_() and dcabs1_() + to the BLAS compatibility layer. Thanks to heroxbd for pointing out + their absence. + +commit 018180c938c32efbeaaf626ba71ec5b780664db1 +Author: Field G. Van Zee +Date: Wed Feb 8 11:20:52 2017 -0600 + + Fixed a minor bug in configure (issue #114). + + Details: + - Fixed a bug in the configure script whereby a non-preferred value for + --enable-threading would cause problems in common.mk vis-a-vis detecting + which threading model was chosen. Thanks to heroxbd for reporting this + issue. + +commit ddf45e71770c55ea4a58ca24ea4913fe5d8beb9b +Merge: a6ab91bc 78e1b16e +Author: Devin Matthews +Date: Fri Jan 27 14:25:40 2017 -0600 + + Merge pull request #113 from devinamatthews/knl_thread_params + + Change default threading parameters for KNL. + +commit 78e1b16e16d589ed31b2e712115ee282097f114d +Author: Devin Matthews +Date: Fri Jan 27 14:22:20 2017 -0600 + + Change default threading parameters for KNL. + +commit 1c732d3ddc4ac0861d3b0e0dd15eb7e071615502 +Author: Field G. Van Zee +Date: Wed Jan 25 16:25:46 2017 -0600 + + Added 1m-specific APIs for bp, pb gemm algorithms. + + Details: + - Defined bli_gemmbp_cntl_create(), bli_gemmpb_cntl_create(), with the + body of bli_gemm_cntl_create() replaced with a call to the former. + - Defined bli_cntl_free_w_thrinfo(), bli_cntl_free_wo_thrinfo(). 
Now, + bli_cntl_free() can check if the thread parameter is NULL, and if so, + call the latter, and otherwise call the former. + - Defined bli_gemm1mbp_cntx_init(), bli_gemm1mpb_cntx_init(), both in + terms of bli_gemm1mxx_cntx_init(), which behaves the same as + bli_gemm1m_cntx_init() did before, except that an extra bool parameter + (is_pb) is used to support both bp and pb algorithms (including to + support the anti-preference field described below). + - Added support for "anti-preference" in context. The anti_pref field, + when true, will toggle the boolean return value of routines such as + bli_cntx_l3_ukr_eff_prefers_storage_of(), which has the net effect of + causing BLIS to transpose the operation to achieve disagreement (rather + than agreement) between the storage of C and the micro-kernel output + preference. This disagreement is needed for panel-block implementations, + since they induce a transposition of the suboperation immediately before + the macro-kernel is called, which changes the apparent storage of C. For + now, anti-preference is used only with the pb algorithm for 1m (and not + with any other non-1m implementation). + - Defined new functions, + bli_cntx_l3_ukr_eff_prefers_storage_of() + bli_cntx_l3_ukr_eff_dislikes_storage_of() + bli_cntx_l3_nat_ukr_eff_prefers_storage_of() + bli_cntx_l3_nat_ukr_eff_dislikes_storage_of() + which are identical to their non-"eff" (effectively) counterparts except + that they take the anti-preference field of the context into account. + - Explicitly initialize the anti-pref field to FALSE in + bli_gks_cntx_set_l3_nat_ukr_prefs(). + - Added bli_gemm_ker_var1.c, which implements a panel-block macro-kernel + in terms of the existing block-panel macro-kernel _ker_var2(). This + technique requires inducing transposes on all operands and swapping + the A and B. + - Changed bli_obj_induce_trans() macro so that pack-related fields are + also changed to reflect the induced transposition. 
+ - Added a temporary hack to bli_l3_3m4m1m_oapi.c that allows us to easily + specify the 1m algorithm (block-panel or panel-block). + - Renamed the following cntx_t-related macros: + bli_cntx_get_pack_schema_a() -> bli_cntx_get_pack_schema_a_block() + bli_cntx_get_pack_schema_b() -> bli_cntx_get_pack_schema_b_panel() + bli_cntx_get_pack_schema_c() -> bli_cntx_get_pack_schema_c_panel() + and updated all instantiations. Also updated the field names in the + cntx_t struct. + - Comment updates. + +commit a6ab91bc61432490fadf18d596de4589645f37dd +Merge: 145a551d 7f31a630 +Author: Field G. Van Zee +Date: Wed Nov 30 09:26:58 2016 -0600 + + Merge pull request #111 from figual/master + + Fixed missing cntx argument in ARMv8 microkernels. + +commit 7f31a6307b7bd35f913c895947552c3a176f789b +Author: Francisco Igual +Date: Sun Nov 27 14:40:47 2016 +0100 + + Fixed missing cntx argument in ARMv8 microkernels. + +commit 126482a3b609b9ad7026ba348f6c4bf6a29be8a1 +Author: Field G. Van Zee +Date: Fri Nov 25 18:29:49 2016 -0600 + + Implemented the 1m method. + + Details: + - Implemented the 1m method for inducing complex domain matrix + multiplication. 1m support has been added to all level-3 operations, + including trsm, and is now the default induced method when native + complex domain gemm microkernels are omitted from the configuration. + - Updated _cntx_init() operations to take a datatype parameter. This was + needed for the corresponding function for 1m (because 1m requires us + to choose between column-oriented or row-oriented execution, which + requires us to query the context for the storage preference of the + gemm microkernel, which requires knowing the datatype) but I decided + that it made sense for consistency to add the parameter to all other + cntx initialization functions as well, even though those functions + don't use the parameter. + - Updated bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs() to take + a second scalar for each blocksize entry. 
The semantic meaning of the + two scalars now is that the first will scale the default blocksize + while the second will scale the maximum blocksize. This allows scaling + the two independently, and was needed to support 1m, which requires + scaling for a register blocksize but not the register storage + blocksize (ie: "packdim") analogue. + - Deprecated bli_blksz_reduce_dt_to() and defined two new functions, + bli_blksz_reduce_def_to() and bli_blksz_reduce_max_to(), for reducing + default and maximum blocksizes to some desired blocksize multiple. + These functions are needed in the updated definitions of + bli_cntx_set_blkszs() and bli_gks_cntx_set_blkszs(). + - Added support for the 1e and 1r packing schemas to packm, including + 1e/1r packing kernels. + - Added a minor optimization to bli_gemm_ker_var2() that allows, under + certain circumstances (specifically, real domain beta and row- or + column-stored matrix C), the real domain macrokernel and microkernel + to be called directly, rather than using the virtual microkernel + via the complex domain macrokernel, which carries a slight additional + amount of overhead. + - Added 1m support to the testsuite. + - Added 1m support to Makefile and runme.sh in test/3m4m. Also simplified + some code in test_gemm.c driver. + +commit 145a551d524ae5492667a05fc248923d922df850 +Author: Field G. Van Zee +Date: Wed Nov 23 17:59:06 2016 -0600 + + Switched to simpler trsm_r implementation. + + Details: + - Disabled the implementation of trsm_r that allows the right-hand matrix + B to be triangular, and switched to the implementation that simply + transposes the operation (and thus the storage of C) in order to recast + the operation as trsm_l. This avoids the need to use trsm_rl and trsm_ru + macrokernels, which require an awkward swapping of MR and NR. For now, + the support for trsm_r macrokernels, via separate control trees, remains. 
+ - Modified bli_config_macro_defs.h so that BLIS_RELAX_MCNR_NCMR_CONSTRAINTS + is defined by default. This is mostly a safety precaution in case someone + tries to switch back to the previous trsm_r implementation, but also + serves as a convenience on some systems where one does not naturally + choose blocksizes in a way that satisfies MC % NR = 0 and NC % MR = 0. + +commit b3e58ee30307cf1e11529f2113acb9abbeda25af +Author: Field G. Van Zee +Date: Wed Nov 23 17:58:26 2016 -0600 + + Reimplemented 4x12 haswell ukernels (real only). + + Details: + - Replaced permutation-based implementations in bli_gemm_asm_d4x12.c, which + defines 4x24 single real and 4x12 double real gemm microkernels, with + broadcast-based implementations. (The previous microkernel file has been + moved to an 'old' subdirectory.) + +commit bdc0a264d2fb5940bfd09298b1de823674a39053 +Author: Field G. Van Zee +Date: Wed Nov 16 14:13:08 2016 -0600 + + Adjusted stride selection of ct in macrokernels. + + Details: + - Updated the changes introduced in 618f433 so that the strides of the + temporary microtile ct used in the macrokernels is determined based + on the storage preference of the microkernel (via the new functions + below), rather than the strides of c. In almost all cases, presently, + this change results in no net effect, as a high-level optimization + in the _front() functions aligns the storage of c to that of the + microkernel's preference. However, I encountered some cases where + this is not always the case in some development code that has yet + to be committed, and therefore I'm generalizing the framework code + in advance. + - Defined two new functions in bli_cntx.c: + bli_cntx_l3_ukr_prefers_rows_dt() + bli_cntx_l3_ukr_prefers_cols_dt() + which return bool_t's based on the current micro-kernel's storage + preferences. For induced methods, the preference of the underlying + real domain microkernel is returned. 
+ - Updated definition of bli_cntx_l3_ukr_dislikes_storage_of(), and + by proxy bli_cntx_l3_ukr_prefers_storage_of(), to be in terms of + the above functions, rather than querying the preferences of the + native microkernel directly (which did the wrong thing for induced + methods). + +commit 031978d2647cf08316858baf29c84ebba9c3133e +Author: Field G. Van Zee +Date: Wed Nov 16 14:04:33 2016 -0600 + + Fixed inactive trsm_r blocksize constraint code. + + Details: + - Changed a cpp macro that was meant to prevent using certain trsm_r code + if BLIS_RELAX_MCNR_NCMR_CONSTRAINTS was defined. It was actually coded + incorrectly at first. I've now fixed its location and changed its + consequence to a compile-time #error message. + +commit 6b5a4032d2e3ed29a272c7f738b7e3ed6657e556 +Merge: 3b524a08 a8220e3a +Author: Field G. Van Zee +Date: Thu Nov 10 15:28:24 2016 -0600 + + Merge pull request #109 from devinamatthews/omp_num_threads + + Add automatic loop thread assignment. + +commit a8220e3a86433b5d76789e32ea7ca014a11b6d17 +Author: Devin Matthews +Date: Thu Nov 10 14:19:34 2016 -0600 + + - Fix typo in bli_cntx.c + - Bump BLIS_DEFAULT_NR_THREAD_MAX to 4 + +commit c05b3862f6241486442b313eff0c8bee7b5e1274 +Author: Devin Matthews +Date: Fri Nov 4 15:48:02 2016 -0500 + + Add automatic loop thread assignment. + + - Number of threads is determined by BLIS_NUM_THREADS or OMP_NUM_THREADS, but can be overridden by BLIS_XX_NT as before. + - Threads are assigned to loops (ic, jc, ir, and jr) automatically by weighted partitioning and heuristics, both of which are tunable via bli_kernel.h. + - All level-3 BLAS covered. + +commit 3b524a08e3fb8380e7b8b2ba835312c51a331570 +Author: Field G. Van Zee +Date: Wed Nov 2 17:45:18 2016 -0500 + + Consolidated 3m1/4m1 gemmtrsm, trsm ukernel code. + + Details: + - Consolidated the macros that define the lower and upper versions of the + gemmtrsm microkernels into a single macro that is instantiated twice. + Did this for both 3m1 and 4m1 microkernels. 
+ - Consolidated lower and upper versions of the trsm microkernels for 3m1 + and 4m1 into single files (each). + +commit ead231aca635deb3db270f118454e4222c627f31 +Merge: d25e6f8b 62987f60 +Author: Field G. Van Zee +Date: Wed Nov 2 13:03:50 2016 -0500 + + Merge pull request #108 from devinamatthews/patch-2 + + Update .travis.yml with additional tests + +commit 62987f60a6a6ff0a75b31d0404f493593ce35ccc +Author: Devin Matthews +Date: Wed Nov 2 11:20:37 2016 -0500 + + Allow KNL to fail + +commit 8f9010542c751ae3cbfe6121cb011d8985c1e00d +Author: Devin Matthews +Date: Wed Nov 2 11:18:32 2016 -0500 + + Fix some problems with OSX builds: + + - Update CPU detection for Intel archs (esp. Skylake) + - Allow clang for the reference config + +commit d25e6f8b63c57f30b8a67dffbf4995977cf9f235 +Author: Field G. Van Zee +Date: Tue Nov 1 14:35:15 2016 -0500 + + Can disable trsm_r-specific blocksize constraints. + + Details: + - Added cpp guards around the constraints in bli_kernel_macro_defs.h + that enforce MC % NR = 0 and NC % MR = 0. These constraints are ONLY + needed when handling right-side trsm by allowing the matrix on the + right (matrix B) to be triangular, because it involves swapping + register, but not cache, blocksizes (packing A by NR and B by MR) + and then swapping the operands to gemmtrsm just before that kernel + is called. It may be useful to disable these constraints if, for + example, the developer wishes to test the configuration with + a different set of cache blocksizes where only MC % MR = 0 and + NC % NR = 0 are enforced. + - In summary, #defining BLIS_RELAX_MCNR_NCMR_CONSTRAINTS will bypass + the enforcement of MC % NR = 0 and NC % MR = 0. + +commit 1a67e3688edb073a9d44c160e7b0798e08796b8a +Author: Devin Matthews +Date: Tue Nov 1 13:53:18 2016 -0500 + + Bogus commit + + Need to trigger another Travis build. 
+ +commit 2cd82d67b372cad1bed50cfd99e524f1f40b4e24 +Author: Devin Matthews +Date: Tue Nov 1 13:25:50 2016 -0500 + + Some fixes for .travis.yml + + - Switch to gcc-5 to support knl + - Don't run tests in parallel -- it is super slow. + - Use clang on OSX since gcc is only a zombie husk. + +commit a3db4e6bdfe745083acf704ab0f51f74ea869538 +Author: Devin Matthews +Date: Tue Nov 1 10:33:18 2016 -0500 + + Update .travis.yml with additional tests + + - Test knl configuration (without running of course). + - Test openmp and pthreads threading for auto configuration with 4 threads. + - Test auto configuration with and without pthreads on OSX. + - Also, run make in parallel. + + I don't know how the `addons:` section works on OSX; hopefully it is just ignored. + +commit 8a11a2174a1a5b9426f13bbc5338dc86ab138cdd +Author: Field G. Van Zee +Date: Mon Oct 31 19:07:55 2016 -0500 + + Updates to non-default haswell microkernels. + + Details: + - Updated s and d microkernels in bli_gemm_asm_d8x6.c to relax alignment + constraints. + - Added missing c and z microkernels, which are based on the corresponding + kernels in the d6x8 set. + - This completes the d8x6 set (which may be used for situations when it + is desirable to have a microkernel with a column preference). + +commit 618f4331eba209803ecab99747872eceb1b5f091 +Author: Field G. Van Zee +Date: Mon Oct 31 14:40:51 2016 -0500 + + Align strides of ct in macrokernels to that of c. + + Details: + - Previously, rs_ct and cs_ct, the strides of the temporary microtile used + primarily in the macrokernels' edge case handling, were unconditionally + set to 1 and MR, respectively. However, Devin Matthews noted that this + ought to be changed so that the strides of ct were in agreement with the + strides of C. (That is, if C was row-stored, then ct should be accessed + as by rows as well.) 
The implicit assumption is that the strides of C + have already been adjusted, via induced transposition, if the storage + preference of the microkernel is at odds with the storage of C. So, if + the microkernel prefers row storage, the macrokernel's interior cases + would present row-stored (ideal) microkernel subproblems to the + microkernel, but for edge cases, it would still see column-stored + subproblems (not ideal). This commit fixes this issue. Thanks to Devin + for his suggestion. + +commit 630391002325a589063aec2ab0a7d89ef2e178c0 +Merge: 956b3edf 216206c1 +Author: Field G. Van Zee +Date: Tue Oct 25 19:34:51 2016 -0500 + + Merge pull request #105 from devinamatthews/knl + + Support for Intel Knight's Landing. + +commit 216206c1d328a865c2192e35a4df6e9aff79a85b +Author: Devin Matthews +Date: Tue Oct 25 13:56:18 2016 -0500 + + Fix up for merge to master. + +commit 11eb7957abbcdf02d5e312898e094260eadb1209 +Merge: cd5b6681 956b3edf +Author: Devin Matthews +Date: Tue Oct 25 13:51:07 2016 -0500 + + Merge branch 'master' into knl + + # Conflicts: + # frame/thread/bli_thread.h + +commit cd5b6681838899283cd94e5427dfda206e7fbabe +Author: Devin Matthews +Date: Tue Oct 25 13:49:27 2016 -0500 + + Don't use %rbp in KNL packing kernels. + +commit 956b3edf8eb09480f31f2e861c1b10f9ecbb2e52 +Merge: b7e41d71 0662a3c1 +Author: Field G. Van Zee +Date: Tue Oct 25 13:02:57 2016 -0500 + + Merge pull request #104 from devinamatthews/misspellings + + Add flexible options for thread model (pthread/posix for pthreads etc.). + +commit 0662a3c1b1f4644a86bf8e5073d1391808c91b4a +Author: Devin Matthews +Date: Tue Oct 25 12:42:44 2016 -0500 + + Add flexible options for thread model (pthread/posix for pthreads etc.). + +commit b7e41d71b07d2af6d22d632c70e0c5f7ce46852c +Merge: 4bd905bd 5117d444 +Author: Field G. Van Zee +Date: Mon Oct 24 16:47:46 2016 -0500 + + Merge pull request #103 from devinamatthews/patch-1 + + Change .align to .p2align in Bulldozer ukernels. 
+ +commit 5117d444f7f3a2bc327f067926eaf2398212edda +Author: Devin Matthews +Date: Mon Oct 24 16:20:47 2016 -0500 + + Change .align to .p2align in Bulldozer ukernels + + Apparently OSX doesn't allow .align directives for >16B, so I've changed these to their .p2align counterparts. + +commit 4bd905bd4597e0ad7bedf31e25e779d3e2dfda29 +Merge: 936d5fdc 7f32dd57 +Author: Field G. Van Zee +Date: Fri Oct 21 14:48:44 2016 -0500 + + Merge pull request #93 from ShadenSmith/config_check + + Adds sanity check to configuration choice. + +commit 936d5fdc26c6c4dab199a8d11fde948975cfa1d6 +Author: Field G. Van Zee +Date: Fri Oct 21 14:34:27 2016 -0500 + + Fixed multithreading compilation bug in 970745a. + + Details: + - Moved the definition of the cpp macro BLIS_ENABLE_MULTITHREADING + from bli_thread.h to bli_config_macro_defs.h. Also moved the + sanity check that OpenMP and POSIX threads are not both enabled. + - Thanks to Krzysztof Drewniak for reporting this bug. + +commit 8feb0f85a674e84bec2417486e3bcea584b14c04 +Author: Field G. Van Zee +Date: Wed Oct 19 16:05:41 2016 -0500 + + Removed auto-prototyping of malloc()/free() substitutes. + + Details: + - Removed the header file, bli_malloc_prototypes.h, which automatically + generated prototypes for the functions specified by the following + cpp macros: + BLIS_MALLOC_INTL + BLIS_FREE_INTL + BLIS_MALLOC_POOL + BLIS_FREE_POOL + BLIS_MALLOC_USER + BLIS_FREE_USER + These prototypes were originally provided primarily as a convenience + to those developers who specified their own malloc()/free() substitutes + for one or more of the following. However, we generated these prototypes + regardless, even when the default values (malloc and free) of the + macros above were used. A problem arose under certain circumstances + (e.g., gcc in C++ mode on Linux with glibc) when including blis.h that + stemmed from the "throw" specification which was added to the glibc's + malloc() prototype, resulting in a prototype mismatch. 
Therefore, going + forward, developers who specify their own custom malloc()/free() + substitutes must also prototype those substitutes via bli_kernel.h. + Thanks to Krzysztof Drewniak for reporting this bug, and Devin Matthews + for researching the nature and potential solutions. + +commit 970745a5fc7c29de3e202988e5eb104fabca4fdc +Author: Field G. Van Zee +Date: Wed Oct 19 15:58:03 2016 -0500 + + Reorganized typedefs to avoid compiler warnings. + + Details: + - Relocated membrk_t definition from bli_membrk.h to bli_type_defs.h. + - Moved #include of bli_malloc.h from blis.h to bli_type_defs.h. + - Removed standalone mtx_t and mutex_t typedefs in bli_type_defs.h. + - Moved #include of bli_mutex.h from bli_thread.h to bli_typedefs.h. + - The redundant typedefs of membrk_t and mtx_t caused a warning on some C + compilers. Thanks to Tyler Smith for reporting this issue. + +commit 28b2af8a71133ce68774e153b6e05afb05affba8 +Author: Field G. Van Zee +Date: Thu Oct 13 14:50:08 2016 -0500 + + Added disabled code to print thrinfo_t structures. + + Details: + - Added cpp-guarded code to bli_thrcomm_openmp.c that allows a curious + developer to print the contents of the thrinfo_t structures of each + thread, for verification purposes or just to study the way thread + information and communicators are used in BLIS. + - Enabled some previously-disabled code in bli_l3_thrinfo.c for freeing + an array of thrinfo_t* values that is used in the new, cpp-guarded code + mentioned above. + - Removed some old commented lines from bli_gemm_front.c. + +commit 11eed3f683d09e65f721567b346b0f733bff9a64 +Author: Field G. Van Zee +Date: Thu Oct 13 14:23:23 2016 -0500 + + Fixed a configure -t omp/openmp bug from fd04869. + + Details: + - Forgot to update certain occurrences of "omp" in common.mk during + commit fd04869, which changed the preferred configure option string + for enabling OpenMP from "omp" to "openmp". + +commit 9cda6057eaa16a24ac8785a9fa167df6c9edba44 +Author: Field G. 
Van Zee +Date: Tue Oct 11 13:21:26 2016 -0500 + + Removed previously renamed/old files. + + Details: + - Removed frame/base/bli_mem.c and frame/include/bli_auxinfo_macro_defs.h, + both of which were renamed/removed in 701b9aa. For some reason, these + files survived when the compose branch was merged back into master. + (Clearly, git's merging algorithm is not perfect.) + - Removed frame/base/bli_mem.c.prev (an artifact of the long-ago changed + memory allocator that I was keeping around for no particular reason). + +commit 22377abd84b9e560ffe1c4e4d284eb443ddb7133 +Author: Field G. Van Zee +Date: Mon Oct 10 13:43:56 2016 -0500 + + Fixed bli_gemm() segfault on empty C matrices. + + Details: + - Fixed a bug that would manifest in the form of a segmentation fault + in bli_cntl_free() when calling any level-3 operation on an empty + output matrix (ie: m = n = 0). Specifically, the code previously + assumed that the entire control tree was built prior to it being + freed. However, if the level-3 operation performs an early exit, the + control tree will be incomplete, and this scenario is now handled. + Thanks to Elmar Peise for reporting this bug. + +commit 0b571cd94d9b175331c9453258a6b1389a718ae8 +Author: Field G. Van Zee +Date: Thu Oct 6 14:48:15 2016 -0500 + + Fixed segfault in bli_free_align() for NULL ptrs. + + Details: + - Fixed a bug in bli_free_align() caused by failing to handle NULL pointers + up-front, which led to performing pointer arithmetic on NULL pointers in + order to free the address immediately before the pointer. Thanks to Devin + Matthews for reporting this bug. + +commit 4fb9b4ef2e4cf2626a6e000a41628fb823f16da8 +Author: Field G. Van Zee +Date: Wed Oct 5 14:41:35 2016 -0500 + + CHANGELOG update (0.2.1) + +commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (tag: 0.2.1) Author: Field G. 
Van Zee Date: Wed Oct 5 14:41:34 2016 -0500 Version file update (0.2.1) -commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 (origin/master) -Merge: 8696987 6f71cd3 +commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 +Merge: 86969873 6f71cd34 Author: Field G. Van Zee Date: Wed Oct 5 13:35:01 2016 -0500 Merge branch 'compose' -commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose) -Merge: c0630c4 8d55033 +commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose, compose) +Merge: c0630c40 8d55033c Author: Field G. Van Zee Date: Tue Oct 4 15:53:46 2016 -0500 @@ -92,14 +780,20 @@ Date: Tue Sep 27 14:14:11 2016 -0500 should be considered deprecated. commit 9424af87209e4e435e2e742430945152690170b0 -Merge: efa7341 c0630c4 +Merge: efa7341d c0630c40 Author: Field G. Van Zee Date: Tue Sep 27 12:51:08 2016 -0500 Merge branch 'compose' +commit 7f32dd57c6bd41c0704341752842277dd6a4c8eb +Author: Shaden Smith +Date: Sat Sep 17 11:33:57 2016 -0500 + + Adds sanity check to configuration choice. + commit efa7341df0b0115926aa8a6e8a4ebfb24fdbf11e -Merge: 121c39d e1453f6 +Merge: 121c39d4 e1453f68 Author: Field G. Van Zee Date: Fri Sep 16 11:01:57 2016 -0500 @@ -113,7 +807,7 @@ Date: Fri Sep 16 09:29:28 2016 -0500 Fixes broken URL in README.md -commit c0630c4024b08750043a2942a3e8a037aa6b6259 (compose) +commit c0630c4024b08750043a2942a3e8a037aa6b6259 Author: Field G. Van Zee Date: Mon Sep 12 13:59:02 2016 -0500 @@ -125,7 +819,7 @@ Date: Mon Sep 12 13:59:02 2016 -0500 - Minor changes to frame/thread/bli_thrinfo.h. commit 7b3bf1ffcd7160ccbf6c2518af6d88f6742e4977 -Merge: 3550981 121c39d +Merge: 35509818 121c39d4 Author: Field G. Van Zee Date: Tue Sep 6 15:47:13 2016 -0500 @@ -287,7 +981,7 @@ Date: Fri Aug 26 19:04:45 2016 -0500 implementations can slow down the testsuite considerably. commit 73517f522b69de429dd7f3df60a70c068149ab28 -Merge: c6f5c21 50293da +Merge: c6f5c215 50293da3 Author: Field G. 
Van Zee Date: Tue Aug 23 13:46:59 2016 -0500 @@ -315,7 +1009,7 @@ Date: Tue Aug 23 13:38:36 2016 -0500 which requires "0" or "1". commit c6f5c215ee793d03ea834469fc2adc53feaffc42 -Merge: d52cb76 16a4c7a +Merge: d52cb767 16a4c7a8 Author: Field G. Van Zee Date: Mon Aug 22 17:33:02 2016 -0500 @@ -333,8 +1027,48 @@ Date: Fri Aug 19 11:38:36 2016 -0500 to type mismatch, and in the case of pthreads, a missing function argument. The bugs are fairly recent, introduced in a017062. +commit c8e4ef93953ba2b79fb7e0973c08469c0e28a2cd +Author: Devin Matthews +Date: Wed Aug 3 16:13:03 2016 -0500 + + Add prefetchw to 30x8 kernel. + +commit 4b5a2f3d6e7ffeb5cc2be8448554f5c2083ad68f +Merge: 380736bf 9f52a587 +Author: Devin Matthews +Date: Wed Aug 3 16:09:51 2016 -0500 + + Merge remote-tracking branch 'origin/knl' into knl + + # Conflicts: + # kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c + +commit 380736bfe955efbdd7274c90b6fd635688e83bc4 +Author: Devin Matthews +Date: Wed Aug 3 16:08:28 2016 -0500 + + Add (new) 30x8 KNL kernel and fix non-scatter prefetch bug. + +commit 9f52a587dee855daa73c194e41b6951416544e9a +Author: Devin Matthews +Date: Wed Aug 3 16:03:53 2016 -0500 + + Try prefetchw[t1] instead of regular prefetch for C. + +commit 8945a1512d366bc6a8a85718d12cbf5de6f2898b +Author: Devin Matthews +Date: Wed Aug 3 11:28:24 2016 -0500 + + This version gets ~1550 GFLOPs on KNL with 16x4. + +commit 6ce4c022ebdea00c2b951090e3c2e9e88735b9ce +Author: Devin Matthews +Date: Wed Jul 27 16:26:36 2016 -0500 + + Switch back to 24x8. I could only squeeze 24.5GFLOP out of 8x24, and scalability is not improved. + commit d52cb7671509592a8078729477b40b60380518a2 -Merge: 95abea4 c31b1e7 +Merge: 95abea46 c31b1e7b Author: Field G. Van Zee Date: Wed Jul 27 16:04:55 2016 -0500 @@ -357,8 +1091,87 @@ Date: Wed Jul 27 15:58:07 2016 -0500 - Inserted #include "float.h" into bli_system.h (to gain access to DBL_MAX). - Minor update (vis-a-vis contexts) to driver code in test/3m4m. 
+commit b8f2b55532849d45d379afbdd05a52ff6100800d +Author: Devin Matthews +Date: Wed Jul 27 15:22:55 2016 -0500 + + Try an 8x24 kernel for the hell of it. + +commit 7ede5863ae3567f7c0852efc2d5cd649ca19e0f3 +Author: Devin Matthews +Date: Wed Jul 27 13:41:27 2016 -0600 + + Allocate pack buffer on MCDRAM for KNL. + +commit ad89ed2e829c7b261d8ba0998a3cb83ad576ee04 +Merge: 2c9de740 81e2b05f +Author: Devin Matthews +Date: Wed Jul 27 11:45:40 2016 -0500 + + Merge branch 'knl' of github.com:devinamatthews/blis into knl + +commit 2c9de740edb66c4692c200731763bbd1d3171ccb +Author: Devin Matthews +Date: Wed Jul 27 11:44:54 2016 -0500 + + This version gets ~26GF on one core. + +commit 81e2b05f31bca4e1e1676e7b533d1868d9f9be33 +Author: Devin Matthews +Date: Wed Jul 27 11:39:05 2016 -0500 + + Add optimized packing kernels for KNL. + +commit a7d8ca97b8d835c32d90ff20a565c82733f014a8 +Author: Devin Matthews +Date: Mon Jul 25 15:15:13 2016 -0500 + + All fixed. + +commit 963d0393b023f4134bb0c682923faf9964c0e645 +Author: Devin Matthews +Date: Mon Jul 25 14:40:53 2016 -0500 + + Add 24xk pack kernel. + +commit 117b76739afba481768897d2580f8365d3345417 +Author: Devin Matthews +Date: Mon Jul 25 13:53:07 2016 -0500 + + In the midst of debugging. + +commit 8c0a4fd1d3535d608a9a309a61ffee0a73c3646f +Author: Devin Matthews +Date: Mon Jul 25 13:09:24 2016 -0500 + + Fix some row/column confusion. + +commit c44f9f96930312125b15e64c326ab5ab5cc02633 +Author: Devin Matthews +Date: Mon Jul 25 12:02:24 2016 -0500 + + Simplify displacements -- clang assembler was badly botching EVEX compressed displacements giving false alarms for instruction length. + +commit e0cce177cc1b47ec9f11ac0556241feaa3564df1 +Author: Devin Matthews +Date: Mon Jul 25 10:02:25 2016 -0500 + + Minor fixes for 8x24 KNL kernel. + +commit 65735bbedf75784c48bd11e05b3fdc98fc66b4bc +Author: Devin Matthews +Date: Sun Jul 24 21:50:32 2016 -0500 + + Switch to 24x8 kernel, unrolled by 16. 
+ +commit 45d5dc97177117220bd9dd0abf85aafc185acad1 +Author: Devin Matthews +Date: Sun Jul 24 14:25:26 2016 -0500 + + Add 24x8 "KNC-style" kernel for KNL. + commit 95abea46f86816fddfc9ff0abfa52880801461be -Merge: d0dfe5b a017062 +Merge: d0dfe5b5 a017062f Author: Field G. Van Zee Date: Sat Jul 23 15:38:33 2016 -0500 @@ -396,8 +1209,39 @@ Date: Fri Jul 22 17:02:59 2016 -0500 single-threaded execution. This new API is employed within functions such as bli_membrk_acquire_[mv]() and bli_membrk_release(). +commit 8ff2e069c48c12fd06b9c48c6b3aeb4ea9b0e6e1 +Author: Devin Matthews +Date: Fri Jul 22 16:22:26 2016 -0500 + + Add 4x unrolled variant for KNL microkernel. + +commit 9cb2ed9b0c25f31a22c1c9719b062fa665ad7adf +Author: Devin Matthews +Date: Fri Jul 22 16:10:30 2016 -0500 + + Git rid of one RBX update. + +commit 451bde076f0320d60cd2475cfb048ac4a2b798bb +Author: Devin Matthews +Date: Fri Jul 22 15:43:00 2016 -0500 + + Add some more knobs to twiddle for KNL microkernel. + +commit 8c6e621c099521e7a4d87e007bb8224faa5f33a3 +Author: Devin Matthews +Date: Fri Jul 22 15:05:15 2016 -0500 + + Make knl conform to new kernel dir structure. + +commit ce7214c6618d6f22f4ce2ee452336236916d1f30 +Merge: 119d0399 ce59f811 +Author: Devin Matthews +Date: Fri Jul 22 14:59:53 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + commit ce59f81108ec9aea918a7e77030da8acfdd397ce -Merge: ff41153 707a2b7 +Merge: ff41153f 707a2b7f Author: Field G. Van Zee Date: Fri Jul 22 14:48:14 2016 -0500 @@ -412,7 +1256,7 @@ Date: Fri Jul 22 13:49:44 2016 -0500 Somehow forgot the most important microkernel. commit 47ec045056351ac4f0791c071fa0daaa81699c8c -Merge: 08f1d6b ff41153 +Merge: 08f1d6b6 ff41153f Author: Devin Matthews Date: Fri Jul 22 13:45:23 2016 -0500 @@ -425,7 +1269,7 @@ Date: Fri Jul 22 13:44:37 2016 -0500 Use 64-bit intermediate variable for k for architectures that do 64-bit loads in case dim_t is 32-bit. 
commit ff41153f4eb7f38ed94bdd9a3fd81fb979f3f401 -Merge: f9214ce e0d2fa0 +Merge: f9214ced e0d2fa0d Author: Field G. Van Zee Date: Fri Jul 22 13:21:03 2016 -0500 @@ -440,7 +1284,7 @@ Date: Fri Jul 22 12:56:51 2016 -0500 Relax alignment restrictions for haswell sgemm. commit f9214ced97392861f5a0ea72abfcf6f41faf674c -Merge: 413d62a 08666ea +Merge: 413d62ac 08666eaa Author: Field G. Van Zee Date: Fri Jul 22 12:16:39 2016 -0500 @@ -460,8 +1304,26 @@ Date: Fri Jul 22 11:07:34 2016 -0500 Change -openmp to -fopenmp for icc. +commit 119d0399428905053265f3aca1cc8cc1fde3b363 +Author: Devin Matthews +Date: Fri Jul 22 10:23:31 2016 -0500 + + Add 8x24 KNL kernel. + +commit b58cda9eba0c1e175460aae109baf792d29ba5bf +Merge: 318f063d 413d62ac +Author: Devin Matthews +Date: Tue Jul 19 14:09:09 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + + # Conflicts: + # frame/base/bli_threading.h + # frame/include/blis.h + # frame/thread/bli_thread.c + commit d0dfe5b5372cc7558ee9c4104b29f82eecc7ed61 -Merge: 31def12 413d62a +Merge: 31def12e 413d62ac Author: Field G. Van Zee Date: Thu Jul 14 11:01:06 2016 -0500 @@ -559,6 +1421,12 @@ Date: Fri Jun 17 14:08:35 2016 -0500 but possible divide-by-zero. - Updated function signature and prototype formatting in testsuite. +commit 318f063dcbd8b594969e401bc99146d24b01066a +Author: Devin Matthews +Date: Wed Jun 8 17:46:50 2016 -0500 + + Add new KNL microkernel derived from Haswell. + commit 096895c5d538a7f8817603d7cf28c52e99340def Author: Field G. Van Zee Date: Mon Jun 6 13:32:04 2016 -0500 @@ -592,7 +1460,7 @@ Date: Mon Jun 6 13:32:04 2016 -0500 in the wrong order, which was recently fixed. commit 232530e88ff99f37abcae5b6fb5319a9a375a45f -Merge: 4bcabd1 eef37f8 +Merge: 4bcabd1b eef37f8b Author: Tyler Michael Smith Date: Wed Jun 1 15:14:10 2016 -0500 @@ -700,6 +1568,18 @@ Date: Tue May 17 15:20:16 2016 -0500 store the unrolled 30xk kernel in the array for use (on knc, for example). Note: This should have been done a long time ago. 
+commit e3bd5ca64ae7c190ba689396c0de687b829a11fe +Author: Devin Matthews +Date: Thu May 12 20:54:13 2016 -0500 + + Fix SIMD definitions in KNL config, and a couple of fixes to C update. + +commit 4fe02e3d497995d94d34d3fcf5af895084cfc8b9 +Author: Devin Matthews +Date: Thu May 12 20:53:58 2016 -0500 + + Move bli_kernel.h before bli_threading.h in order of inclusion in blis.h. + commit 4bcf1b35abea3f3dfc8f2fe462dcf155cf199e55 Author: Field G. Van Zee Date: Wed May 11 16:09:49 2016 -0500 @@ -727,7 +1607,7 @@ Date: Wed May 11 16:02:30 2016 -0500 #includes an "f2c.h" header. commit a09a2e23eacf5328858c8318bb637c5ff3b71d08 -Merge: 4dcd37e 7c604e1 +Merge: 4dcd37eb 7c604e1c Author: Tyler Michael Smith Date: Wed May 11 10:47:11 2016 -0500 @@ -741,14 +1621,28 @@ Date: Tue May 10 16:28:59 2016 -0500 fixing knc simd align size +commit 619dee0daec3474b4e5a55df90a61aabcae194f2 +Merge: b790b3d9 7c604e1c +Author: Devin Matthews +Date: Tue May 10 12:13:24 2016 -0500 + + Merge branch 'move_simd_defs' into knl + commit 7c604e1cbc1609b6e12d3ee973c08b7af5035be4 Author: Devin Matthews Date: Tue May 10 12:11:55 2016 -0500 Move default SIMD-related definitions to bli_kernel_macro_defs.h. Otherwise, configurations which customize these fail as these are now defined in bli_kernel.h. +commit b790b3d9e1820f3b691676de48c291cae083452d +Merge: 4f8c05c9 a7be2d28 +Author: Devin Matthews +Date: Tue May 10 11:49:47 2016 -0500 + + Merge branch 'master' into knl + commit a7be2d28e8930b154d0da1d6929b54a96e210af6 -Merge: 97b512e 4b1e55e +Merge: 97b512ef 4b1e55ed Author: Field G. Van Zee Date: Tue May 10 11:48:51 2016 -0500 @@ -840,7 +1734,7 @@ Date: Wed Apr 27 14:13:46 2016 -0500 bdbda6e, to tabs. commit 4ea419c72c789825e1f93a1eee88219bbf873930 -Merge: f1e9be2 bdbda6e +Merge: f1e9be2a bdbda6e6 Author: Field G. Van Zee Date: Tue Apr 26 12:50:45 2016 -0500 @@ -870,7 +1764,7 @@ Date: Fri Apr 22 15:34:02 2016 -0500 in my local working copy for longer than I can remember. 
commit aa0bceec277938328dabeb744680623f24fb0b61 -Merge: 4136553 e2784b4 +Merge: 4136553f e2784b4c Author: Field G. Van Zee Date: Fri Apr 22 12:01:31 2016 -0500 @@ -890,8 +1784,14 @@ Date: Fri Apr 22 11:53:53 2016 -0500 - Changed the definition of bli_cntx_obj_clear() so that the clearing occurs via a single call to memset(). +commit 4f8c05c9e2ef4cbb82b35a3ebf1f0a0ac665830e +Author: Devin Matthews +Date: Thu Apr 21 10:00:59 2016 -0500 + + Rearrange KNL dgemm kernel again to streamline usage of ymm register. sgemm and dgemm now both working with Intel SDE. + commit e2784b4c921f706e756df3e146e20a4cb63f53e3 -Merge: dd0ab1d a9b6c3a +Merge: dd0ab1d9 a9b6c3ab Author: Field G. Van Zee Date: Wed Apr 20 18:34:09 2016 -0500 @@ -900,7 +1800,7 @@ Date: Wed Apr 20 18:34:09 2016 -0500 Change CBLAS integer type to f77_int commit a9b6c3abda6222a8b240361643932e83cf726c4f -Merge: e4c54c8 dd0ab1d +Merge: e4c54c81 dd0ab1d9 Author: Devin Matthews Date: Wed Apr 20 16:00:10 2016 -0500 @@ -927,8 +1827,14 @@ Date: Wed Apr 20 14:38:23 2016 -0500 added equivalent cpp query macros to bli_cntx.h. - Added 'bli_config.h' to .gitignore. +commit 7193230f7d35edbd1d2f77842a613971f1603463 +Author: Devin Matthews +Date: Wed Apr 20 09:37:30 2016 -0500 + + Work around missing VPMULLQ on KNL. + commit a30ccbc4c6a6e6460e78af6b5c530ee0d06f98fb -Merge: eb2f18e 0e1a982 +Merge: eb2f18e4 0e1a9821 Author: Field G. Van Zee Date: Tue Apr 19 15:04:33 2016 -0500 @@ -936,6 +1842,12 @@ Date: Tue Apr 19 15:04:33 2016 -0500 Add configure options and generate bli_config.h automatically. +commit bd44cf13e886069bc66c10ac0db178be96629a0d +Author: Devin Matthews +Date: Tue Apr 19 13:43:04 2016 -0500 + + Fix copy-paste errors in KNL kernels. + commit eb2f18e4844d985715df20798f50f9cc12e3b5ad Author: Field G. Van Zee Date: Tue Apr 19 12:50:32 2016 -0500 @@ -956,18 +1868,56 @@ Date: Tue Apr 19 11:44:37 2016 -0500 Lastly, support for OMP in clang has been added (closes #56). 
+commit a11eec05928ddc5c43fa5dbcd35f2edd24ff35a1 +Author: Devin Matthews +Date: Mon Apr 18 13:13:36 2016 -0500 + + Add sgemm ukernels for KNL. vpmullq is not implemented on KNL -- needs workaround. + commit ff84469a4575f1ef8a0010046fde52240a312cae Author: Field G. Van Zee Date: Mon Apr 18 12:29:09 2016 -0500 Applied various compilation fixes to bgq kernels. +commit c38e0dab05b2dc36672eab96e1248fb7fb2d785b +Merge: bd5e2296 cbcd0b73 +Author: Devin Matthews +Date: Mon Apr 18 10:21:35 2016 -0500 + + Merge remote-tracking branch 'origin/master' into knl + +commit bd5e2296e98e042c31f1e8ece2c1ca8e4bdc2d4c +Merge: 4745def0 49f85177 +Author: Devin Matthews +Date: Mon Apr 18 10:15:22 2016 -0500 + + Merge remote-tracking branch 'origin/knl' into knl + +commit 4745def0c87377ae83ad73ac514d7de08a96b2ac +Author: Devin Matthews +Date: Mon Apr 18 10:15:05 2016 -0500 + + Add 64-bit offset vector so we can use vgatherqpd. + +commit 49f85177f886f38889b60503a4e12fa7f04be1fd +Author: Devin Matthews +Date: Mon Apr 18 10:14:11 2016 -0500 + + KNL ukernel compiles with gcc. + commit cbcd0b739dc54bd14fbb46aeda267c26725cd70f Author: Tyler Michael Smith Date: Mon Apr 18 03:12:57 2016 -0500 Changing ifdef for OSX pthread barriers +commit 58b2c3cf040134d1be913c585a3c6905629116c0 +Author: Devin Matthews +Date: Sat Apr 16 16:12:24 2016 -0500 + + Rewrite of KNL kernel in GNU extended asm syntax. + commit dd62080cea78f3a23616200d6640e52c102b2bb9 Author: Field G. Van Zee Date: Fri Apr 15 11:15:41 2016 -0500 @@ -984,7 +1934,7 @@ Date: Fri Apr 15 11:15:41 2016 -0500 website. commit d5a915dd8d7a6ead42a68772e4420eb3647e6f1a -Merge: 4320b72 4169467 +Merge: 4320b725 41694675 Author: Field G. Van Zee Date: Thu Apr 14 12:56:36 2016 -0500 @@ -1182,8 +2132,34 @@ Date: Mon Apr 11 17:21:28 2016 -0500 that this does not preclude supporting mixed types via the object APIs, where it produces absolutely zero API code bloat. 
+commit dd856c2cb75a2221a503a73dde27790c34b91570 +Author: Devin Matthews +Date: Mon Apr 11 10:39:18 2016 -0500 + + Translated MIC kernel to KNL and cleaned up a bit. Only real change is lack of swizzle modifiers for FMA instructions (used bcast from memory instead). + +commit 7f27431d3fffdda99c282ec412731d0a90cb32a7 +Author: Devin Matthews +Date: Fri Apr 8 10:04:39 2016 -0500 + + Copy mic kernel to knl for transliteration. + +commit f8f02f0334ac020021e15a415bcd33aeea01deb4 +Merge: 32c92d94 d1f8e5d9 +Author: Devin Matthews +Date: Wed Apr 6 11:37:05 2016 -0500 + + Merge branch 'master' into const_correctness + +commit 32c92d945c55708da0eb63be1771f8c5430e3910 +Merge: 62914ccb 20af937b +Author: Devin Matthews +Date: Wed Apr 6 11:36:02 2016 -0500 + + Merge branch 'master' into const_correctness + commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 -Merge: 20af937 c11d28e +Merge: 20af937b c11d28ee Author: Field G. Van Zee Date: Tue Apr 5 12:21:27 2016 -0500 @@ -1198,7 +2174,7 @@ Date: Sat Apr 2 21:15:48 2016 +0200 cgemm µkernel for bulldozer : bug correction for k%4 != 0 commit 20af937b57f82bb3acb09418d5c0206e1b24f2c7 -Merge: 36c3abb fc61a11 +Merge: 36c3abb0 fc61a114 Author: Field G. Van Zee Date: Thu Mar 31 14:37:30 2016 -0500 @@ -1219,7 +2195,7 @@ Date: Thu Mar 31 10:45:48 2016 -0500 Adjust paths in common.mk to support building from testsuite dir. commit 36c3abb05fecb02d4a9ab13b2b69d133adf34583 -Merge: 64b41fa 917ce75 +Merge: 64b41fa5 917ce754 Author: Field G. Van Zee Date: Thu Mar 31 10:26:17 2016 -0500 @@ -1245,8 +2221,15 @@ Date: Wed Mar 30 22:03:09 2016 +0200 cgemm & zgemm micro-kernels for FMA4 instruction set (bulldozer configuration), based on x86_64/avx micro-kernel +commit 62914ccbcdb3c594f065dcfa65bd7e7b95c79283 +Merge: bbf704bf 64b41fa5 +Author: Devin Matthews +Date: Tue Mar 29 15:24:25 2016 -0500 + + Merge branch 'master' into const_correctness + commit 64b41fa554dff44b2f9ad48901b67c63836407a8 -Merge: 1b09e34 0171ad5 +Merge: 1b09e343 0171ad58 Author: Field G. 
Van Zee Date: Tue Mar 29 15:19:41 2016 -0500 @@ -1267,7 +2250,7 @@ Date: Mon Mar 28 13:55:06 2016 -0500 Add icc and clang support for Intel architectures, fixes #47. 2bd036f fixes #49 BTW. commit 3090fff64cc87ff2519a09f38e6b8699cf3cba11 -Merge: 8624e36 4ca5d5b +Merge: 8624e365 4ca5d5b1 Author: Field G. Van Zee Date: Mon Mar 28 12:36:25 2016 -0500 @@ -1276,14 +2259,14 @@ Date: Mon Mar 28 12:36:25 2016 -0500 sgemm micro-kernel for FMA4 instruction set commit e6e566426ac3ded7ef87cd8ff9be98accfdc4acc -Merge: 469429e 8624e36 +Merge: 469429ec 8624e365 Author: Devin Matthews Date: Sat Mar 26 14:10:15 2016 -0500 Merge branch 'master' into more_config_opts commit 8624e36543160739d954c4dbcc5a5594458f3a12 -Merge: a315833 2bd036f +Merge: a315833f 2bd036f1 Author: Field G. Van Zee Date: Sat Mar 26 13:56:28 2016 -0500 @@ -1310,7 +2293,7 @@ Date: Fri Mar 25 17:22:58 2016 -0500 Add threading option to configure. commit ad43eab4c7899d56d8d7caa6e2d92bc0581ea5a5 -Merge: 9452bdb 2bd036f +Merge: 9452bdb3 2bd036f1 Author: Devin Matthews Date: Fri Mar 25 15:00:02 2016 -0500 @@ -1328,8 +2311,14 @@ Date: Fri Mar 25 12:16:49 2016 -0500 Fix configuration issue where instruction set flags are not specified for debug builds. +commit bbf704bf7501411964a63a68f1af541f612cf92d +Author: Devin Matthews +Date: Fri Mar 25 09:55:35 2016 -0500 + + Add missing const to bli_read_nway_from_env. + commit a315833f067944fb0bc14cf60f0c7dcb5dc897b6 -Merge: 1d1a426 af92773 +Merge: 1d1a426d af92773f Author: Field G. Van Zee Date: Thu Mar 24 12:30:21 2016 -0500 @@ -1343,8 +2332,20 @@ Date: Wed Mar 23 22:07:02 2016 +0100 Updated and improved ARMv8 micro-kernels. +commit a4d7729776d17d9bdf2341eacd70b9770b9ba8d2 +Author: Devin Matthews +Date: Mon Mar 21 09:55:21 2016 -0500 + + Set default value for debug_type variable. + +commit 0e2447fa55d8c5fa2b1fc4150073512495c5f9eb +Author: Devin Matthews +Date: Thu Mar 17 16:32:05 2016 -0500 + + Add const correctness to auxinfo_t struct (microkernels need update theoretically). 
+ commit 1d1a426d18ec03754021456862a1f4d1dfec1fbf -Merge: 5a978ff d226dfa +Merge: 5a978fff d226dfa0 Author: Field G. Van Zee Date: Mon Mar 7 15:17:53 2016 -0600 @@ -1364,7 +2365,7 @@ Date: Sat Mar 5 16:18:14 2016 -0600 4) Add make V=[0,1] option to control build verbosity. commit 5a978fffdb8f09a81c89541d541d4a6830cd70a4 -Merge: adb2b4e 63e2642 +Merge: adb2b4e0 63e26423 Author: Field G. Van Zee Date: Fri Mar 4 17:26:58 2016 -0600 @@ -1409,7 +2410,7 @@ Date: Mon Feb 29 21:53:12 2016 +0100 symbolic link for bulldozer configuration to kernels commit 2dc5c0ae038ed175fab85751803ada05734d1ba1 -Merge: f2809fc 3d0fae8 +Merge: f2809fc5 3d0fae81 Author: Field G. Van Zee Date: Mon Feb 29 12:22:51 2016 -0600 @@ -1418,7 +2419,7 @@ Date: Mon Feb 29 12:22:51 2016 -0600 Add symlink from config/bulldozer/kernels to kernels/x86_64/bulldozer commit f2809fc5f74466c755da6a5b4632853e634060b5 -Merge: f86b94f 8624a33 +Merge: f86b94f2 8624a33c Author: Field G. Van Zee Date: Sat Feb 27 13:06:03 2016 -0600 @@ -1542,7 +2543,7 @@ Date: Tue Nov 3 10:30:08 2015 -0600 smart enough to perform this optimization automatically. commit 0694b722f7e4df00efb32639095a2aca80e67f52 -Merge: 3e116f0 33557ec +Merge: 3e116f0a 33557ecc Author: Field G. Van Zee Date: Mon Nov 2 17:24:25 2015 -0600 @@ -1621,7 +2622,7 @@ Date: Fri Oct 30 18:25:04 2015 -0500 micro-kernels, and trsm_ll macro-kernel. commit 46294d80e5a79c598e200e1c8ec2a642ff839971 -Merge: d3159c5 a0a7b85 +Merge: d3159c57 a0a7b85a Author: Field G. Van Zee Date: Tue Oct 27 12:41:23 2015 -0500 @@ -1636,7 +2637,7 @@ Date: Tue Oct 27 08:59:15 2015 +0000 Fixed incomplete code in the double precision ARMv8 microkernel. commit d3159c5740c9ee7f8c0b661003aab6f00646ad6f -Merge: b489152 7e03e45 +Merge: b489152e 7e03e45b Author: Field G. Van Zee Date: Wed Oct 21 14:54:00 2015 -0500 @@ -1649,7 +2650,7 @@ Date: Wed Oct 21 14:53:17 2015 -0500 Use vzeroall in haswell micro-kernels. 
commit 7e03e45bfe6c27c4fdbf06b1caa7f49e9a5fef49 -Merge: 77ddb0b 4f88c29 +Merge: 77ddb0b1 4f88c29f Author: Field G. Van Zee Date: Wed Oct 14 13:26:07 2015 -0500 @@ -1664,7 +2665,7 @@ Date: Wed Oct 14 12:57:50 2015 -0500 Detect Intel Broadwell (using Haswell config). commit 4b0ac1a9984a93f7ad4369b10fca63991107d9f5 -Merge: fe3e355 77ddb0b +Merge: fe3e355c 77ddb0b1 Author: Zhang Xianyi Date: Wed Oct 14 12:51:05 2015 -0500 @@ -1771,7 +2772,7 @@ Date: Thu Sep 24 12:14:03 2015 -0500 bli_obj_row_off(), bli_obj_col_off(). commit fe3e355c9c5a6f65b8736b009e2d501b62a83ea1 -Merge: efa641e 4dd9dd3 +Merge: efa641e3 4dd9dd3e Author: Zhang Xianyi Date: Fri Aug 21 14:38:36 2015 -0500 @@ -1817,7 +2818,7 @@ Date: Wed Jul 29 13:31:09 2015 -0500 Version file update (0.1.8) commit ef0fbbbdb6148b96938733fce72cb4ed7dad685e -Merge: fdfe14f d4b8913 +Merge: fdfe14f1 d4b89136 Author: Field G. Van Zee Date: Thu Jul 9 13:54:54 2015 -0500 @@ -2085,7 +3086,7 @@ Date: Fri Apr 3 16:44:32 2015 -0500 - Added ACML support to test/3m4m driver Makefile and runme.sh script. commit a32f7c49ca4ea869d2a6c66818780f4321743d67 -Merge: 349e075 4bfd1ce +Merge: 349e075a 4bfd1ce8 Author: Field G. Van Zee Date: Fri Apr 3 08:28:11 2015 -0500 @@ -2279,7 +3280,7 @@ Date: Fri Feb 20 15:24:27 2015 -0600 return blocksizes from one of the induced methods' blocksize objects. commit 411e637ee7d1083a84f58f08938d51e63d7c3c9a -Merge: c2569b8 fc0b771 +Merge: c2569b88 fc0b7712 Author: Tyler Michael Smith Date: Fri Feb 20 20:39:25 2015 -0600 @@ -2345,14 +3346,14 @@ Date: Thu Feb 19 14:27:09 2015 -0600 the sandybridge configuration. commit 493087d730f01d5169434f461644e5633f48a42f -Merge: 650d2a6 2502129 +Merge: 650d2a6f 25021299 Author: Field G. Van Zee Date: Wed Feb 18 09:45:51 2015 -0600 Merge branch 'master' of github.com:flame/blis commit 25021299b670775df8ca9c87910c63d7e74ed946 -Merge: fe2b8d3 f05a576 +Merge: fe2b8d39 f05a5763 Author: Field G. 
Van Zee Date: Wed Feb 11 20:03:21 2015 -0600 @@ -2487,7 +3488,7 @@ Date: Tue Dec 16 11:27:50 2014 -0600 Added 4m_1b to test/3m4m test driver and script. commit 785d480805fc0d6f4251b5499933515740b6b2a7 -Merge: 9456f33 4156c08 +Merge: 9456f330 4156c088 Author: Field G. Van Zee Date: Fri Dec 12 14:34:19 2014 -0600 @@ -2539,7 +3540,7 @@ Date: Tue Dec 9 16:03:14 2014 -0600 leading us to this bug. commit 689f60a578b461119e9ea90c74f642b9eb79addb -Merge: bef24e6 483e4d6 +Merge: bef24e67 483e4d6a Author: Field G. Van Zee Date: Sun Dec 7 14:03:30 2014 -0600 @@ -2565,7 +3566,7 @@ Date: Wed Nov 26 18:00:56 2014 -0600 Barriers were inserted to fix this. commit 76bde44411f0e34266bab9d666a54ef22be97320 -Merge: e56e614 f3d729e +Merge: e56e6143 f3d729e5 Author: Field G. Van Zee Date: Wed Nov 26 17:25:24 2014 -0600 @@ -2610,7 +3611,7 @@ Date: Fri Nov 21 12:28:08 2014 -0600 - Updated comments on alignment of a1 and b1 to match wiki. commit 994429c6881b2ade92d9d7949bcaebfbf2cc65eb -Merge: 58796ab 694029d +Merge: 58796abd 694029d9 Author: Field G. Van Zee Date: Thu Nov 20 13:55:35 2014 -0600 @@ -2857,7 +3858,7 @@ Date: Fri Oct 10 10:01:45 2014 -0500 - Updated sandybridge configuration accordingly. commit 23ce7ee542a12ca40b4b6090ad2558d180e16d37 -Merge: 99fd9a3 7a8ad47 +Merge: 99fd9a39 7a8ad47f Author: Field G. Van Zee Date: Thu Oct 9 16:41:22 2014 -0500 @@ -2918,7 +3919,7 @@ Date: Mon Sep 29 14:56:36 2014 -0500 Fixed bug when packing anywhere besides in blk_var_1 for gemm. commit 614a4afc9272adb47e5a8b83b39d56c2804d95d6 -Merge: b541b66 4a7df04 +Merge: b541b667 4a7df04e Author: Tyler Smith Date: Fri Sep 26 10:49:57 2014 -0500 @@ -3008,7 +4009,7 @@ Date: Wed Sep 17 11:10:07 2014 -0500 implementations. Thanks to Devin Matthews for reporting this bug. commit 870761eb902e4866090d1d3446a345df3d6d4599 -Merge: e9899be a2b59a3 +Merge: e9899be0 a2b59a37 Author: Field G. 
Van Zee Date: Tue Sep 16 18:20:49 2014 -0500 @@ -3304,7 +4305,7 @@ Date: Thu Aug 28 11:55:12 2014 -0500 we now pass in the pack schema itself. commit a0ff6066e06075ab5f92b19247b39b92ed15f1bf -Merge: c4c99c4 d40b32b +Merge: c4c99c48 d40b32bc Author: Field G. Van Zee Date: Sun Aug 24 15:56:21 2014 -0500 @@ -3325,7 +4326,7 @@ Date: Sun Aug 24 15:52:22 2014 -0500 level-2 or level-3 operation. commit d40b32bc24ffbae24123e054307b3138969bb095 -Merge: 9331f79 6c25c37 +Merge: 9331f794 6c25c379 Author: Field G. Van Zee Date: Sun Aug 24 13:46:36 2014 -0500 @@ -3343,7 +4344,7 @@ Date: Sun Aug 24 13:44:10 2014 -0500 ukernels in commit 4cc2b46. commit 9331f79443223fe267676ee54c439e1ed320380c -Merge: 7fc48a7 670b639 +Merge: 7fc48a7d 670b6392 Author: Field G. Van Zee Date: Sun Aug 24 10:54:21 2014 -0500 @@ -3427,7 +4428,7 @@ Date: Thu Aug 21 18:25:48 2014 -0500 those blocksizes at runtime. commit b541b667cabfa6d41b50ad1e49209651ee6812cc -Merge: 699a815 dd61307 +Merge: 699a8151 dd61307f Author: Tyler Smith Date: Wed Aug 20 14:44:51 2014 -0500 @@ -3654,7 +4655,7 @@ Date: Mon Aug 4 15:49:59 2014 -0500 - Updated blis.h to include necessary CBLAS-related headers. commit caab62dac0fb0bd0d674118f409c81680db94d29 -Merge: 383631b db97ce9 +Merge: 383631b5 db97ce97 Author: Field G. Van Zee Date: Sun Aug 3 14:36:18 2014 -0500 @@ -3779,7 +4780,7 @@ Date: Sun Jul 27 18:20:12 2014 -0500 Version file update (0.1.4) commit acff74041bf02c7b9fdfa24b507bca782a4c5fce -Merge: cdb9413 47b243e +Merge: cdb9413e 47b243ef Author: Tyler Smith Date: Wed Jul 23 15:07:30 2014 -0500 @@ -3807,7 +4808,7 @@ Date: Wed Jul 23 13:41:13 2014 -0500 - Comment update. commit 3e7b0db5b0e24f5fd66c60bacabc019885ddbec5 -Merge: 2f8a357 ed3e33d +Merge: 2f8a357d ed3e33d5 Author: Tyler Smith Date: Wed Jul 23 13:40:44 2014 -0500 @@ -3853,7 +4854,7 @@ Date: Tue Jul 22 14:36:02 2014 -0500 matrix real-valued. commit 8965a965931318619ceaebd7c32edccf3022d0c7 -Merge: 1785efb 5b73e80 +Merge: 1785efb5 5b73e80b Author: Field G. 
Van Zee Date: Tue Jul 22 14:34:32 2014 -0500 @@ -3870,7 +4871,7 @@ Date: Tue Jul 22 14:33:01 2014 -0500 - Changed setd front-end call of scald_check() to setd_check(). commit 5b73e80b71c054c1945a06aff044ef629bc1a9a0 -Merge: a41e68e 20690fe +Merge: a41e68e0 20690fe3 Author: Field G. Van Zee Date: Fri Jul 18 12:21:20 2014 -0500 @@ -3942,7 +4943,7 @@ Date: Mon Jul 14 16:05:03 2014 -0500 2012). commit fcec68cda3f6e90ae055e7304e6674c1c5c8d010 -Merge: 94c0df7 4a20ed1 +Merge: 94c0df79 4a20ed1a Author: Field G. Van Zee Date: Mon Jul 14 11:35:34 2014 -0500 @@ -3977,7 +4978,7 @@ Date: Sun Jul 13 22:50:56 2014 -0700 Emscripten port commit 4a20ed1a3f5e9e5232df30aa0e568e6c00c56ce1 -Merge: 6a515e9 8ccdfae +Merge: 6a515e98 8ccdfaef Author: Field G. Van Zee Date: Sun Jul 13 17:45:01 2014 -0500 @@ -4076,7 +5077,7 @@ Date: Tue Jul 8 10:25:27 2014 -0500 - Added *.so files to '.gitignore'. commit 6c65e9a58fe55990ebb99ec3986443e18af35338 -Merge: cb12e45 daca500 +Merge: cb12e456 daca500d Author: Field G. Van Zee Date: Tue Jul 8 10:13:49 2014 -0500 @@ -4095,7 +5096,7 @@ Date: Tue Jul 8 10:07:46 2014 -0500 uninitialized. Thanks to Tony Kelman for isolating this bug. commit daca500db5e2448ba0da8047b75eb0f88d9f40e3 -Merge: ab3bc91 4702350 +Merge: ab3bc915 47023502 Author: Tyler Smith Date: Thu Jul 3 12:52:52 2014 -0500 @@ -4200,7 +5201,7 @@ Date: Mon Jun 23 10:42:29 2014 -0500 Removed 'version' from .gitignore file. commit b40dcefc5ee31f67aa3990e2e9d2ef8ed1386a25 -Merge: 7101a8e b693b0c +Merge: 7101a8ee b693b0cd Author: Field G. Van Zee Date: Mon Jun 23 10:39:05 2014 -0500 @@ -4215,7 +5216,7 @@ Date: Sun Jun 22 13:44:25 2014 -0700 [SC]AXPY kernels for PNaCl commit 7101a8eec0327d6c3a7eb36eb4b0fd45c1c6d162 -Merge: ad48dca 020a831 +Merge: ad48dca2 020a831b Author: Field G. 
Van Zee Date: Thu Jun 19 21:46:50 2014 -0500 @@ -4278,7 +5279,7 @@ Date: Sun Jun 15 06:27:37 2014 -0400 SGEMM and DGEMM kernels for PNaCl commit ad48dca22913a363899f0bef45553898718eebb1 -Merge: ee2b679 7118f87 +Merge: ee2b6792 7118f87e Author: Field G. Van Zee Date: Sat Jun 14 15:10:13 2014 -0500 @@ -4327,7 +5328,7 @@ Date: Wed May 21 11:34:42 2014 -0500 reporting this bug. commit 77a2d8dac8b242d7a202c9aabda3927ab68cf987 -Merge: 8c5d607 21fb089 +Merge: 8c5d6071 21fb0893 Author: Field G. Van Zee Date: Tue May 20 09:53:19 2014 -0500 @@ -4395,7 +5396,7 @@ Date: Wed Apr 30 12:28:00 2014 -0500 Replaced register blocksize hack with querying the register blocksize for determining parallelism granularity commit f4fdfe8fc573553eb36795b79cdf681270dab71b -Merge: 31bb065 8c5d607 +Merge: 31bb065b 8c5d6071 Author: Tyler Smith Date: Wed Apr 30 11:46:35 2014 -0500 @@ -4435,7 +5436,7 @@ Date: Mon Apr 28 16:48:25 2014 -0500 to Jack Poulson for reporting this bug. commit 31bb065ba40ae0c5a614e743b8025abca012b99e -Merge: 20e2443 7c61959 +Merge: 20e24430 7c619599 Author: Tyler Smith Date: Wed Apr 23 12:30:19 2014 -0500 @@ -4535,7 +5536,7 @@ Date: Fri Apr 4 10:22:48 2014 -0500 Also made herk IC and JC loops do weighted partitioning commit 2b6848b2397d6d84ca4e5f792fc51ad05e351a36 -Merge: 4e3eb39 21a0efb +Merge: 4e3eb39a 21a0efb3 Author: Tyler Smith Date: Fri Apr 4 09:54:54 2014 -0500 @@ -4654,7 +5655,7 @@ Date: Mon Mar 24 15:21:42 2014 -0500 a_next and b_next point to the current micropanels in trmm commit 23d9eab354fbc88165889832955e126772bf8488 -Merge: 5d5dc2e fd3e32a +Merge: 5d5dc2ee fd3e32a5 Author: Tyler Smith Date: Thu Mar 20 16:54:35 2014 -0500 @@ -4796,7 +5797,7 @@ Date: Mon Mar 10 15:47:28 2014 -0500 Added single threaded thread info data structures specifically for gemm and packm commit 0e8677761175189583ca7d855e24b2bbdd2dada8 -Merge: 2e727a0 b3bff63 +Merge: 2e727a02 b3bff631 Author: Tyler Smith Date: Mon Mar 10 15:16:21 2014 -0500 @@ -4829,14 +5830,14 @@ Date: Mon Mar 3 
14:31:44 2014 -0600 are currently implemented in terms of isinf() and isnan() from math.h. commit b3bff631eadf98b15cb422fb4a8e2f855c23e8a7 -Merge: 2c158fb e8757b0 +Merge: 2c158fb8 e8757b03 Author: Tyler Smith Date: Thu Feb 27 16:53:24 2014 -0600 Merge https://github.com/flame/blis commit 2c158fb885c27f7b599dc1e85b57edd684f19223 -Merge: e4738c4 c2b2ab6 +Merge: e4738c48 c2b2ab62 Author: Tyler Smith Date: Thu Feb 27 16:46:23 2014 -0600 @@ -4896,7 +5897,7 @@ Date: Thu Feb 27 14:09:19 2014 -0600 Fixed bug in thread trees commit ac5a2de1d17ffd460b00fee9757898525a09abae -Merge: 01b125e bd3c7ec +Merge: 01b125e8 bd3c7ecf Author: Tyler Smith Date: Thu Feb 27 11:59:33 2014 -0600 @@ -4973,14 +5974,14 @@ Date: Tue Feb 25 13:34:56 2014 -0600 only the real gemm micro-kernel. commit 15b51e990f1d21333b5f7af97c211756247336e5 -Merge: 6363a9f fc04b5e +Merge: 6363a9f6 fc04b5eb Author: Field G. Van Zee Date: Fri Feb 21 09:04:32 2014 -0600 Merge branch 'master' of github.com:fgvanzee/blis commit fc04b5eb69868c341ce03f5ef1f02de4b8c121b0 -Merge: b29e1c2 d1813c9 +Merge: b29e1c2b d1813c9d Author: Field G. Van Zee Date: Fri Feb 21 09:04:13 2014 -0600 @@ -5023,7 +6024,7 @@ Date: Wed Feb 19 17:00:52 2014 -0600 - Various other minor changes to facilitate 4m/3m methods. commit b29e1c2b278c177e104c84ba462820ee8296df6c -Merge: ee60377 bd3c7ec +Merge: ee60377e bd3c7ecf Author: Field G. Van Zee Date: Fri Feb 14 14:11:54 2014 -0600 @@ -5676,7 +6677,7 @@ Date: Tue Dec 3 16:08:30 2013 -0600 beta are applied to the attached scalars. commit 992de486d6f23e69a623abd15ae77d7881d13871 -Merge: 9552e6e fd4ac63 +Merge: 9552e6ee fd4ac636 Author: Field G. Van Zee Date: Mon Dec 2 13:58:46 2013 -0600 @@ -5742,7 +6743,7 @@ Date: Mon Nov 18 18:11:07 2013 -0600 that already existed in kernels/x86_64/core2-sse3/3. commit 85e7e02ea3a9190b6fcff5d46b00d41c79cb1242 -Merge: 67761e2 7072005 +Merge: 67761e22 70720054 Author: Field G. 
Van Zee Date: Mon Nov 18 12:02:00 2013 -0600 @@ -6513,7 +7514,7 @@ Date: Thu Aug 1 11:24:23 2013 -0500 dimension of the gemm macro-kernel. commit f8980edf9c318453bb1962ac4939c06bf11e6d5e -Merge: 67a8b94 6e7e452 +Merge: 67a8b949 6e7e4523 Author: Field G. Van Zee Date: Fri Jul 26 11:14:27 2013 -0500 diff --git a/Makefile b/Makefile index 1a4868eaa..d74eba889 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/README.md b/README.md index 16789103a..b0ba6d345 100644 --- a/README.md +++ b/README.md @@ -260,7 +260,9 @@ We also have a third paper, submitted to IPDPS 2014, on achieving ``` A fourth paper, submitted to ACM TOMS, also exists, which proposes an -[analytical model](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf) for determining blocksize parameters in BLIS: +[analytical model](http://dl.acm.org/citation.cfm?id=2925987) +([unofficial backup link](http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf)) +for determining blocksize parameters in BLIS: ``` @article{BLIS4, @@ -278,6 +280,32 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an } ``` +A fifth paper, submitted to ACM TOMS, begins the study of so-called +[induced methods for complex matrix multiplication](http://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf): + +``` +@article{BLIS5, + author = {Field G. {V}an~{Z}ee and Tyler Smith}, + title = {Implementing high-performance complex matrix multiplication via the 3m and 4m methods}, + journal = {ACM Transactions on Mathematical Software}, + year = {2017}, + note = {accepted} +} +``` + +A sixth paper, submitted to ACM TOMS, revisits the topic of the previous +article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev0.pdf): + +``` +@article{BLIS6, + author = {Field G. 
{V}an~{Z}ee}, + title = {Implementing high-performance complex matrix multiplication via the 1m method}, + journal = {ACM Transactions on Mathematical Software}, + note = {submitted} +} +``` + + Funding ------- diff --git a/build/auto-detect/auto-detect.sh b/build/auto-detect/auto-detect.sh index 9300e3b8b..5185fd8af 100755 --- a/build/auto-detect/auto-detect.sh +++ b/build/auto-detect/auto-detect.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -41,7 +41,11 @@ main() { - CC=gcc + if [ clang -v > /dev/null 2>&1 ]; then + CC=clang + else + CC=gcc + fi CPUID_SRC=cpuid_x86.c CPUID_BIN=blis_cpu_detect ARCH=reference @@ -59,12 +63,6 @@ main() # of the distribution and the directory in which we are building. cur_dirpath="." - - OSNAME=`uname` - if [ $OSNAME = "Darwin" ]; then - CC=clang - fi - # # Detect architecture by predefined macros # diff --git a/build/bump-version.sh b/build/bump-version.sh index 35da91b97..53cbe1825 100755 --- a/build/bump-version.sh +++ b/build/bump-version.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/check-test.sh b/build/check-test.sh index 6277ada45..6fb082a4c 100755 --- a/build/check-test.sh +++ b/build/check-test.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -47,4 +47,4 @@ if [ $? 
-eq 0 ]; then else echo "Test Pass" exit 0 -fi \ No newline at end of file +fi diff --git a/build/config.mk.in b/build/config.mk.in index 9d92f7fb4..e7a3f3235 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/gen-make-frags/fragment.mk b/build/gen-make-frags/fragment.mk index 08773302b..2a1eb6907 100644 --- a/build/gen-make-frags/fragment.mk +++ b/build/gen-make-frags/fragment.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/gen-make-frags/gen-make-frag.sh b/build/gen-make-frags/gen-make-frag.sh index e24af3005..19fdc5bd0 100755 --- a/build/gen-make-frags/gen-make-frag.sh +++ b/build/gen-make-frags/gen-make-frag.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -374,10 +374,6 @@ gen_mkfiles() read_mkfile_config() { - local index lname - declare -i count - - # Read the file describing file suffixes. 
src_file_suffixes=$(cat "${suffix_file}") diff --git a/build/mirror-tree.sh b/build/mirror-tree.sh index bac7ad9a6..813091fcf 100755 --- a/build/mirror-tree.sh +++ b/build/mirror-tree.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/templates/license.sh b/build/templates/license.sh index a9fc4b9fb..6105c1f04 100644 --- a/build/templates/license.sh +++ b/build/templates/license.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/build/update-version-file.sh b/build/update-version-file.sh index afa829e4a..23373022d 100755 --- a/build/update-version-file.sh +++ b/build/update-version-file.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/common.mk b/common.mk index 683d0b0e9..08731d9aa 100644 --- a/common.mk +++ b/common.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk index 40b6c179a..9d1b51d0a 100644 --- a/config/armv7a/make_defs.mk +++ b/config/armv7a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/armv8a/make_defs.mk b/config/armv8a/make_defs.mk index 654a9ff92..6d09af5cc 100644 --- a/config/armv8a/make_defs.mk +++ b/config/armv8a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff 
--git a/config/bgq/make_defs.mk b/config/bgq/make_defs.mk index 0f405102b..57c9899a0 100644 --- a/config/bgq/make_defs.mk +++ b/config/bgq/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/bulldozer/make_defs.mk b/config/bulldozer/make_defs.mk index 78f47d908..0546a474f 100644 --- a/config/bulldozer/make_defs.mk +++ b/config/bulldozer/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -47,8 +47,12 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) +ifeq ($(CC_VENDOR),gcc) +else +ifeq ($(CC_VENDOR),clang) +else +$(error gcc or clang are required for this configuration.) +endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). @@ -77,7 +81,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/carrizo/make_defs.mk b/config/carrizo/make_defs.mk index aaecb2d2c..f52d1dd67 100644 --- a/config/carrizo/make_defs.mk +++ b/config/carrizo/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -47,8 +47,12 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) +ifeq ($(CC_VENDOR),gcc) +else +ifeq ($(CC_VENDOR),clang) +else +$(error gcc or clang are required for this configuration.) +endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
@@ -77,7 +81,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/cortex-a15/make_defs.mk b/config/cortex-a15/make_defs.mk index ec5360da4..053e11cbb 100644 --- a/config/cortex-a15/make_defs.mk +++ b/config/cortex-a15/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/cortex-a9/make_defs.mk b/config/cortex-a9/make_defs.mk index ec5360da4..053e11cbb 100644 --- a/config/cortex-a9/make_defs.mk +++ b/config/cortex-a9/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index fed36506b..8d07f2177 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/emscripten/make_defs.mk b/config/emscripten/make_defs.mk index 45b210ab6..4353d65cf 100644 --- a/config/emscripten/make_defs.mk +++ b/config/emscripten/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 4fed2a25a..63f9f06b5 100644 --- a/config/haswell/bli_kernel.h +++ 
b/config/haswell/bli_kernel.h @@ -52,17 +52,6 @@ // -- sgemm micro-kernel -- -#if 1 -#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 -#define BLIS_DEFAULT_MC_S 144 -#define BLIS_DEFAULT_KC_S 256 -#define BLIS_DEFAULT_NC_S 4080 -#define BLIS_DEFAULT_MR_S 6 -#define BLIS_DEFAULT_NR_S 16 - -#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS -#endif - #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 @@ -74,6 +63,17 @@ #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif +#if 1 +#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 +#define BLIS_DEFAULT_MC_S 144 +#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_NC_S 4080 +#define BLIS_DEFAULT_MR_S 6 +#define BLIS_DEFAULT_NR_S 16 + +#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 @@ -85,17 +85,6 @@ // -- dgemm micro-kernel -- -#if 1 -#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 -#define BLIS_DEFAULT_MC_D 72 -#define BLIS_DEFAULT_KC_D 256 -#define BLIS_DEFAULT_NC_D 4080 -#define BLIS_DEFAULT_MR_D 6 -#define BLIS_DEFAULT_NR_D 8 - -#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS -#endif - #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 @@ -107,6 +96,17 @@ #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif +#if 1 +#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 +#define BLIS_DEFAULT_MC_D 72 +#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_NC_D 4080 +#define BLIS_DEFAULT_MR_D 6 +#define BLIS_DEFAULT_NR_D 8 + +#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 diff --git a/config/haswell/make_defs.mk b/config/haswell/make_defs.mk index 1640a40b9..8c739607a 100644 --- a/config/haswell/make_defs.mk +++ b/config/haswell/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the 
linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/knl/bli_kernel.h b/config/knl/bli_kernel.h index e32954973..46b1cb4f4 100644 --- a/config/knl/bli_kernel.h +++ b/config/knl/bli_kernel.h @@ -43,11 +43,22 @@ #define BLIS_SIMD_SIZE 64 #define BLIS_SIMD_NUM_REGISTERS 32 +#ifdef BLIS_NO_HBWMALLOC + +#include <stdlib.h> + +#define BLIS_MALLOC_POOL malloc +#define BLIS_FREE_POOL free + +#else + #include <hbwmalloc.h> #define BLIS_MALLOC_POOL hbw_malloc #define BLIS_FREE_POOL hbw_free +#endif + //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free diff --git a/config/knl/make_defs.mk b/config/knl/make_defs.mk index e0385e6d5..104abafe2 100644 --- a/config/knl/make_defs.mk +++ b/config/knl/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -65,6 +65,10 @@ else COPTFLAGS := -O3 endif +ifeq ($(DEBUG_TYPE),sde) +CPPROCFLAGS += -DBLIS_NO_HBWMALLOC +endif + CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) @@ -95,7 +99,16 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared -LDFLAGS := -lm -lmemkind + +ifneq ($(DEBUG_TYPE),sde) +LDFLAGS := -lmemkind +else +LDFLAGS := +endif + +ifneq ($(CC_VENDOR),icc) +LDFLAGS += -lm +endif diff --git a/config/loongson3a/make_defs.mk b/config/loongson3a/make_defs.mk index bb1248d37..8fd9fb65a 100644 --- a/config/loongson3a/make_defs.mk +++ b/config/loongson3a/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/mic/make_defs.mk b/config/mic/make_defs.mk index 21af9e2e2..8e7738b44 100644 --- a/config/mic/make_defs.mk +++ b/config/mic/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An
object-based framework for developing high-performance BLAS-like @@ -77,7 +77,11 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifeq ($(CC_VENDOR),icc) +LDFLAGS := -mmic +else LDFLAGS := -mmic -lm +endif diff --git a/config/piledriver/make_defs.mk b/config/piledriver/make_defs.mk index e241789dd..b5c3f159c 100644 --- a/config/piledriver/make_defs.mk +++ b/config/piledriver/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -47,8 +47,12 @@ ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif -ifneq ($(CC_VENDOR),gcc) -$(error gcc is required for this configuration.) +ifeq ($(CC_VENDOR),gcc) +else +ifeq ($(CC_VENDOR),clang) +else +$(error gcc or clang are required for this configuration.) +endif endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). @@ -77,7 +81,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/pnacl/make_defs.mk b/config/pnacl/make_defs.mk index e957cf429..c6f629ef8 100644 --- a/config/pnacl/make_defs.mk +++ b/config/pnacl/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -63,7 +63,9 @@ ARFLAGS := rcs # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif # --- Determine the finalizer and related flags --- FINALIZER := pnacl-finalize diff --git a/config/power7/make_defs.mk b/config/power7/make_defs.mk index d03857a44..765344f79 100644 --- a/config/power7/make_defs.mk +++ b/config/power7/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := 
-shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/reference/make_defs.mk b/config/reference/make_defs.mk index 16e4b1294..f75b9ec55 100644 --- a/config/reference/make_defs.mk +++ b/config/reference/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -86,7 +86,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/sandybridge/make_defs.mk b/config/sandybridge/make_defs.mk index 082a73f92..d91df8b68 100644 --- a/config/sandybridge/make_defs.mk +++ b/config/sandybridge/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -88,7 +88,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/config/template/make_defs.mk b/config/template/make_defs.mk index 37de32882..d98452553 100644 --- a/config/template/make_defs.mk +++ b/config/template/make_defs.mk @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -77,7 +77,9 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared +ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm +endif diff --git a/configure b/configure index 2358575f6..7aabc5b78 100755 --- a/configure +++ b/configure @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like @@ -458,6 +458,9 @@ main() if [ -n "${debug_flag}" ]; then if [ "x${debug_type}" = "xopt" ]; then echo "${script_name}: enabling debug symbols with optimizations." + elif [ "x${debug_type}" = "xsde" ]; then + debug_type='sde' + echo "${script_name}: enabling SDE processor emulation." 
else debug_type='noopt' echo "${script_name}: enabling debug symbols; optimizations disabled." diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index bdbb0063f..149c20320 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -41,7 +41,7 @@ #undef GENFRONT #define GENFRONT( opname, kertype ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ @@ -68,15 +68,15 @@ GENFRONT( swapv, BLIS_SWAPV_KER ) #undef GENFRONT #define GENFRONT( opname, kertype, dep1, dep2, dep3, dep4 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(dep1,_cntx_init)( cntx ); \ - PASTEMAC(dep2,_cntx_init)( cntx ); \ - PASTEMAC(dep3,_cntx_init)( cntx ); \ - PASTEMAC(dep4,_cntx_init)( cntx ); \ + PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep2,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep3,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep4,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -93,12 +93,12 @@ GENFRONT( axpbyv, BLIS_AXPBYV_KER, axpyv, xpbyv, scal2v, scalv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ @@ -116,13 +116,13 @@ GENFRONT( scalv, BLIS_SCALV_KER, setv ) #undef GENFRONT #define GENFRONT( opname, kertype, dep1, dep2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(dep1,_cntx_init)( cntx ); \ - PASTEMAC(dep2,_cntx_init)( cntx ); \ + PASTEMAC(dep1,_cntx_init)( dt, cntx ); \ + PASTEMAC(dep2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ diff --git a/frame/1/bli_l1v_cntx.h b/frame/1/bli_l1v_cntx.h index 95cd4a131..85756363b 100644 --- a/frame/1/bli_l1v_cntx.h +++ b/frame/1/bli_l1v_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addv ) diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 74a548eea..6abf002f5 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -53,7 +53,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -88,7 +88,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -123,7 +123,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( 
dt, kerid, cntx_p ); \ \ @@ -160,7 +160,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -198,7 +198,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -274,7 +274,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -306,7 +306,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -340,7 +340,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ @@ -373,7 +373,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ 
PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ \ diff --git a/frame/1d/bli_l1d_cntx.c b/frame/1d/bli_l1d_cntx.c index d285995b1..443dc20f7 100644 --- a/frame/1d/bli_l1d_cntx.c +++ b/frame/1d/bli_l1d_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ diff --git a/frame/1d/bli_l1d_cntx.h b/frame/1d/bli_l1d_cntx.h index 50db79738..e5ab92f51 100644 --- a/frame/1d/bli_l1d_cntx.h +++ b/frame/1d/bli_l1d_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addd ) diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index 5ef92603a..c8a67a138 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -90,7 +90,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -166,7 +166,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. 
*/ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -222,7 +222,7 @@ void PASTEMAC(ch,opname) \ x1 = x + offx; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -276,7 +276,7 @@ void PASTEMAC(ch,opname) \ x1 = x + offx; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ @@ -349,7 +349,7 @@ void PASTEMAC(ch,opname) \ incx = 2*incx; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(chr,kername,_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx_p ); \ diff --git a/frame/1f/bli_l1f_cntx.c b/frame/1f/bli_l1f_cntx.c index 379cbce7d..58ca4a07c 100644 --- a/frame/1f/bli_l1f_cntx.c +++ b/frame/1f/bli_l1f_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ @@ -63,13 +63,13 @@ GENFRONT( axpy2v, BLIS_AXPY2V_KER, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -86,12 +86,12 @@ GENFRONT( dotaxpyv, BLIS_DOTAXPYV_KER, dotxv, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. */ \ @@ -114,13 +114,13 @@ GENFRONT( axpyf, BLIS_AXPYF_KER, axpyv ) #undef GENFRONT #define GENFRONT( opname, kertype, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ \ /* Initialize the context with the kernel associated with the current operation. 
*/ \ diff --git a/frame/1f/bli_l1f_cntx.h b/frame/1f/bli_l1f_cntx.h index 86b3af25f..bea56ca40 100644 --- a/frame/1f/bli_l1f_cntx.h +++ b/frame/1f/bli_l1f_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( axpy2v ) diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index a7efd91f8..8c77a2465 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -57,7 +57,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -99,7 +99,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -142,7 +142,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -190,7 +190,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ @@ -238,7 +238,7 @@ void PASTEMAC(ch,opname) \ const num_t dt = PASTEMAC(ch,type); \ cntx_t* cntx_p; \ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ PASTECH2(ch,opname,_ft) f = 
bli_cntx_get_l1f_ker_dt( dt, kerid, cntx_p ); \ \ diff --git a/frame/1m/bli_l1m_cntx.c b/frame/1m/bli_l1m_cntx.c index 8569416fd..7eb3dcd4c 100644 --- a/frame/1m/bli_l1m_cntx.c +++ b/frame/1m/bli_l1m_cntx.c @@ -41,12 +41,12 @@ #undef GENFRONT #define GENFRONT( opname, depname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname,_cntx_init)( cntx ); \ + PASTEMAC(depname,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ @@ -64,13 +64,13 @@ GENFRONT( subm, subv ) #undef GENFRONT #define GENFRONT( opname, depname1, depname2 ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ bli_cntx_obj_create( cntx ); \ \ /* Initialize the context with kernel dependencies. */ \ - PASTEMAC(depname1,_cntx_init)( cntx ); \ - PASTEMAC(depname2,_cntx_init)( cntx ); \ + PASTEMAC(depname1,_cntx_init)( dt, cntx ); \ + PASTEMAC(depname2,_cntx_init)( dt, cntx ); \ } \ \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ diff --git a/frame/1m/bli_l1m_cntx.h b/frame/1m/bli_l1m_cntx.h index 46524fa0b..79e0524e8 100644 --- a/frame/1m/bli_l1m_cntx.h +++ b/frame/1m/bli_l1m_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addm ) diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index 4361c9fac..2790bd006 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -91,6 +91,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ ); INSERT_GENTDEF( packm_cxk_ker ) +INSERT_GENTDEF( packm_cxk_1er_ker ) // packm_3mis_ker diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index c4dc5f9a8..13da24e59 
100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -54,12 +54,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -118,12 +119,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -187,7 +189,8 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ @@ -195,7 +198,7 @@ void PASTEMAC(ch,opname) \ if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -256,12 +259,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. 
*/ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If alpha is zero, then we set the output matrix to zero. This seemingly minor optimization is important because it will clear @@ -344,12 +348,13 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 7a44ecb9f..991487dfd 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -48,9 +48,11 @@ #include "bli_packm_struc_cxk_4mi.h" #include "bli_packm_struc_cxk_3mis.h" #include "bli_packm_struc_cxk_rih.h" +#include "bli_packm_struc_cxk_1er.h" #include "bli_packm_cxk.h" #include "bli_packm_cxk_4mi.h" #include "bli_packm_cxk_3mis.h" #include "bli_packm_cxk_rih.h" +#include "bli_packm_cxk_1er.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 4ce7b1504..055d30f1f 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -90,6 +90,12 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = // 0111 row/col panels: real+imaginary only { { NULL, bli_cpackm_struc_cxk_rih, NULL, bli_zpackm_struc_cxk_rih, } }, +// 1000 row/col panels: 1m-expanded (1e) + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, +// 1001 row/col panels: 1m-reordered (1r) + { { NULL, bli_cpackm_struc_cxk_1er, + NULL, bli_zpackm_struc_cxk_1er, } }, }; diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 4f570400a..2f4e0b030 100644 --- 
a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -39,7 +39,7 @@ // Define context initialization functions. // -void bli_packm_cntx_init( cntx_t* cntx ) +void bli_packm_cntx_init( num_t dt, cntx_t* cntx ) { bli_cntx_obj_create( cntx ); diff --git a/frame/1m/packm/bli_packm_cntx.h b/frame/1m/packm/bli_packm_cntx.h index 1ab4df826..2210a777b 100644 --- a/frame/1m/packm/bli_packm_cntx.h +++ b/frame/1m/packm/bli_packm_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( packm ) diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/packm/bli_packm_cxk_1er.c new file mode 100644 index 000000000..352ae8353 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_1er.c @@ -0,0 +1,489 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_cxk_1er_ker_vft + +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 32 + +static FUNCPTR_T ftypes_e[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_1E_KERNEL, + NULL, BLIS_ZPACKM_2XK_1E_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_1E_KERNEL, + NULL, BLIS_ZPACKM_4XK_1E_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_1E_KERNEL, + NULL, BLIS_ZPACKM_6XK_1E_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_1E_KERNEL, + NULL, BLIS_ZPACKM_8XK_1E_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_1E_KERNEL, + NULL, BLIS_ZPACKM_10XK_1E_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* 
micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_1E_KERNEL, + NULL, BLIS_ZPACKM_12XK_1E_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_1E_KERNEL, + NULL, BLIS_ZPACKM_14XK_1E_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_1E_KERNEL, + NULL, BLIS_ZPACKM_16XK_1E_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 18 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 19 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 20 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 21 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 22 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 23 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 24 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 25 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 26 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 27 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 28 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 29 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 30 */ + { + NULL, BLIS_CPACKM_30XK_1E_KERNEL, + NULL, BLIS_ZPACKM_30XK_1E_KERNEL, + }, + /* micro-panel width = 31 */ + { + NULL, NULL, NULL, NULL, + }, +}; + +static FUNCPTR_T ftypes_r[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* micro-panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 2 */ + { + NULL, BLIS_CPACKM_2XK_1R_KERNEL, + NULL, BLIS_ZPACKM_2XK_1R_KERNEL, + }, + /* micro-panel width = 3 */ + { + NULL, BLIS_CPACKM_3XK_1R_KERNEL, + NULL, BLIS_ZPACKM_3XK_1R_KERNEL, + }, + /* micro-panel width = 4 */ + { + NULL, BLIS_CPACKM_4XK_1R_KERNEL, + 
NULL, BLIS_ZPACKM_4XK_1R_KERNEL, + }, + /* micro-panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 6 */ + { + NULL, BLIS_CPACKM_6XK_1R_KERNEL, + NULL, BLIS_ZPACKM_6XK_1R_KERNEL, + }, + /* micro-panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 8 */ + { + NULL, BLIS_CPACKM_8XK_1R_KERNEL, + NULL, BLIS_ZPACKM_8XK_1R_KERNEL, + }, + /* micro-panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 10 */ + { + NULL, BLIS_CPACKM_10XK_1R_KERNEL, + NULL, BLIS_ZPACKM_10XK_1R_KERNEL, + }, + /* micro-panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 12 */ + { + NULL, BLIS_CPACKM_12XK_1R_KERNEL, + NULL, BLIS_ZPACKM_12XK_1R_KERNEL, + }, + /* micro-panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 14 */ + { + NULL, BLIS_CPACKM_14XK_1R_KERNEL, + NULL, BLIS_ZPACKM_14XK_1R_KERNEL, + }, + /* micro-panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 16 */ + { + NULL, BLIS_CPACKM_16XK_1R_KERNEL, + NULL, BLIS_ZPACKM_16XK_1R_KERNEL, + }, + /* micro-panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 18 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 19 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 20 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 21 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 22 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 23 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 24 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 25 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 26 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 27 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 28 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 29 */ + { + NULL, NULL, NULL, NULL, + }, + /* micro-panel width = 30 */ + { + NULL, 
BLIS_CPACKM_30XK_1R_KERNEL, + NULL, BLIS_ZPACKM_30XK_1R_KERNEL, + }, + /* micro-panel width = 31 */ + { + NULL, NULL, NULL, NULL, + }, +}; + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + num_t dt; \ + FUNCPTR_T f; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the micro-panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) \ + { \ + if ( bli_is_1e_packed( schema ) ) f = ftypes_e[panel_dim][dt]; \ + else /*( bli_is_1r_packed( schema ) )*/ f = ftypes_r[panel_dim][dt]; \ + } \ + else f = NULL; \ +\ + /* If there exists a kernel implementation for the micro-panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. */ \ + if ( f != NULL ) \ + { \ + f \ + ( \ + conja, \ + panel_len, \ + kappa, \ + a, inca, lda, \ + p, ldp \ + ); \ + } \ + else \ + { \ + dim_t i, j; \ +\ + if ( bli_is_1e_packed( schema ) ) \ + { \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict a_ri = ( ctype* )a; \ + ctype* restrict p_ri = ( ctype* )p; \ + ctype* restrict p_ir = ( ctype* )p + ldp/2; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage. 
*/ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \ + ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal2j1es)( *kappa_cast, \ + *alpha11_ri, \ + *pi11_ri, \ + *pi11_ir ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype* restrict alpha11_ri = a_ri + (i )*inca + (j )*lda; \ + ctype* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ + ctype* restrict pi11_ir = p_ir + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal21es)( *kappa_cast, \ + *alpha11_ri, \ + *pi11_ri, \ + *pi11_ir ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + 1; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + ctype_r* restrict p_i = ( ctype_r* )p + ldp; \ + const dim_t inca2 = 2*inca; \ + const dim_t lda2 = 2*lda; \ + const dim_t ldp2 = 2*ldp; \ +\ + /* Treat the micro-panel as panel_dim x panel_len and column-stored + (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage. 
*/ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \ +\ + PASTEMAC(ch,scal2jris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < panel_len; ++j ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp2; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp2; \ +\ + PASTEMAC(ch,scal2ris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_cxk_1er.h b/frame/1m/packm/bli_packm_cxk_1er.h new file mode 100644 index 000000000..bd87216d0 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_1er.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_packm_cxk_1e_ref.h" +#include "bli_packm_cxk_1r_ref.h" + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + pack_t schema, \ + dim_t panel_dim, \ + dim_t panel_len, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index ccf88f3cb..d828f698d 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -121,11 +121,11 @@ siz_t bli_packm_init if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) { - schema = bli_cntx_get_pack_schema_a( cntx ); + schema = bli_cntx_get_pack_schema_a_block( cntx ); } else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) { - schema = bli_cntx_get_pack_schema_b( cntx ); + schema = bli_cntx_get_pack_schema_b_panel( cntx ); } else // if ( pack_buf_type == 
BLIS_BUFFER_FOR_C_PANEL ) { diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/packm/bli_packm_struc_cxk_1er.c new file mode 100644 index 000000000..6ed34808f --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.c @@ -0,0 +1,610 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t panel_len; \ + inc_t incc, ldc; \ + inc_t ldp; \ +\ +\ + /* Determine the dimensions and relative strides of the micro-panel + based on its pack schema. */ \ + if ( bli_is_col_packed( schema ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_row_packed( schema ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ +\ + /* Handle micro-panel packing based on the structure of the matrix + being packed. */ \ + if ( bli_is_general( strucc ) ) \ + { \ + /* For micro-panels of general matrices, we can call the pack + kernel front-end directly. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* Call a helper function for micro-panels of Hermitian/symmetric + matrices. 
*/ \ + PASTEMAC(ch,packm_herm_cxk_1er) \ + ( \ + strucc, \ + diagoffc, \ + uploc, \ + conjc, \ + schema, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp, \ + cntx \ + ); \ + } \ + else /* ( bli_is_triangular( strucc ) ) */ \ + { \ + /* Call a helper function for micro-panels of triangular + matrices. */ \ + PASTEMAC(ch,packm_tri_cxk_1er) \ + ( \ + strucc, \ + diagoffc, \ + diagc, \ + uploc, \ + conjc, \ + schema, \ + invdiag, \ + m_panel, \ + n_panel, \ + m_panel_max, \ + n_panel_max, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, rs_c, cs_c, \ + incc, ldc, \ + p, rs_p, cs_p, \ + ldp, \ + cntx \ + ); \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimensions are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases.
*/ \ + { \ + if ( m_panel != m_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t offm = m_panel; \ + dim_t offn = 0; \ + dim_t m_edge = m_panel_max - m_panel; \ + dim_t n_edge = n_panel_max; \ +\ + PASTEMAC(ch,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + dim_t offm = 0; \ + dim_t offn = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - n_panel; \ +\ + PASTEMAC(ch,set1ms_mxn) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + zero, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ +\ + if ( bli_is_triangular( strucc ) ) \ + { \ + /* If this micro-panel is an edge case in both panel dimension and + length, then it must be a bottom-right corner case, which + typically only happens for micro-panels being packed for trsm. + (It also happens for trmm if kr > 1.) Here, we set the part of + the diagonal that extends into the zero-padded region to + identity. This prevents NaNs and Infs from creeping into the + computation. If this code does execute for trmm, it is okay, + because those 1.0's that extend into the bottom-right region + end up getting multiplied by the 0.0's in the zero-padded region + of the other matrix.
*/ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + ctype* restrict one = PASTEMAC(ch,1); \ + dim_t offm = m_panel; \ + dim_t offn = n_panel; \ + dim_t m_edge = m_panel_max - m_panel; \ + dim_t n_edge = n_panel_max - n_panel; \ +\ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + offm, \ + offn, \ + m_edge, \ + n_edge, \ + one, \ + p, rs_p, cs_p, ldp \ + ); \ + } \ + } \ + } \ +\ +\ +/* + if ( bli_is_1r_packed( schema ) ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ + \ + if ( bli_is_1e_packed( schema ) ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \ + ( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +} + +INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + doff_t diagoffc_abs; \ + dim_t j; \ + bool_t row_stored; \ + bool_t col_stored; \ +\ +\ + /* Create flags to indicate row or column storage. Note that the + schema bit that encodes row or column is describing the form of + micro-panel, not the storage in the micro-panel. Hence the + mismatch in "row" and "column" semantics. */ \ + row_stored = bli_is_col_packed( schema ); \ + col_stored = bli_is_row_packed( schema ); \ +\ + /* Handle the case where the micro-panel does NOT intersect the + diagonal separately from the case where it does intersect.
*/ \ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + ctype* restrict c10; \ + ctype* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( col_stored && diagoffc < 0 ) || \ + ( row_stored && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( row_stored && bli_is_upper( uploc ) ) || \ + ( col_stored && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \ + ( col_stored && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp, \ + cntx \ + ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,kername) \ + ( \ + conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp, \ + cntx \ + ); \ +\ + /* Pack the stored triangle of c11 to p11. 
*/ \ + { \ + dim_t j = diagoffc_abs; \ + ctype* restrict c11 = c + (j )*ldc; \ + ctype* restrict p11 = p + (j )*ldp; \ +\ + PASTEMAC(ch,scal21ms_mxn_uplo) \ + ( \ + schema, \ + uploc, \ + conjc, \ + panel_dim, \ + kappa, \ + c11, rs_c, cs_c, \ + p11, rs_p, cs_p, ldp \ + ); \ +\ + /* If we are packing a micro-panel with Hermitian structure, + we must take special care of the diagonal. Now, if kappa + were guaranteed to be unit, all we would need to do is + explicitly zero out the imaginary part of the diagonal of + p11, in case the diagonal of the source matrix contained + garbage (non-zero) imaginary values. HOWEVER, since kappa + can be non-unit, things become a little more complicated. + In general, we must re-apply the kappa scalar to ONLY the + real part of the diagonal of the source matrix and save + the result to the diagonal of p11. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + ctype_r* restrict c11_r = ( ctype_r* )c11; \ + const dim_t rs_c2 = 2*rs_c; \ + const dim_t cs_c2 = 2*cs_c; \ +\ + PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \ + ( \ + schema, \ + panel_dim, \ + panel_dim, \ + kappa, \ + c11_r, rs_c2, cs_c2, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ) \ +{ \ + doff_t diagoffp_abs = bli_abs( diagoffp ); \ + ctype* p11 = p + (diagoffp_abs )*ldp; \ +\ +\ + /* Pack the panel. 
*/ \ + PASTEMAC(ch,kername) \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ +\ +\ + /* Tweak the panel according to its triangular structure */ \ + { \ + /* If the diagonal of c is implicitly unit, explicitly set the + diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + PASTEMAC(ch,set1ms_mxn_diag) \ + ( \ + schema, \ + 0, \ + 0, \ + panel_dim, \ + panel_dim, \ + kappa, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ +\ +\ + /* If requested, invert the diagonal of the packed panel. */ \ + if ( invdiag == TRUE ) \ + { \ + PASTEMAC(ch,invert1ms_mxn_diag) \ + ( \ + schema, \ + 0, \ + 0, \ + panel_dim, \ + panel_dim, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ +\ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel. */ \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + uplo_t uplop = uploc; \ + doff_t diagoffp11_0 = 0; \ + dim_t p11_0_dim = panel_dim - 1; \ +\ + bli_toggle_uplo( uplop ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp11_0 ); \ +\ + /* Note that this macro works a little differently than the setm + operation. Here, we pass in the dimensions of only p11, rather + than the whole micro-panel, and furthermore we pass in the + "shrunken" dimensions of p11, corresponding to the toggling + and shrinking of the diagonal above. The macro will do the + right thing, incrementing the pointer to p11 by the appropriate + leading dimension (cs_p or rs_p), and setting only the lower + or upper triangle to zero.
*/ \ + PASTEMAC(ch,set1ms_mxn_uplo) \ + ( \ + schema, \ + diagoffp11_0, \ + uplop, \ + p11_0_dim, \ + p11_0_dim, \ + zero, \ + p11, rs_p, cs_p, ldp \ + ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er ) + diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/packm/bli_packm_struc_cxk_1er.h new file mode 100644 index 000000000..b0b1d0a2f --- /dev/null +++ b/frame/1m/packm/bli_packm_struc_cxk_1er.h @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t is_p, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_struc_cxk_1er ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_1er ) + + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + 
dim_t panel_dim, \ + dim_t panel_len, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + inc_t incc, inc_t ldc, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + inc_t ldp, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_1er ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c new file mode 100644 index 000000000..9f2acdce8 --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.c @@ -0,0 +1,1099 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 
1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_2xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + 
pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_4xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + 
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), 
*(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_6xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + 
PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), 
*(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_8xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri 
+ 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), 
*(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_10xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( 
PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( 
*(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( 
*kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_12xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( 
*(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + 
PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 
2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_14xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri 
+ 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + 
PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + 
PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, 
*(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_16xk_1e_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca1 = inca; \ + const inc_t lda1 = lda; \ + const inc_t ldp1 = ldp; \ +\ + ctype* restrict kappa_cast = ( ctype* )kappa; \ + ctype* restrict alpha1_ri = ( ctype* )a; \ + ctype* restrict pi1_ri = ( ctype* )p; \ + ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri 
+13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,copyj1es)( *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,copy1es)( 
*(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,copy1es)( *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir 
+= ldp1; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, 
*(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), 
*(pi1_ir + 8) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +16*inca1), *(pi1_ri +16), *(pi1_ir +16) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +17*inca1), *(pi1_ri +17), *(pi1_ir +17) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +18*inca1), *(pi1_ri +18), *(pi1_ir +18) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +19*inca1), *(pi1_ri +19), *(pi1_ir +19) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +20*inca1), *(pi1_ri +20), *(pi1_ir +20) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +21*inca1), *(pi1_ri +21), *(pi1_ir +21) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +22*inca1), *(pi1_ri +22), *(pi1_ir +22) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +23*inca1), *(pi1_ri +23), *(pi1_ir +23) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +24*inca1), *(pi1_ri +24), *(pi1_ir +24) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +25*inca1), *(pi1_ri +25), *(pi1_ir +25) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +26*inca1), *(pi1_ri +26), *(pi1_ir +26) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +27*inca1), *(pi1_ri +27), *(pi1_ir +27) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +28*inca1), *(pi1_ri +28), *(pi1_ir +28) ); \ + PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri 
+29*inca1), *(pi1_ri +29), *(pi1_ir +29) ); \ +\ + alpha1_ri += lda1; \ + pi1_ri += ldp1; \ + pi1_ir += ldp1; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_30xk_1e_ref ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h new file mode 100644 index 000000000..beebdafdc --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1e_ref.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Redefine level-1m kernel API names to induce prototypes. + +#undef packm_2xk_ker_name +#define packm_2xk_ker_name packm_2xk_1e_ref +// 1e format should probably never have an odd-numbered register blocking. +//#undef packm_3xk_ker_name +//#define packm_3xk_ker_name packm_3xk_1e_ref +#undef packm_4xk_ker_name +#define packm_4xk_ker_name packm_4xk_1e_ref +#undef packm_6xk_ker_name +#define packm_6xk_ker_name packm_6xk_1e_ref +#undef packm_8xk_ker_name +#define packm_8xk_ker_name packm_8xk_1e_ref +#undef packm_10xk_ker_name +#define packm_10xk_ker_name packm_10xk_1e_ref +#undef packm_12xk_ker_name +#define packm_12xk_ker_name packm_12xk_1e_ref +#undef packm_14xk_ker_name +#define packm_14xk_ker_name packm_14xk_1e_ref +#undef packm_16xk_ker_name +#define packm_16xk_ker_name packm_16xk_1e_ref +#undef packm_30xk_ker_name +#define packm_30xk_ker_name packm_30xk_1e_ref + +// Include the level-1m kernel API template. + +#include "bli_l1m_ker.h" + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c new file mode 100644 index 000000000..6e30ca5bc --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.c @@ -0,0 +1,1254 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_2xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += 
lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_3xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ 
+ for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_4xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += 
ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); 
\ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_6xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_8xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + 
const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 
2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 
8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_10xk_1r_ref ) + + +/* packm_12xk_1r_ref (instantiated via INSERT_GENTFUNCCO_BASIC0): for each of n iterations, takes 12 source elements from a -- component pairs read through ctype_r pointers at a and a + 1, with consecutive elements 2*inca components apart -- and stores the first component of each at pi1_r[0..11] and the second at pi1_i[0..11], where pi1_i begins ldp components after p. Each iteration then advances both source pointers by 2*lda and both destination pointers by 2*ldp components. If the eq1 test on *kappa holds, the copyris/copyjris macros are used; otherwise scal2ris/scal2jris are used with kappa_r/kappa_i. The "j" forms are selected when bli_is_conj( conja ) is true (presumably they conjugate the source element; confirm against the macro definitions). */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t
inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r 
+ 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, 
*(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_12xk_1r_ref ) + + +/* packm_14xk_1r_ref (instantiated via INSERT_GENTFUNCCO_BASIC0): for each of n iterations, takes 14 source elements from a -- component pairs read through ctype_r pointers at a and a + 1, with consecutive elements 2*inca components apart -- and stores the first component of each at pi1_r[0..13] and the second at pi1_i[0..13], where pi1_i begins ldp components after p. Each iteration then advances both source pointers by 2*lda and both destination pointers by 2*ldp components. If the eq1 test on *kappa holds, the copyris/copyjris macros are used; otherwise scal2ris/scal2jris are used with kappa_r/kappa_i. The "j" forms are selected when bli_is_conj( conja ) is true (presumably they conjugate the source element; confirm against the macro definitions). */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i +
5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), 
*(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + 
PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_14xk_1r_ref ) + + +/* packm_16xk_1r_ref (instantiated via INSERT_GENTFUNCCO_BASIC0): for each of n iterations, takes 16 source elements from a -- component pairs read through ctype_r pointers at a and a + 1, with consecutive elements 2*inca components apart -- and stores the first component of each at pi1_r[0..15] and the second at pi1_i[0..15], where pi1_i begins ldp components after p. Each iteration then advances both source pointers by 2*lda and both destination pointers by 2*ldp components. If the eq1 test on *kappa holds, the copyris/copyjris macros are used; otherwise scal2ris/scal2jris are used with kappa_r/kappa_i. The "j" forms are selected when bli_is_conj( conja ) is true (presumably they conjugate the source element; confirm against the macro definitions). */ +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2),
*(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i 
+ 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, 
*kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, 
*kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_16xk_1r_ref ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + conj_t conja, \ + dim_t n, \ + void* restrict kappa, \ + void* restrict a, inc_t inca, inc_t lda, \ + void* restrict p, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype* kappa_cast = kappa; \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), 
*(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + 
PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +27*inca2), *(alpha1_i 
+27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), 
*(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +28*inca2), *(alpha1_i 
+28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), *(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i 
+13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +16*inca2), *(alpha1_i +16*inca2), *(pi1_r +16), *(pi1_i +16) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +17*inca2), *(alpha1_i +17*inca2), *(pi1_r +17), *(pi1_i +17) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +18*inca2), *(alpha1_i +18*inca2), *(pi1_r +18), *(pi1_i +18) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +19*inca2), *(alpha1_i +19*inca2), *(pi1_r +19), *(pi1_i +19) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +20*inca2), *(alpha1_i +20*inca2), *(pi1_r +20), *(pi1_i +20) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +21*inca2), *(alpha1_i +21*inca2), *(pi1_r +21), *(pi1_i +21) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +22*inca2), *(alpha1_i +22*inca2), *(pi1_r +22), *(pi1_i +22) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +23*inca2), *(alpha1_i +23*inca2), *(pi1_r +23), *(pi1_i +23) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +24*inca2), *(alpha1_i +24*inca2), *(pi1_r +24), *(pi1_i +24) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +25*inca2), *(alpha1_i +25*inca2), *(pi1_r +25), *(pi1_i +25) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +26*inca2), *(alpha1_i +26*inca2), *(pi1_r +26), *(pi1_i +26) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +27*inca2), *(alpha1_i +27*inca2), *(pi1_r +27), *(pi1_i +27) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +28*inca2), *(alpha1_i +28*inca2), *(pi1_r +28), *(pi1_i +28) ); \ + PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +29*inca2), *(alpha1_i +29*inca2), *(pi1_r +29), 
*(pi1_i +29) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_30xk_1r_ref ) + diff --git a/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h new file mode 100644 index 000000000..a6e3f0aef --- /dev/null +++ b/frame/1m/packm/ukernels/bli_packm_cxk_1r_ref.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Redefine level-1m kernel API names to induce prototypes. + +#undef packm_2xk_ker_name +#define packm_2xk_ker_name packm_2xk_1r_ref +#undef packm_3xk_ker_name +#define packm_3xk_ker_name packm_3xk_1r_ref +#undef packm_4xk_ker_name +#define packm_4xk_ker_name packm_4xk_1r_ref +#undef packm_6xk_ker_name +#define packm_6xk_ker_name packm_6xk_1r_ref +#undef packm_8xk_ker_name +#define packm_8xk_ker_name packm_8xk_1r_ref +#undef packm_10xk_ker_name +#define packm_10xk_ker_name packm_10xk_1r_ref +#undef packm_12xk_ker_name +#define packm_12xk_ker_name packm_12xk_1r_ref +#undef packm_14xk_ker_name +#define packm_14xk_ker_name packm_14xk_1r_ref +#undef packm_16xk_ker_name +#define packm_16xk_ker_name packm_16xk_1r_ref +#undef packm_30xk_ker_name +#define packm_30xk_ker_name packm_30xk_1r_ref + +// Include the level-1m kernel API template. + +#include "bli_l1m_ker.h" + diff --git a/frame/2/bli_l2_cntx.c b/frame/2/bli_l2_cntx.c index 841217365..fdfe27a85 100644 --- a/frame/2/bli_l2_cntx.c +++ b/frame/2/bli_l2_cntx.c @@ -41,7 +41,7 @@ #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -50,20 +50,20 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ operation. 
*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \ - bli_axpyf_cntx_init( cntx ); \ - bli_dotxf_cntx_init( cntx ); \ + bli_axpyf_cntx_init( dt, cntx ); \ + bli_dotxf_cntx_init( dt, cntx ); \ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ - bli_dotxv_cntx_init( cntx ); \ - bli_scalv_cntx_init( cntx ); \ - bli_setv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ + bli_dotxv_cntx_init( dt, cntx ); \ + bli_scalv_cntx_init( dt, cntx ); \ + bli_setv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ @@ -88,7 +88,7 @@ GENFRONT( trsv ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -96,10 +96,10 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ /* Initialize the context with kernels employed by the current operation. */ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. 
*/ \ @@ -122,7 +122,7 @@ GENFRONT( syr ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -133,22 +133,22 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPYF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXF_KER, cntx );*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_DOTXAXPYF_KER, cntx );*/ \ - bli_dotaxpyv_cntx_init( cntx ); \ - bli_axpyf_cntx_init( cntx ); \ - bli_dotxf_cntx_init( cntx ); \ - bli_dotxaxpyf_cntx_init( cntx ); \ + bli_dotaxpyv_cntx_init( dt, cntx ); \ + bli_axpyf_cntx_init( dt, cntx ); \ + bli_dotxf_cntx_init( dt, cntx ); \ + bli_dotxaxpyf_cntx_init( dt, cntx ); \ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_DOTXV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );*/ \ - bli_axpyv_cntx_init( cntx ); \ - bli_dotxv_cntx_init( cntx ); \ - bli_scalv_cntx_init( cntx ); \ - bli_setv_cntx_init( cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ + bli_dotxv_cntx_init( dt, cntx ); \ + bli_scalv_cntx_init( dt, cntx ); \ + bli_setv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ @@ -173,7 +173,7 @@ GENFRONT( symv ) #undef GENFRONT #define GENFRONT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ) \ { \ /* Perform basic setup on the context. */ \ bli_cntx_obj_create( cntx ); \ @@ -182,11 +182,11 @@ void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ) \ operation. 
*/ \ /*bli_gks_cntx_set_l1f_ker( BLIS_AXPY2V_KER, cntx );*/ \ /*bli_gks_cntx_set_l1v_ker( BLIS_AXPYV_KER, cntx );*/ \ - bli_axpy2v_cntx_init( cntx ); \ - bli_axpyv_cntx_init( cntx ); \ + bli_axpy2v_cntx_init( dt, cntx ); \ + bli_axpyv_cntx_init( dt, cntx ); \ \ /* Initialize the context with packm-related kernels. */ \ - bli_packm_cntx_init( cntx ); \ + bli_packm_cntx_init( dt, cntx ); \ \ /* Set the register and cache blocksizes and multiples, as well as the execution method. */ \ diff --git a/frame/2/bli_l2_cntx.h b/frame/2/bli_l2_cntx.h index 8b6566f55..a3bafa0c8 100644 --- a/frame/2/bli_l2_cntx.h +++ b/frame/2/bli_l2_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( gemv ) diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c index 24558fd9d..f2681d7d8 100644 --- a/frame/2/bli_l2_tapi.c +++ b/frame/2/bli_l2_tapi.c @@ -55,8 +55,9 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ - dim_t m_y, n_x; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ + dim_t m_y, n_x; \ \ /* Determine the dimensions of y and x. */ \ bli_set_dims_with_trans( transa, m, n, m_y, n_x ); \ @@ -65,7 +66,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( m_y ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. */ \ @@ -135,13 +136,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x or y has zero elements, or if alpha is zero, return early. 
*/ \ if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -188,10 +190,11 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. */ \ @@ -261,8 +264,9 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ - ctype alpha_local; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ + ctype alpha_local; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \ @@ -273,7 +277,7 @@ void PASTEMAC(ch,opname) \ PASTEMAC2(chr,ch,copys)( *alpha, alpha_local ); \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -324,13 +328,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. 
*/ \ PASTECH2(ch,ftname,_ft) f; \ @@ -383,13 +388,14 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_ft) f; \ @@ -444,10 +450,11 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - cntx_t* cntx_p; \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ \ /* Initialize a local context if the given context is NULL. */ \ - bli_cntx_init_local_if( opname, cntx, cntx_p ); \ + bli_cntx_init_local_if( opname, dt, cntx, cntx_p ); \ \ /* If x has zero elements, return early. */ \ if ( bli_zero_dim1( m ) ) return; \ diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index a8dfee1ba..4fe3fe7f5 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -70,8 +70,8 @@ void bli_l3_cntl_create_if else { // If the user provided a control tree, create a copy and use it - // instead (so that it can be used to cache things like pack mem_t - // entries). + // instead (so that threads can use its local tree as a place to + // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( cntl_orig ); } } diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 634e4c1ab..161e68160 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -38,7 +38,7 @@ // Define context initialization functions. // -void bli_gemm_cntx_init( cntx_t* cntx ) +void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. 
bli_cntx_obj_clear( cntx ); @@ -49,7 +49,7 @@ void bli_gemm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), given the execution method. @@ -63,9 +63,8 @@ void bli_gemm_cntx_init( cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_gemm_cntx_finalize( cntx_t* cntx ) @@ -74,7 +73,7 @@ void bli_gemm_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsm_cntx_init( cntx_t* cntx ) +void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) { // Clear the context fields. bli_cntx_obj_clear( cntx ); @@ -92,7 +91,7 @@ void bli_trsm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), given the execution method. @@ -106,9 +105,8 @@ void bli_trsm_cntx_init( cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_trsm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/3/bli_l3_cntx.h b/frame/3/bli_l3_cntx.h index 21b756656..223fa5e25 100644 --- a/frame/3/bli_l3_cntx.h +++ b/frame/3/bli_l3_cntx.h @@ -40,7 +40,7 @@ #undef GENPROT #define GENPROT( opname ) \ \ -void PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ +void PASTEMAC(opname,_cntx_init)( num_t dt, cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( gemm ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b3494b174..775ca2544 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create opid_t family ) { - void* macro_kernel_p = bli_gemm_ker_var2; + return bli_gemmbp_cntl_create( family ); +} +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var2; // Change the macro-kernel if the operation family is herk or trmm. if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; @@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create ( - bli_gemm_packa, + bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, BLIS_MR, BLIS_KR, @@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix B. 
cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create ( - bli_gemm_packb, + bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, BLIS_KR, BLIS_NR, @@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create return gemm_cntl_vl_mm; } +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var1; + + // Change the macro-kernel if the operation family is herk or trmm. + //if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; + + // Create two nodes for the macro-kernel. + cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create + ( + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. + ); + + cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + gemm_cntl_ub_ke + ); + + // Create a node for packing matrix A (which is really the right-hand + // operand "B"). + cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_gemm_packb, // pack the right-hand operand + bli_packm_blk_var1, + BLIS_KR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + gemm_cntl_pb_ub + ); + + // Create a node for partitioning the n dimension by MC. + cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create + ( + BLIS_MC, + bli_gemm_blk_var2, + gemm_cntl_packb + ); + + // Create a node for packing matrix B (which is really the left-hand + // operand "A"). + cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_gemm_packa, // pack the left-hand operand + bli_packm_blk_var1, + BLIS_NR, + BLIS_KR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? 
+ FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + gemm_cntl_op_pb + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + ( + BLIS_KC, + bli_gemm_blk_var3, + gemm_cntl_packa + ); + + // Create a node for partitioning the m dimension by NC. + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + ( + BLIS_NC, + bli_gemm_blk_var1, + gemm_cntl_mm_op + ); + + return gemm_cntl_vl_mm; +} + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 5b985327c..6da6cd768 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create opid_t family ); +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ); + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ); + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index ad645411e..0c30b2f7b 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -112,5 +112,6 @@ void bli_gemm_front cntl ); } + } diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/bli_gemm_ker_var1.c new file mode 100644 index 000000000..7b485a6b7 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var1.c @@ -0,0 +1,56 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_ker_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Implement _ker_var1() in terms of _ker_var2() by transposing the + // entire suboperation (which also requires swapping A and B). 
+ + bli_obj_induce_trans( *a ); + bli_obj_induce_trans( *b ); + bli_obj_induce_trans( *c ); + + bli_gemm_ker_var2( b, a, c, cntx, cntl, thread ); +} + diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 8af29594d..c27a0b67c 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -109,6 +109,26 @@ void bli_gemm_ker_var2 buf_alpha = bli_obj_internal_scalar_buffer( scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( *c ); + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. +#if 1 + if ( bli_is_1m_packed( schema_a ) ) + { + bli_l3_ind_recast_1m_params + ( + dt_exec, + schema_a, + c, + m, n, k, + pd_a, ps_a, + pd_b, ps_b, + rs_c, cs_c + ); + } +#endif + // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index c66587fda..88412c3d8 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 ) GENPROT( gemm_packa ) GENPROT( gemm_packb ) +GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 833dadb42..0f8e38688 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -85,6 +85,7 @@ void bli_blksz_obj_free // ----------------------------------------------------------------------------- +#if 0 void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, @@ -116,6 +117,66 @@ void bli_blksz_reduce_dt_to bli_blksz_set_def( blksz_def, dt_bs, blksz ); bli_blksz_set_max( blksz_max, dt_bs, blksz ); } +#endif + +// ----------------------------------------------------------------------------- + +void bli_blksz_reduce_def_to + ( + num_t dt_bm, blksz_t* bmult, + 
num_t dt_bs, blksz_t* blksz + ) +{ + dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); + + dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); + + // If the blocksize multiple is zero, we do nothing. + if ( bmult_val == 0 ) return; + + // Round the default and maximum blocksize values down to their + // respective nearest multiples of bmult_val. (Notice that we + // ignore the "max" entry in the bmult object since that would + // correspond to the packing dimension, which plays no role + // as a blocksize multiple.) + blksz_def = ( blksz_def / bmult_val ) * bmult_val; + + // Make sure the new blocksize values are at least the blocksize + // multiple. + if ( blksz_def == 0 ) blksz_def = bmult_val; + + // Store the new blocksizes back to the object. + bli_blksz_set_def( blksz_def, dt_bs, blksz ); +} + +// ----------------------------------------------------------------------------- + +void bli_blksz_reduce_max_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) +{ + dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); + + dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); + + // If the blocksize multiple is zero, we do nothing. + if ( bmult_val == 0 ) return; + + // Round the blocksize values down to its nearest multiple of + // of bmult_val. (Notice that we ignore the "max" entry in the + // bmult object since that would correspond to the packing + // dimension, which plays no role as a blocksize multiple.) + blksz_max = ( blksz_max / bmult_val ) * bmult_val; + + // Make sure the new blocksize value is at least the blocksize + // multiple. + if ( blksz_max == 0 ) blksz_max = bmult_val; + + // Store the new blocksize back to the object. 
+ bli_blksz_set_max( blksz_max, dt_bs, blksz ); +} // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index daffb3772..cfe2023e1 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -89,11 +89,23 @@ (b_dst)->e[ dt_dst ] = (b_src)->e[ dt_src ]; \ } +#define bli_blksz_scale_def( num, den, dt, b ) \ +{ \ + (b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \ +} + +#define bli_blksz_scale_max( num, den, dt, b ) \ +{ \ + (b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \ +} + +#if 0 #define bli_blksz_scale_dt_by( num, den, dt, b ) \ { \ (b)->v[ dt ] = ( (b)->v[ dt ] * num ) / den; \ (b)->e[ dt ] = ( (b)->e[ dt ] * num ) / den; \ } +#endif // ----------------------------------------------------------------------------- @@ -121,12 +133,25 @@ void bli_blksz_obj_free // ----------------------------------------------------------------------------- +#if 0 void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); +#endif +void bli_blksz_reduce_def_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); + +void bli_blksz_reduce_max_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index 2b45a5de3..cac290da9 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -97,6 +97,16 @@ void bli_cntl_free cntl_t* cntl, thrinfo_t* thread ) +{ + if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread ); + else bli_cntl_free_wo_thrinfo( cntl ); +} + +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ) { // Base case: simply return when asked to free NULL nodes. if ( cntl == NULL ) return; @@ -112,7 +122,7 @@ void bli_cntl_free { // Recursively free all memory associated with the sub-node and its // children. 
- bli_cntl_free( cntl_sub_node, thread_sub_node ); + bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node ); } // Free the current node's params field, if it is non-NULL. @@ -122,8 +132,8 @@ void bli_cntl_free } // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the current thread - // is chief for its group, and only if the mem_t is allocated. + // broker from which it originated, but only if the mem_t entry is + // allocated, and only if the current thread is chief for its group. if ( bli_thread_am_ochief( thread ) ) if ( bli_mem_is_alloc( cntl_pack_mem ) ) { @@ -134,6 +144,42 @@ void bli_cntl_free bli_cntl_obj_free( cntl ); } +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ) +{ + // Base case: simply return when asked to free NULL nodes. + if ( cntl == NULL ) return; + + cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); + void* cntl_params = bli_cntl_params( cntl ); + mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); + + { + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free_wo_thrinfo( cntl_sub_node ); + } + + // Free the current node's params field, if it is non-NULL. + if ( cntl_params != NULL ) + { + bli_free_intl( cntl_params ); + } + + // Release the current node's pack mem_t entry back to the memory + // broker from which it originated, but only if the mem_t entry is + // allocated. + if ( bli_mem_is_alloc( cntl_pack_mem ) ) + { + bli_membrk_release( cntl_pack_mem ); + } + + // Free the current node. 
+ bli_cntl_obj_free( cntl ); +} + // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 7b6000bb9..fd0413f4f 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -75,12 +75,25 @@ void bli_cntl_obj_clear cntl_t* cntl ); +// ----------------------------------------------------------------------------- + void bli_cntl_free ( cntl_t* cntl, thrinfo_t* thread ); +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ); + cntl_t* bli_cntl_copy ( cntl_t* cntl diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index bd9972332..673987bfd 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx ) return bli_cntx_method( cntx ); } -pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ) { - return bli_cntx_schema_a( cntx ); + return bli_cntx_schema_a_block( cntx ); } -pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ) { - return bli_cntx_schema_b( cntx ); + return bli_cntx_schema_b_panel( cntx ); +} + +pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ) +{ + return bli_cntx_schema_c_panel( cntx ); +} + +bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ) +{ + return bli_cntx_anti_pref( cntx ); } #endif @@ -386,27 +396,27 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { /* Example prototypes: - void - bli_cntx_set_blkszs( + void bli_cntx_set_blkszs + ( + ind_t method = BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, + bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, + bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, + ... 
+ cntx_t* cntx + ); - ind_t method = BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, - ... - cntx_t* cntx ); - - void - bli_cntx_set_blkszs( - - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t scalr0, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t scalr1, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t scalr2, - ... - cntx_t* cntx ); + void bli_cntx_set_blkszs + ( + ind_t method != BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0, + bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1, + bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2, + ... + cntx_t* cntx + ); */ va_list args; dim_t i; @@ -414,7 +424,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bszid_t* bszids; blksz_t** blkszs; bszid_t* bmults; - dim_t* scalrs; + double* dsclrs; + double* msclrs; cntx_t* cntx; @@ -426,7 +437,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) ); bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); - scalrs = bli_malloc_intl( n_bs * sizeof( dim_t ) ); + dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -444,9 +456,9 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the address of the blksz_t object, and // - the bszid_t of the multiple we need to associate with // the blksz_t object. 
- const bszid_t bs_id = va_arg( args, bszid_t ); - blksz_t* blksz = va_arg( args, blksz_t* ); - const bszid_t bm_id = va_arg( args, bszid_t ); + bszid_t bs_id = va_arg( args, bszid_t ); + blksz_t* blksz = va_arg( args, blksz_t* ); + bszid_t bm_id = va_arg( args, bszid_t ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; @@ -464,18 +476,21 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the address of the blksz_t object, and // - the bszid_t of the multiple we need to associate with // the blksz_t object. - // - the scalar we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes. - const bszid_t bs_id = va_arg( args, bszid_t ); - blksz_t* blksz = va_arg( args, blksz_t* ); - const bszid_t bm_id = va_arg( args, bszid_t ); - const dim_t scalr = va_arg( args, dim_t ); + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = va_arg( args, bszid_t ); + blksz_t* blksz = va_arg( args, blksz_t* ); + bszid_t bm_id = va_arg( args, bszid_t ); + double dsclr = va_arg( args, double ); + double msclr = va_arg( args, double ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; blkszs[ i ] = blksz; bmults[ i ] = bm_id; - scalrs[ i ] = scalr; + dsclrs[ i ] = dsclr; + msclrs[ i ] = msclr; } } @@ -510,12 +525,12 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blksz_t* pointer, blocksize // multiple id, and blocksize scalar. - const bszid_t bs_id = bszids[ i ]; - const bszid_t bm_id = bmults[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; - blksz_t* blksz = blkszs[ i ]; + blksz_t* blksz = blkszs[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Copy the blksz_t object contents into the appropriate // location within the context's blksz_t array. 
Do the same @@ -534,14 +549,15 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blksz_t pointer, blocksize // multiple id, and blocksize scalar. - const bszid_t bs_id = bszids[ i ]; - const bszid_t bm_id = bmults[ i ]; - const dim_t scalr = scalrs[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; + double dsclr = dsclrs[ i ]; + double msclr = msclrs[ i ]; - blksz_t* blksz = blkszs[ i ]; - blksz_t* bmult = blkszs[ i ]; + blksz_t* blksz = blkszs[ i ]; + blksz_t* bmult = blkszs[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Copy the real domain values of the source blksz_t object into // the context, duplicating into the complex domain fields. @@ -550,20 +566,50 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - // The next steps apply only to cache blocksizes, and not register - // blocksizes (ie: they only apply to blocksizes for which the - // blocksize multiple id is different than the blocksize id) and - // only when the scalar provided is non-unit. - if ( bs_id != bm_id && scalr != 1 ) + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Scale the complex domain values in the blocksize object. - bli_blksz_scale_dt_by( 1, scalr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_dt_by( 1, scalr, BLIS_DCOMPLEX, cntx_blksz ); + // Scale the complex domain default blocksize values in the + // blocksize object. + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - // Finally, round the newly-scaled blocksizes down to their - // respective multiples. 
- bli_blksz_reduce_dt_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_reduce_dt_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } + } + + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the complex domain maximum blocksize values in the + // blocksize object. + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); + + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } } // Copy the blocksize multiple id into the context. @@ -575,7 +621,8 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... 
) bli_free_intl( blkszs ); bli_free_intl( bszids ); bli_free_intl( bmults ); - bli_free_intl( scalrs ); + bli_free_intl( dsclrs ); + bli_free_intl( msclrs ); } #endif @@ -668,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method, bli_cntx_set_method( method, cntx ); } -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); } -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_c( pack_t schema_c, +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ) +{ + bli_cntx_set_schema_c_panel( schema_c, cntx ); +} + +#if 0 +void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, cntx_t* cntx ) { - bli_cntx_set_schema_c( schema_c, cntx ); + bli_cntx_set_anti_pref( anti_pref, cntx ); } +#endif void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, dim_t m, dim_t n, dim_t k ) @@ -729,12 +784,20 @@ void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, } } - jc = bli_env_read_nway( "BLIS_JC_NT", jc ); - //pc = bli_env_read_nway( "BLIS_KC_NT", 1 ); - pc = 1; - ic = bli_env_read_nway( "BLIS_IC_NT", ic ); - jr = bli_env_read_nway( "BLIS_JR_NT", jr ); - ir = bli_env_read_nway( "BLIS_IR_NT", ir ); + pc = 1; + + dim_t jc_env = bli_env_read_nway( "BLIS_JC_NT", -1 ); + 
dim_t ic_env = bli_env_read_nway( "BLIS_IC_NT", -1 ); + dim_t jr_env = bli_env_read_nway( "BLIS_JR_NT", -1 ); + dim_t ir_env = bli_env_read_nway( "BLIS_IR_NT", -1 ); + + if (jc_env != -1 || ic_env != -1 || jr_env != -1 || ir_env != -1) + { + jc = (jc_env == -1 ? 1 : jc_env); + ic = (ic_env == -1 ? 1 : ic_env); + jr = (jr_env == -1 ? 1 : jr_env); + ir = (ir_env == -1 ? 1 : ir_env); + } #else @@ -867,6 +930,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +// ----------------------------------------------------------------------------- + bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) @@ -916,6 +1005,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. 
+ if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + // ----------------------------------------------------------------------------- void bli_cntx_print( cntx_t* cntx ) diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 38bff6720..a76cdd329 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -59,6 +59,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + bool_t anti_pref; + dim_t* thrloop; membrk_t* membrk; @@ -113,26 +115,30 @@ typedef struct cntx_s \ ( (cntx)->method ) -#define bli_cntx_schema_a( cntx ) \ +#define bli_cntx_schema_a_block( cntx ) \ \ - ( (cntx)->schema_a ) + ( (cntx)->schema_a_block ) -#define bli_cntx_schema_b( cntx ) \ +#define bli_cntx_schema_b_panel( cntx ) \ \ - ( (cntx)->schema_b ) + ( (cntx)->schema_b_panel ) -#define bli_cntx_schema_c( cntx ) \ +#define bli_cntx_schema_c_panel( cntx ) \ \ - ( (cntx)->schema_c ) + ( (cntx)->schema_c_panel ) -#define bli_cntx_membrk( cntx ) \ +#define bli_cntx_anti_pref( cntx ) \ \ - ( (cntx)->membrk ) + ( (cntx)->anti_pref ) #define bli_cntx_thrloop( cntx ) \ \ ( (cntx)->thrloop ) +#define bli_cntx_membrk( cntx ) \ +\ + ( (cntx)->membrk ) + #if 1 #define bli_cntx_jc_way( cntx ) \ \ @@ -211,24 +217,24 @@ typedef struct cntx_s (cntx_p)->method = _method; \ } -#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \ +#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \ { \ - (cntx_p)->schema_a = _schema_a; \ + (cntx_p)->schema_a_block = _schema_a_block; \ } -#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \ +#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \ { \ - (cntx_p)->schema_b = _schema_b; \ + (cntx_p)->schema_b_panel = _schema_b_panel; \ } -#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \ +#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \ { \ - (cntx_p)->schema_c = _schema_c; \ + (cntx_p)->schema_c_panel = _schema_c_panel; \ } -#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +#define 
bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \ { \ - (cntx_p)->membrk = _membrk; \ + (cntx_p)->anti_pref = _anti_pref; \ } #define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \ @@ -241,6 +247,11 @@ typedef struct cntx_s (cntx_p)->thrloop[ BLIS_KR ] = 1; \ } +#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +{ \ + (cntx_p)->membrk = _membrk; \ +} + // cntx_t query (complex) #define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \ @@ -323,13 +334,17 @@ typedef struct cntx_s \ bli_cntx_method( cntx ) -#define bli_cntx_get_pack_schema_a( cntx ) \ +#define bli_cntx_get_pack_schema_a_block( cntx ) \ \ - bli_cntx_schema_a( cntx ) + bli_cntx_schema_a_block( cntx ) -#define bli_cntx_get_pack_schema_b( cntx ) \ +#define bli_cntx_get_pack_schema_b_panel( cntx ) \ \ - bli_cntx_schema_b( cntx ) + bli_cntx_schema_b_panel( cntx ) + +#define bli_cntx_get_pack_schema_c_panel( cntx ) \ +\ + bli_cntx_schema_c_panel( cntx ) #define bli_cntx_get_membrk( cntx ) \ \ @@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); // l1vkr_t ker_id, // cntx_t* cntx ); //ind_t bli_cntx_get_ind_method( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ); +//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ); dim_t bli_cntx_get_num_threads( cntx_t* cntx ); dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ); @@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func, cntx_t* cntx ); void bli_cntx_set_ind_method( ind_t method, cntx_t* cntx ); -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx 
); -void bli_cntx_set_pack_schema_c( pack_t schema_c, - cntx_t* cntx ); +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ); +//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, +// cntx_t* cntx ); void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, @@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); @@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); // print function @@ -488,13 +518,13 @@ void bli_cntx_print( cntx_t* cntx ); // pointer is NULL. When initializing, the context address that should // be used (local or external) is assigned to cntx_p. 
-#define bli_cntx_init_local_if( opname, cntx, cntx_p ) \ +#define bli_cntx_init_local_if( opname, dt, cntx, cntx_p ) \ \ cntx_t _cntx_l; \ \ if ( bli_is_null( cntx ) ) \ { \ - PASTEMAC(opname,_cntx_init)( &_cntx_l ); \ + PASTEMAC(opname,_cntx_init)( dt, &_cntx_l ); \ cntx_p = &_cntx_l; \ } \ else \ @@ -510,13 +540,13 @@ void bli_cntx_print( cntx_t* cntx ); } -#define bli_cntx_init_local_if2( opname, suf, cntx, cntx_p ) \ +#define bli_cntx_init_local_if2( opname, suf, dt, cntx, cntx_p ) \ \ cntx_t _cntx_l; \ \ if ( bli_is_null( cntx ) ) \ { \ - PASTEMAC2(opname,suf,_cntx_init)( &_cntx_l ); \ + PASTEMAC2(opname,suf,_cntx_init)( dt, &_cntx_l ); \ cntx_p = &_cntx_l; \ } \ else \ diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 7f3f897d5..2ada1556e 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -94,48 +94,47 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { /* Example prototypes: - void - bli_gks_cntx_set_blkszs( + void bli_gks_cntx_set_blkszs + ( + ind_t method = BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, bszid_t bm0_id, + bszid_t bs1_id, bszid_t bm1_id, + bszid_t bs2_id, bszid_t bm2_id, + ... + cntx_t* cntx + ); - ind_t method = BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, bszid_t bm0_id, - bszid_t bs1_id, bszid_t bm1_id, - bszid_t bs2_id, bszid_t bm2_id, - ... - cntx_t* cntx ); - - void - bli_gks_cntx_set_blkszs( - - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, bszid_t bm0_id, dim_t scalr0, - bszid_t bs1_id, bszid_t bm1_id, dim_t scalr1, - bszid_t bs2_id, bszid_t bm2_id, dim_t scalr2, - ... - cntx_t* cntx ); + void bli_gks_cntx_set_blkszs + ( + ind_t method != BLIS_NAT, + dim_t n_bs, + bszid_t bs0_id, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0, + bszid_t bs1_id, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1, + bszid_t bs2_id, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2, + ... 
+ cntx_t* cntx + ); */ va_list args; dim_t i; bszid_t* bszids; bszid_t* bmults; - double* scalrs; + double* dsclrs; + double* msclrs; cntx_t* cntx; blksz_t* cntx_blkszs; bszid_t* cntx_bmults; - bszid_t bs_id; - bszid_t bm_id; - double scalr; // Allocate some temporary local arrays. bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); - scalrs = bli_malloc_intl( n_bs * sizeof( double ) ); + dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -152,8 +151,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the bszid_t of the blocksize we're about to process, // - the bszid_t of the multiple we need to associate with // the blksz_t object. - bs_id = va_arg( args, bszid_t ); - bm_id = va_arg( args, bszid_t ); + bszid_t bs_id = va_arg( args, bszid_t ); + bszid_t bm_id = va_arg( args, bszid_t ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; @@ -169,16 +168,19 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // - the bszid_t of the blocksize we're about to process, // - the bszid_t of the multiple we need to associate with // the blksz_t object. - // - the scalar we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes. - bs_id = va_arg( args, bszid_t ); - bm_id = va_arg( args, bszid_t ); - scalr = va_arg( args, double ); + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = va_arg( args, bszid_t ); + bszid_t bm_id = va_arg( args, bszid_t ); + double dsclr = va_arg( args, double ); + double msclr = va_arg( args, double ); // Store the values in our temporary arrays. 
bszids[ i ] = bs_id; bmults[ i ] = bm_id; - scalrs[ i ] = scalr; + dsclrs[ i ] = dsclr; + msclrs[ i ] = msclr; } } @@ -210,10 +212,10 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) for ( i = 0; i < n_bs; ++i ) { // Read the current blocksize id, blocksize multiple id. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Query the blocksizes (blksz_t) associated with bs_id and save // them directly into the appropriate location in the context's @@ -231,41 +233,75 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) { // Read the current blocksize id, blocksize multiple id, // and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - double scalr = scalrs[ i ]; + bszid_t bs_id = bszids[ i ]; + bszid_t bm_id = bmults[ i ]; + double dsclr = dsclrs[ i ]; + double msclr = msclrs[ i ]; - blksz_t blksz; - blksz_t bmult; + blksz_t blksz_l; + blksz_t bmult_l; - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + blksz_t* blksz = &blksz_l; + blksz_t* bmult = &bmult_l; + + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; // Query the blocksizes (blksz_t) associated with bs_id and bm_id // and use them to populate a pair of local blksz_t objects. - bli_gks_get_blksz( bs_id, &blksz ); - bli_gks_get_blksz( bm_id, &bmult ); + bli_gks_get_blksz( bs_id, blksz ); + bli_gks_get_blksz( bm_id, bmult ); // Copy the real domain values of the source blksz_t object into // the context, duplicating into the complex domain fields. 
- bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_FLOAT, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DOUBLE, cntx_blksz ); - bli_blksz_copy_dt( BLIS_FLOAT, &blksz, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, &blksz, BLIS_DCOMPLEX, cntx_blksz ); + bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz ); + bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - // The next steps apply only to cache blocksizes, and not register - // blocksizes (ie: they only apply to blocksizes for which the - // blocksize multiple id is different than the blocksize id) and - // only when the scalar provided is non-unit. - if ( bs_id != bm_id && scalr != 1.0 ) + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) { - // Scale the complex domain values in the blocksize object. - bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_dt_by( 1, (dim_t)scalr, BLIS_DCOMPLEX, cntx_blksz ); + // Scale the complex domain default blocksize values in the + // blocksize object. + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - // Finally, round the newly-scaled blocksizes down to their - // respective multiples. - bli_blksz_reduce_dt_to( BLIS_FLOAT, &bmult, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_reduce_dt_to( BLIS_DOUBLE, &bmult, BLIS_DCOMPLEX, cntx_blksz ); + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) 
Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } + } + + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the complex domain maximum blocksize values in the + // blocksize object. + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); + + if ( bs_id != bm_id ) + { + // Round the newly-scaled blocksizes down to their multiple. + // (Note that both the default and maximum blocksize values + // must be a multiple of the same blocksize multiple.) Also, + // note that this is only done when the blocksize id is not + // equal to the blocksize multiple id (ie: we don't round + // down scaled register blocksizes since they are their own + // multiples). + bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz ); + bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz ); + } } // Copy the blocksize multiple id into the context. @@ -276,7 +312,8 @@ void bli_gks_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // Free the temporary local arrays. 
bli_free_intl( bszids ); bli_free_intl( bmults ); - bli_free_intl( scalrs ); + bli_free_intl( dsclrs ); + bli_free_intl( msclrs ); } @@ -337,6 +374,18 @@ static func_t bli_gks_l3_ind_ukrs[BLIS_NUM_IND_METHODS] /* trsm_l */ { { NULL, BLIS_CTRSM4M1_L_UKERNEL, NULL, BLIS_ZTRSM4M1_L_UKERNEL, } }, /* trsm_u */ { { NULL, BLIS_CTRSM4M1_U_UKERNEL, NULL, BLIS_ZTRSM4M1_U_UKERNEL, } }, }, +/* 1m */ { +/* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM1M_UKERNEL, + BLIS_DGEMM_UKERNEL, BLIS_ZGEMM1M_UKERNEL, } }, +/* gemmtrsm_l */ { { NULL, BLIS_CGEMMTRSM1M_L_UKERNEL, + NULL, BLIS_ZGEMMTRSM1M_L_UKERNEL, } }, +/* gemmtrsm_u */ { { NULL, BLIS_CGEMMTRSM1M_U_UKERNEL, + NULL, BLIS_ZGEMMTRSM1M_U_UKERNEL, } }, +/* trsm_l */ { { NULL, BLIS_CTRSM1M_L_UKERNEL, + NULL, BLIS_ZTRSM1M_L_UKERNEL, } }, +/* trsm_u */ { { NULL, BLIS_CTRSM1M_U_UKERNEL, + NULL, BLIS_ZTRSM1M_U_UKERNEL, } }, + }, /* nat */ { /* gemm */ { { BLIS_SGEMM_UKERNEL, BLIS_CGEMM_UKERNEL, BLIS_DGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL, } }, @@ -557,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr, mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ]; bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref ); + + // Explicitly set the anti-preference to FALSE. + bli_cntx_set_anti_pref( FALSE, cntx ); } @@ -565,6 +617,8 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr, // -- packm structure-aware kernel structure ----------------------------------- // +// IF ENABLED: NEEDS UPDATING FOR 1M. + static func_t bli_gks_packm_struc_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { /* float (0) scomplex (1) double (2) dcomplex (3) */ diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index e66aafa63..06cbae587 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -61,8 +61,10 @@ void bli_memsys_init( void ) if ( bli_memsys_is_init == TRUE ) return; // Create and initialize a context for gemm so we have something - // to pass into bli_membrk_init_pools(). 
- bli_gemm_cntx_init( &cntx ); + // to pass into bli_membrk_init_pools(). We use BLIS_DOUBLE for + // the datatype, but the dt argument is actually only used when + // initializing contexts for induced methods. + bli_gemm_cntx_init( BLIS_DOUBLE, &cntx ); #ifdef BLIS_ENABLE_OPENMP _Pragma( "omp critical (mem)" ) diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 0d5992900..a7a69243e 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -877,6 +877,12 @@ bli_obj_width_stored( obj ) (obj).n_panel = n0; \ } +#define bli_obj_set_panel_dims( m0, n0, obj ) \ +{ \ + bli_obj_set_panel_length( m0, obj ); \ + bli_obj_set_panel_width( n0, obj ); \ +} + #define bli_obj_set_panel_dim( panel_dim, obj ) \ { \ (obj).pd = panel_dim; \ @@ -985,6 +991,7 @@ bli_obj_width_stored( obj ) #define bli_obj_induce_trans( obj ) \ { \ { \ + /* Induce transposition among basic fields. */ \ dim_t m_ = bli_obj_length( obj ); \ dim_t n_ = bli_obj_width( obj ); \ inc_t rs_ = bli_obj_row_stride( obj ); \ @@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj ) \ if ( bli_obj_is_upper_or_lower( obj ) ) \ bli_obj_toggle_uplo( obj ); \ +\ + /* Induce transposition among packed fields. */ \ + dim_t m_padded_ = bli_obj_padded_length( obj ); \ + dim_t n_padded_ = bli_obj_padded_width( obj ); \ + dim_t m_panel_ = bli_obj_panel_length( obj ); \ + dim_t n_panel_ = bli_obj_panel_width( obj ); \ +\ + bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \ + bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \ \ /* Note that this macro DOES NOT touch the transposition bit! 
If the calling code is using this macro to handle an object whose diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 50ddd5d1f..f0a208886 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -654,6 +654,19 @@ bli_is_io_packed( schema ) || \ bli_is_rpi_packed( schema ) ) +#define bli_is_1r_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ) + +#define bli_is_1e_packed( schema ) \ +\ + ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ) + +#define bli_is_1m_packed( schema ) \ +\ + ( bli_is_1r_packed( schema ) || \ + bli_is_1e_packed( schema ) ) + #define bli_is_nat_packed( schema ) \ \ ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ) diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 1069a40b4..de8dbf370 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -225,6 +225,43 @@ #include "bli_scal2jrpis.h" +// -- 1m-specific scalar macros -- + +#include "bli_invert1ms_mxn_diag.h" + +#include "bli_scal1ms_mxn.h" + +#include "bli_scal21ms_mxn_diag.h" +#include "bli_scal21ms_mxn_uplo.h" + +#include "bli_set1ms_mxn.h" +#include "bli_set1ms_mxn_diag.h" +#include "bli_set1ms_mxn_uplo.h" +#include "bli_seti01ms_mxn_diag.h" + +// 1e +#include "bli_copy1es.h" +#include "bli_copyj1es.h" + +#include "bli_invert1es.h" + +#include "bli_scal1es.h" + +#include "bli_scal21es.h" +#include "bli_scal2j1es.h" + +// 1r +#include "bli_copy1rs.h" +#include "bli_copyj1rs.h" + +#include "bli_invert1rs.h" + +#include "bli_scal1rs.h" + +#include "bli_scal21rs.h" +#include "bli_scal2j1rs.h" + + // -- Miscellaneous macros -- diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index d3548031c..1a120d5da 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -224,6 +224,10 @@ typedef dcomplex f77_dcomplex; - 1 0110 11: packed imag-only column panels 
- 1 0111 10: packed real+imag row panels - 1 0111 11: packed real+imag column panels + - 1 1000 10: packed by 1m expanded row panels + - 1 1000 11: packed by 1m expanded column panels + - 1 1001 10: packed by 1m reordered row panels + - 1 1001 11: packed by 1m reordered column panels 23 Packed panel order if upper-stored - 0 == forward order if upper - 1 == reverse order if upper @@ -329,6 +333,8 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) @@ -348,6 +354,10 @@ typedef dcomplex f77_dcomplex; #define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 @@ -469,13 +479,17 @@ typedef enum BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO, BLIS_PACKED_ROW_PANELS_RPI = 
BLIS_BITVAL_PACKED_ROW_PANELS_RPI, BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI, + BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, + BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R, } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the // schema pair for "4ms" (4m separated), because its bit value has // been reserved, even though we don't use it. -#define BLIS_NUM_PACK_SCHEMA_TYPES 8 +#define BLIS_NUM_PACK_SCHEMA_TYPES 10 // -- Pack order type -- @@ -575,6 +589,7 @@ typedef enum BLIS_4MH, BLIS_4M1B, BLIS_4M1A, + BLIS_1M, BLIS_NAT, } ind_t; @@ -960,9 +975,11 @@ typedef struct cntx_s opid_t family; ind_t method; - pack_t schema_a; - pack_t schema_b; - pack_t schema_c; + pack_t schema_a_block; + pack_t schema_b_panel; + pack_t schema_c_panel; + + bool_t anti_pref; dim_t thrloop[ BLIS_NUM_LOOPS ]; diff --git a/frame/include/level0/1e/bli_copy1es.h b/frame/include/level0/1e/bli_copy1es.h new file mode 100644 index 000000000..22eec1565 --- /dev/null +++ b/frame/include/level0/1e/bli_copy1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPY1ES_H +#define BLIS_COPY1ES_H + +// copy1es + +#define bli_ccopy1es( a, bri, bir ) \ +{ \ + bli_ccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ +} + +#define bli_zcopy1es( a, bri, bir ) \ +{ \ + bli_zcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_copyj1es.h b/frame/include/level0/1e/bli_copyj1es.h new file mode 100644 index 000000000..14c401354 --- /dev/null +++ b/frame/include/level0/1e/bli_copyj1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYJ1ES_H +#define BLIS_COPYJ1ES_H + +// copyj1es + +#define bli_ccopyj1es( a, bri, bir ) \ +{ \ + bli_ccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ +} + +#define bli_zcopyj1es( a, bri, bir ) \ +{ \ + bli_zcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_invert1es.h b/frame/include/level0/1e/bli_invert1es.h new file mode 100644 index 000000000..2fe5c3f24 --- /dev/null +++ b/frame/include/level0/1e/bli_invert1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1ES_H +#define BLIS_INVERT1ES_H + +// invert1es + +#define bli_cinvert1es( bri, bir ) \ +{ \ + bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ + bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ +} + +#define bli_zinvert1es( bri, bir ) \ +{ \ + bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ + bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal1es.h b/frame/include/level0/1e/bli_scal1es.h new file mode 100644 index 000000000..46ee20a0d --- /dev/null +++ b/frame/include/level0/1e/bli_scal1es.h @@ -0,0 +1,53 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1ES_H +#define BLIS_SCAL1ES_H + +// scal1es + +#define bli_cscal1es( a, yri, yir ) \ +{ \ + bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ + bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal1es( a, yri, yir ) \ +{ \ + bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal21es.h b/frame/include/level0/1e/bli_scal21es.h new file mode 100644 index 000000000..7e0a752bc --- /dev/null +++ b/frame/include/level0/1e/bli_scal21es.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL21ES_H +#define BLIS_SCAL21ES_H + +// scal21es + +#define bli_cscal21es( a, x, yri, yir ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal21es( a, x, yri, yir ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#define bli_scscal21es( a, x, yri, yir ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_dzscal21es( a, x, yri, yir ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1e/bli_scal2j1es.h b/frame/include/level0/1e/bli_scal2j1es.h new file mode 100644 index 000000000..b10004f61 --- /dev/null +++ b/frame/include/level0/1e/bli_scal2j1es.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL2J1ES_H +#define BLIS_SCAL2J1ES_H + +// scal2j1es + +#define bli_cscal2j1es( a, x, yri, yir ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_zscal2j1es( a, x, yri, yir ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#define bli_scscal2j1es( a, x, yri, yir ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_creal(yri), bli_cimag(yri) ); \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_creal(yir), bli_cimag(yir) ); \ +} + +#define bli_dzscal2j1es( a, x, yri, yir ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ +} + +#endif + diff --git a/frame/include/level0/1m/bli_invert1ms_mxn_diag.h b/frame/include/level0/1m/bli_invert1ms_mxn_diag.h new file mode 100644 index 000000000..7abf891ef --- /dev/null +++ b/frame/include/level0/1m/bli_invert1ms_mxn_diag.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1MS_MXN_DIAG_H +#define BLIS_INVERT1MS_MXN_DIAG_H + +// invert1ms_mxn_diag + +#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + scomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. 
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal1ms_mxn.h b/frame/include/level0/1m/bli_scal1ms_mxn.h new file mode 100644 index 000000000..a0a9c595f --- /dev/null +++ b/frame/include/level0/1m/bli_scal1ms_mxn.h @@ -0,0 +1,124 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1MS_MXN_H +#define BLIS_SCAL1MS_MXN_H + +// scal1ms_mxn + +#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_ri = y; \ + scomplex* restrict y_ir = y + ld_y/2; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_cscal1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_r = y_cast; \ + float* restrict y_i = y_cast + ld_y; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_cscal1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. 
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_ri = y; \ + dcomplex* restrict y_ir = y + ld_y/2; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_zscal1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, + which steps in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_r = y_cast; \ + double* restrict y_i = y_cast + ld_y; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + bli_zscal1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_diag.h b/frame/include/level0/1m/bli_scal21ms_mxn_diag.h new file mode 100644 index 000000000..a8975f731 --- /dev/null +++ b/frame/include/level0/1m/bli_scal21ms_mxn_diag.h @@ -0,0 +1,126 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL21MS_MXN_DIAG_H +#define BLIS_SCAL21MS_MXN_DIAG_H + +// scal21ms_mxn_diag: for each of the min(m,n) diagonal positions, invoke scal21es (1e schema) or scal21rs (1r schema) with the diagonal element of x, the scalar *a, and the two storage locations of the matching diagonal element of y + +#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y; \ + scomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal21es( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast; \ + float* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_scscal21rs( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y; \ + dcomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal21es( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast; \ + double* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dzscal21rs( *(x + i*rs_x + i*cs_x), \ + *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h b/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h new file mode 100644 index 000000000..ccd5d4ef8 --- /dev/null +++ b/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h @@ -0,0 +1,296 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL21MS_MXN_UPLO_H +#define BLIS_SCAL21MS_MXN_UPLO_H + +// scal21ms_mxn_uplo: for the lower (i >= j) or upper (i <= j) triangle of an m x m region, invoke scal21es/scal21rs, or scal2j1es/scal2j1rs when bli_is_conj( conjx ) holds, with the scalar *a, x(i,j), and the two storage locations of y(i,j) + +#define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_ri = y; \ + scomplex* restrict y_ir = y + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_r = y_cast; \ + float* restrict y_i = y_cast + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_cscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_cscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + } \ +} + +#define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_ri = y; \ + dcomplex* restrict y_ir = y + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2j1es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal21es( *(a), \ + *(x + i*rs_x + j*cs_x), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_r = y_cast; \ + double* restrict y_i = y_cast + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + if ( bli_is_conj( conjx ) ) \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal2j1rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_noconj( conjx ) ) */ \ + { \ + for ( j = 0; j < m; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zscal21rs( *(a), \ + *(x + i*rs_x + j*cs_x ), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn.h b/frame/include/level0/1m/bli_set1ms_mxn.h new file mode 100644 index 000000000..9f701c919 --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn.h @@ -0,0 +1,164 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_H +#define BLIS_SET1MS_MXN_H + +// set1ms_mxn: for every element of the m x n region of y that begins at (offm,offn), invoke copy1es (1e schema) or copy1rs (1r schema) with the scalar *a and the element's two storage locations + +#define bli_cset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + inc_t offm_local = offm; \ + inc_t offn_local = offn; \ + dim_t m_local = m; \ + dim_t n_local = n; \ + inc_t rs_y1 = rs_y; \ + inc_t cs_y1 = cs_y; \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ + dim_t i, j; \ +\ + /* Optimization: The loops walk through y with unit stride if y is + column-stored. If y is row-stored, swap the dimensions and strides + to preserve unit stride movement. */ \ + if ( cs_y == 1 ) \ + { \ + bli_swap_incs( offm_local, offn_local ); \ + bli_swap_dims( m_local, n_local ); \ + bli_swap_incs( rs_y1, cs_y1 ); \ + bli_swap_incs( rs_y2, cs_y2 ); \ + } \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1; \ + scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1 + ld_y/2; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_off_ri + i*rs_y1 + j*cs_y1), \ + *(y_off_ir + i*rs_y1 + j*cs_y1) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2 + ld_y; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + j*cs_y2), \ + *(y_off_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#define bli_zset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + inc_t offm_local = offm; \ + inc_t offn_local = offn; \ + dim_t m_local = m; \ + dim_t n_local = n; \ + inc_t rs_y1 = rs_y; \ + inc_t cs_y1 = cs_y; \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ + dim_t i, j; \ +\ + /* Optimization: The loops walk through y with unit stride if y is + column-stored. If y is row-stored, swap the dimensions and strides + to preserve unit stride movement. */ \ + if ( cs_y == 1 ) \ + { \ + bli_swap_incs( offm_local, offn_local ); \ + bli_swap_dims( m_local, n_local ); \ + bli_swap_incs( rs_y1, cs_y1 ); \ + bli_swap_incs( rs_y2, cs_y2 ); \ + } \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1; \ + dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 \ + + (offn_local )*cs_y1 + ld_y/2; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_off_ri + i*rs_y1 + j*cs_y1), \ + *(y_off_ir + i*rs_y1 + j*cs_y1) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm_local )*rs_y2 \ + + (offn_local )*cs_y2 + ld_y; \ +\ + for ( j = 0; j < n_local; ++j ) \ + for ( i = 0; i < m_local; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + j*cs_y2), \ + *(y_off_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn_diag.h b/frame/include/level0/1m/bli_set1ms_mxn_diag.h new file mode 100644 index 000000000..63262dd18 --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn_diag.h @@ -0,0 +1,130 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_DIAG_H +#define BLIS_SET1MS_MXN_DIAG_H + +// set1ms_mxn_diag: for each of the min(m,n) diagonal positions of the region of y that begins at (offm,offn), invoke copy1es (1e schema) or copy1rs (1r schema) with the scalar *a and the element's two storage locations + +#define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + scomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + + (offn )*cs_y; \ + dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + + (offn )*cs_y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_off_ri + i*rs_y + i*cs_y), \ + *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2; \ + double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + + (offn )*cs_y2 + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_off_r + i*rs_y2 + i*cs_y2), \ + *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_set1ms_mxn_uplo.h b/frame/include/level0/1m/bli_set1ms_mxn_uplo.h new file mode 100644 index 000000000..e89f9a34d --- /dev/null +++ b/frame/include/level0/1m/bli_set1ms_mxn_uplo.h @@ -0,0 +1,198 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1MS_MXN_UPLO_H +#define BLIS_SET1MS_MXN_UPLO_H + +// set1ms_mxn_uplo: after advancing y by |diagoff| column strides (diagoff > 0) or row strides (otherwise), invoke copy1es (1e schema) or copy1rs (1r schema) with the scalar *a on the lower (i >= j) or upper (i <= j) part of the m x n region + +#define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + doff_t diagoff_abs = bli_abs( diagoff ); \ + inc_t offdiag_inc; \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + /* Set the off-diagonal increment.
*/ \ + if ( diagoff > 0 ) offdiag_inc = cs_y; \ + else /* if ( diagoff <= 0 ) */ offdiag_inc = rs_y; \ +\ + scomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ + scomplex* restrict y_ri = y0; \ + scomplex* restrict y_ir = y0 + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_ccopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + /* Set the off-diagonal increment. */ \ + if ( diagoff > 0 ) offdiag_inc = cs_y2; \ + else /* if ( diagoff <= 0 ) */ offdiag_inc = rs_y2; \ +\ + float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ + float* restrict y_r = y0; \ + float* restrict y_i = y0 + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_ccopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ +} + +#define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ +{ \ + doff_t diagoff_abs = bli_abs( diagoff ); \ + inc_t offdiag_inc; \ + dim_t i, j; \ +\ + /* Handle 1e and 1r separately.
*/ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + /* Set the off-diagonal increment. */ \ + if ( diagoff > 0 ) offdiag_inc = cs_y; \ + else /* if ( diagoff <= 0 ) */ offdiag_inc = rs_y; \ +\ + dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ + dcomplex* restrict y_ri = y0; \ + dcomplex* restrict y_ir = y0 + ld_y/2; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zcopy1es( *(a), \ + *(y_ri + i*rs_y + j*cs_y), \ + *(y_ir + i*rs_y + j*cs_y) ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + /* Set the off-diagonal increment.
*/ \ + if ( diagoff > 0 ) offdiag_inc = cs_y2; \ + else /* if ( diagoff <= 0 ) */ offdiag_inc = rs_y2; \ +\ + double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ + double* restrict y_r = y0; \ + double* restrict y_i = y0 + ld_y; \ +\ + if ( bli_is_lower( uplo ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = j; i < m; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + else /* if ( bli_is_upper( uplo ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < j + 1; ++i ) \ + { \ + bli_zcopy1rs( *(a), \ + *(y_r + i*rs_y2 + j*cs_y2), \ + *(y_i + i*rs_y2 + j*cs_y2) ); \ + } \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1m/bli_seti01ms_mxn_diag.h b/frame/include/level0/1m/bli_seti01ms_mxn_diag.h new file mode 100644 index 000000000..39be51ca5 --- /dev/null +++ b/frame/include/level0/1m/bli_seti01ms_mxn_diag.h @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SETI01MS_MXN_DIAG_H +#define BLIS_SETI01MS_MXN_DIAG_H + +// seti01ms_mxn_diag: for each of the min(m,n) diagonal positions of y, invoke seti0s on the first location and setr0s on the one ld_y/2 complex elements later (1e schema), or set0s on the real-typed location ld_y real elements past the start (1r schema) + +#define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + scomplex* restrict y_off_ri = y; \ + scomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ + bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values.
*/ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + float* restrict y_cast = ( float* )y; \ + float* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ +{ \ + dim_t min_m_n = bli_min( m, n ); \ + dim_t i; \ +\ + /* Handle 1e and 1r separately. */ \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + dcomplex* restrict y_off_ri = y; \ + dcomplex* restrict y_off_ir = y + ld_y/2; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ + bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema ) ) */ \ + { \ + inc_t rs_y2 = rs_y; \ + inc_t cs_y2 = cs_y; \ +\ + /* Scale the non-unit stride by two for the 1r loop, which steps + in units of real (not complex) values. */ \ + if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ + else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ +\ + double* restrict y_cast = ( double* )y; \ + double* restrict y_off_i = y_cast + ld_y; \ +\ + for ( i = 0; i < min_m_n; ++i ) \ + { \ + bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ + } \ + } \ +} + +#endif diff --git a/frame/include/level0/1r/bli_copy1rs.h b/frame/include/level0/1r/bli_copy1rs.h new file mode 100644 index 000000000..d60cf9d86 --- /dev/null +++ b/frame/include/level0/1r/bli_copy1rs.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPY1RS_H +#define BLIS_COPY1RS_H + +// copy1rs: hand bli_?real(a) and bli_?imag(a), together with br and bi, to bli_?copyris + +#define bli_ccopy1rs( a, br, bi ) \ +{ \ + bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ +} + +#define bli_zcopy1rs( a, br, bi ) \ +{ \ + bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_copyj1rs.h b/frame/include/level0/1r/bli_copyj1rs.h new file mode 100644 index 000000000..8cc82f558 --- /dev/null +++ b/frame/include/level0/1r/bli_copyj1rs.h @@ -0,0 +1,51 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries.
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYJ1RS_H +#define BLIS_COPYJ1RS_H + +// copyj1rs + +#define bli_ccopyj1rs( a, br, bi ) \ +{ \ + bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ +} + +#define bli_zcopyj1rs( a, br, bi ) \ +{ \ + bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_invert1rs.h b/frame/include/level0/1r/bli_invert1rs.h new file mode 100644 index 000000000..3b3a6950c --- /dev/null +++ b/frame/include/level0/1r/bli_invert1rs.h @@ -0,0 +1,43 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_INVERT1RS_H +#define BLIS_INVERT1RS_H + +// invert1rs + +#define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) +#define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) + +#endif diff --git a/frame/include/level0/1r/bli_scal1rs.h b/frame/include/level0/1r/bli_scal1rs.h new file mode 100644 index 000000000..ec65ab664 --- /dev/null +++ b/frame/include/level0/1r/bli_scal1rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL1RS_H +#define BLIS_SCAL1RS_H + +// scal1rs + +#define bli_cscal1rs( a, yr, yi ) \ +{ \ + bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ +} + +#define bli_zscal1rs( a, yr, yi ) \ +{ \ + bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ +} + +#define bli_scscal1rs( a, yr, yi ) \ +{ \ + bli_scscalris( bli_sreal(a), bli_simag(a), yr, yi ); \ +} + +#define bli_dzscal1rs( a, yr, yi ) \ +{ \ + bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_scal21rs.h b/frame/include/level0/1r/bli_scal21rs.h new file mode 100644 index 000000000..44d4f083f --- /dev/null +++ b/frame/include/level0/1r/bli_scal21rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL21RS_H +#define BLIS_SCAL21RS_H + +// scal21rs + +#define bli_cscal21rs( a, x, yr, yi ) \ +{ \ + bli_cscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_zscal21rs( a, x, yr, yi ) \ +{ \ + bli_zscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#define bli_scscal21rs( a, x, yr, yi ) \ +{ \ + bli_scscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_dzscal21rs( a, x, yr, yi ) \ +{ \ + bli_dzscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#endif + diff --git a/frame/include/level0/1r/bli_scal2j1rs.h b/frame/include/level0/1r/bli_scal2j1rs.h new file mode 100644 index 000000000..6a356932f --- /dev/null +++ b/frame/include/level0/1r/bli_scal2j1rs.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2J1RS_H +#define BLIS_SCAL2J1RS_H + +// scal2j1rs + +#define bli_cscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_zscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#define bli_scscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_scscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ +} + +#define bli_dzscal2j1rs( a, x, yr, yi ) \ +{ \ + bli_dzscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ +} + +#endif + diff --git a/frame/include/level0/bli_absq2s.h b/frame/include/level0/bli_absq2s.h index b6d7766df..9dcdad06f 100644 --- a/frame/include/level0/bli_absq2s.h +++ b/frame/include/level0/bli_absq2s.h @@ -41,27 +41,27 @@ // - The first char encodes the type of x. // - The second char encodes the type of a. 
-#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) -#define bli_dsabsq2s( x, a ) bli_sabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_csabsq2s( x, a ) bli_sabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_zsabsq2s( x, a ) bli_sabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) +#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) +#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) +#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } +#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_sdabsq2s( x, a ) bli_dabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_cdabsq2s( x, a ) bli_dabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_zdabsq2s( x, a ) bli_dabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) +#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) +#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) +#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } +#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX -#define bli_scabsq2s( x, a ) bli_cabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabsq2s( x, a ) bli_cabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), 
bli_cimag(a) ) -#define bli_zcabsq2s( x, a ) bli_cabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) +#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_szabsq2s( x, a ) bli_zabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabsq2s( x, a ) bli_zabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabsq2s( x, a ) bli_zabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX diff --git a/frame/include/level0/bli_abval2s.h b/frame/include/level0/bli_abval2s.h index 7e0556940..6e0480790 100644 --- a/frame/include/level0/bli_abval2s.h +++ b/frame/include/level0/bli_abval2s.h @@ -43,25 +43,25 @@ #ifndef BLIS_ENABLE_C99_COMPLEX -#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) -#define bli_dsabval2s( x, a ) bli_sabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_csabval2s( x, a ) bli_sabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) -#define bli_zsabval2s( x, a ) bli_sabval2ris( 
bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) +#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) +#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) +#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } +#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } -#define bli_sdabval2s( x, a ) bli_dabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_cdabval2s( x, a ) bli_dabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) -#define bli_zdabval2s( x, a ) bli_dabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) +#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) +#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) +#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } +#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } -#define bli_scabval2s( x, a ) bli_cabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) -#define bli_dcabval2s( x, a ) bli_cabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_zcabval2s( x, a ) bli_cabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) +#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) +#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) -#define bli_szabval2s( x, a ) bli_zabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_dzabval2s( x, a ) bli_zabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_czabval2s( x, a ) bli_zabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) -#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) +#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX diff --git a/frame/ind/bli_ind.c b/frame/ind/bli_ind.c index e715b2aad..f0aec685b 100644 --- a/frame/ind/bli_ind.c +++ b/frame/ind/bli_ind.c @@ -45,6 +45,7 @@ static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = /* 4mh */ "4mh", /* 4m1b */ "4m1b", /* 4m1a */ "4m1a", +/* 1m */ "1m", /* nat */ "native", }; @@ -56,10 +57,12 @@ void bli_ind_init( void ) if ( bli_ind_is_initialized() ) return; #ifdef BLIS_ENABLE_INDUCED_SCOMPLEX - bli_ind_enable_dt( BLIS_4M1A, BLIS_SCOMPLEX ); + //bli_ind_enable_dt( BLIS_4M1A, BLIS_SCOMPLEX ); + bli_ind_enable_dt( BLIS_1M, BLIS_SCOMPLEX ); #endif #ifdef BLIS_ENABLE_INDUCED_DCOMPLEX - bli_ind_enable_dt( BLIS_4M1A, BLIS_DCOMPLEX ); + //bli_ind_enable_dt( BLIS_4M1A, BLIS_DCOMPLEX ); + bli_ind_enable_dt( BLIS_1M, BLIS_DCOMPLEX ); #endif // Mark API as initialized. 
diff --git a/frame/ind/bli_ind.h b/frame/ind/bli_ind.h index b34941d91..e0ceb383b 100644 --- a/frame/ind/bli_ind.h +++ b/frame/ind/bli_ind.h @@ -44,6 +44,9 @@ // level-3 typed APIs #include "bli_l3_ind_tapi.h" +// level-3 misc. optimizations +#include "bli_l3_ind_opt.h" + // level-3 cntx initialization #include "bli_gemmind_cntx.h" #include "bli_trsmind_cntx.h" diff --git a/frame/ind/bli_l3_ind.c b/frame/ind/bli_l3_ind.c index e2d1a0f86..e694f5384 100644 --- a/frame/ind/bli_l3_ind.c +++ b/frame/ind/bli_l3_ind.c @@ -51,6 +51,8 @@ static void* bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = NULL, NULL, NULL, NULL, NULL }, /* 4m1 */ { bli_gemm4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1, bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 }, +/* 1m */ { bli_gemm1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m, + bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m }, /* nat */ { bli_gemmnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat, bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat }, }; @@ -76,6 +78,8 @@ static bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, +/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, + {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} }, }; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index a484cf1a1..5b7a70c3c 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -34,23 +34,35 @@ #include "blis.h" -typedef void (*cntx_ft)( 
cntx_t* cntx ); +typedef void (*cntx_init_ft)( num_t dt, cntx_t* cntx ); +typedef void (*cntx_finalize_ft)( cntx_t* cntx ); -static void* bli_gemmind_cntx_fp[BLIS_NUM_IND_METHODS][2] = +static void* bli_gemmind_cntx_init_fp[BLIS_NUM_IND_METHODS] = { - /* _cntx_init _cntx_finalize */ -/* 3mh */ { bli_gemm3mh_cntx_init, bli_gemm3mh_cntx_finalize }, -/* 3m3 */ { bli_gemm3m3_cntx_init, bli_gemm3m3_cntx_finalize }, -/* 3m2 */ { bli_gemm3m2_cntx_init, bli_gemm3m2_cntx_finalize }, -/* 3m1 */ { bli_gemm3m1_cntx_init, bli_gemm3m1_cntx_finalize }, -/* 4mh */ { bli_gemm4mh_cntx_init, bli_gemm4mh_cntx_finalize }, -/* 4mb */ { bli_gemm4mb_cntx_init, bli_gemm4mb_cntx_finalize }, -/* 4m1 */ { bli_gemm4m1_cntx_init, bli_gemm4m1_cntx_finalize }, -/* nat */ { bli_gemmnat_cntx_init, bli_gemmnat_cntx_finalize } +/* 3mh */ bli_gemm3mh_cntx_init, +/* 3m3 */ bli_gemm3m3_cntx_init, +/* 3m2 */ bli_gemm3m2_cntx_init, +/* 3m1 */ bli_gemm3m1_cntx_init, +/* 4mh */ bli_gemm4mh_cntx_init, +/* 4mb */ bli_gemm4mb_cntx_init, +/* 4m1 */ bli_gemm4m1_cntx_init, +/* 1m */ bli_gemm1m_cntx_init, +/* nat */ bli_gemmnat_cntx_init +}; + +static void* bli_gemmind_cntx_finalize_fp[BLIS_NUM_IND_METHODS] = +{ +/* 3mh */ bli_gemm3mh_cntx_finalize, +/* 3m3 */ bli_gemm3m3_cntx_finalize, +/* 3m2 */ bli_gemm3m2_cntx_finalize, +/* 3m1 */ bli_gemm3m1_cntx_finalize, +/* 4mh */ bli_gemm4mh_cntx_finalize, +/* 4mb */ bli_gemm4mb_cntx_finalize, +/* 4m1 */ bli_gemm4m1_cntx_finalize, +/* 1m */ bli_gemm1m_cntx_finalize, +/* nat */ bli_gemmnat_cntx_finalize }; -#define BLIS_CNTX_INIT_INDEX 0 -#define BLIS_CNTX_FINALIZE_INDEX 1 // ----------------------------------------------------------------------------- @@ -62,7 +74,7 @@ void bli_gemmind_cntx_init_avail( num_t dt, cntx_t* cntx ) { ind_t method = bli_ind_oper_find_avail( BLIS_GEMM, dt ); - bli_gemmind_cntx_init( method, cntx ); + bli_gemmind_cntx_init( method, dt, cntx ); } void bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ) @@ -77,16 +89,16 @@ void 
bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ) // Execute the context initialization/finalization function associated // with a given induced method. -void bli_gemmind_cntx_init( ind_t method, cntx_t* cntx ) +void bli_gemmind_cntx_init( ind_t method, num_t dt, cntx_t* cntx ) { - cntx_ft func = bli_gemmind_cntx_init_get_func( method ); + cntx_init_ft func = bli_gemmind_cntx_init_get_func( method ); - func( cntx ); + func( dt, cntx ); } void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ) { - cntx_ft func = bli_gemmind_cntx_finalize_get_func( method ); + cntx_finalize_ft func = bli_gemmind_cntx_finalize_get_func( method ); func( cntx ); } @@ -95,17 +107,17 @@ void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ) void* bli_gemmind_cntx_init_get_func( ind_t method ) { - return bli_gemmind_cntx_fp[ method ][ BLIS_CNTX_INIT_INDEX ]; + return bli_gemmind_cntx_init_fp[ method ]; } void* bli_gemmind_cntx_finalize_get_func( ind_t method ) { - return bli_gemmind_cntx_fp[ method ][ BLIS_CNTX_FINALIZE_INDEX ]; + return bli_gemmind_cntx_finalize_fp[ method ]; } // ----------------------------------------------------------------------------- -void bli_gemm3m1_cntx_init( cntx_t* cntx ) +void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M1; @@ -122,23 +134,25 @@ void bli_gemm3m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 3.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 3.0, 3.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -151,7 +165,7 @@ void bli_gemm3m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3m2_cntx_init( cntx_t* cntx ) +void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M2; @@ -168,23 +182,25 @@ void bli_gemm3m2_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 3.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 3.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 3.0, 3.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 3.0, 3.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -197,7 +213,7 @@ void bli_gemm3m2_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3m3_cntx_init( cntx_t* cntx ) +void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M3; @@ -214,23 +230,25 @@ void bli_gemm3m3_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 3.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 3.0, 3.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - BLIS_PACKED_COL_PANELS_3MS, - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MS, cntx ); } void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -238,15 +256,15 @@ void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. 
if ( stage == 0 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); } } @@ -256,7 +274,7 @@ void bli_gemm3m3_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm3mh_cntx_init( cntx_t* cntx ) +void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3MH; @@ -273,23 +291,25 @@ void bli_gemm3mh_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -297,18 +317,18 @@ void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RPI, - BLIS_PACKED_COL_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx ); } } @@ -318,7 +338,7 @@ void bli_gemm3mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4m1_cntx_init( cntx_t* cntx ) +void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1A; @@ -335,23 +355,25 @@ void bli_gemm4m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 2.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -364,7 +386,7 @@ void bli_gemm4m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4mb_cntx_init( cntx_t* cntx ) +void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1B; @@ -381,23 +403,25 @@ void bli_gemm4mb_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 2.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 2.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 2.0, 2.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -410,7 +434,7 @@ void bli_gemm4mb_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemm4mh_cntx_init( cntx_t* cntx ) +void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4MH; @@ -427,23 +451,25 @@ void bli_gemm4mh_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 1.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 1.0, 1.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -451,23 +477,23 @@ void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. 
if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 3 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } } @@ -477,9 +503,126 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_gemmnat_cntx_init( cntx_t* cntx ) +void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) { - bli_gemm_cntx_init( cntx ); + // Default to context for block-panel algorithm. + bli_gemm1mbp_cntx_init( dt, cntx ); +} + +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, FALSE, cntx ); +} + +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, TRUE, cntx ); +} + +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ) +{ + const ind_t method = BLIS_1M; + + // Clear the context fields. + bli_cntx_obj_clear( cntx ); + + // Initialize the context with the current architecture's native + // level-3 gemm micro-kernel, and its output preferences. 
+ bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); + + // Initialize the context with the virtual micro-kernel associated with + // the current induced method. + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); + + // Initialize the context with packm-related kernels. + bli_packm_cntx_init( dt, cntx ); + + // Initialize the blocksizes according to the micro-kernel preference as + // well as the algorithm. + if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // This branch is used for algorithms 1m_c_bp, 1m_r_pb. + + // Set the pack_t schemas for the c_bp or r_pb algorithms. + if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 2.0, 2.0, // halve mc... + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + } + else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // This branch is used for algorithms 1m_r_bp, 1m_c_pb. + + // Set the pack_t schemas for the r_bp or c_pb algorithms. 
+ if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, // halve nc... + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + } + + // Set the anti-preference field to TRUE when executing a panel-block + // algorithm, and FALSE otherwise. This will cause higher-level generic + // code to establish (if needed) disagreement between the storage of C and + // the micro-kernel output preference so that the two will come back into + // agreement in the panel-block macro-kernel (which is implemented in terms + // of the block-panel macro-kernel with some induced transpositions).
+ bli_cntx_set_anti_pref( is_pb, cntx ); +} + +void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ) +{ +} + +void bli_gemm1m_cntx_finalize( cntx_t* cntx ) +{ +} + +// ----------------------------------------------------------------------------- + +void bli_gemmnat_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm_cntx_init( dt, cntx ); } void bli_gemmnat_cntx_stage( dim_t stage, cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_gemmind_cntx.h b/frame/ind/cntx/bli_gemmind_cntx.h index c70da7b36..ea47968b1 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.h +++ b/frame/ind/cntx/bli_gemmind_cntx.h @@ -32,67 +32,51 @@ */ -#if 0 -// -// Generate prototypes for _cntx_init(), _cntx_stage(), and _cntx_finalize() -// for each induced method (including native execution) based on gemm. -// - -#undef GENPROT -#define GENPROT( opname, imeth ) \ -\ -void PASTEMAC2(opname,imeth,_cntx_init)( void ); \ -void PASTEMAC2(opname,imeth,_cntx_stage)( dim_t stage, cntx_t* cntx ); \ -void PASTEMAC2(opname,imeth,_cntx_finalize)( void ); - -GENPROT( gemm, nat ) -GENPROT( gemm, 3mh ) -GENPROT( gemm, 3m3 ) -GENPROT( gemm, 3m2 ) -GENPROT( gemm, 3m1 ) -GENPROT( gemm, 4mh ) -GENPROT( gemm, 4mb ) -GENPROT( gemm, 4m1 ) -#endif - -void bli_gemmnat_cntx_init( cntx_t* cntx ); +void bli_gemmnat_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemmnat_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemmnat_cntx_finalize( cntx_t* cntx ); -void bli_gemm3mh_cntx_init( cntx_t* cntx ); +void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3mh_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m3_cntx_init( cntx_t* cntx ); +void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m3_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m2_cntx_init( cntx_t* cntx ); +void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx ); void 
bli_gemm3m2_cntx_finalize( cntx_t* cntx ); -void bli_gemm3m1_cntx_init( cntx_t* cntx ); +void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm3m1_cntx_finalize( cntx_t* cntx ); -void bli_gemm4mh_cntx_init( cntx_t* cntx ); +void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4mh_cntx_finalize( cntx_t* cntx ); -void bli_gemm4mb_cntx_init( cntx_t* cntx ); +void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4mb_cntx_finalize( cntx_t* cntx ); -void bli_gemm4m1_cntx_init( cntx_t* cntx ); +void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4m1_cntx_finalize( cntx_t* cntx ); +void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ); +void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ); +void bli_gemm1m_cntx_finalize( cntx_t* cntx ); + // ----------------------------------------------------------------------------- void bli_gemmind_cntx_init_avail( num_t dt, cntx_t* cntx ); void bli_gemmind_cntx_finalize_avail( num_t dt, cntx_t* cntx ); -void bli_gemmind_cntx_init( ind_t method, cntx_t* cntx ); +void bli_gemmind_cntx_init( ind_t method, num_t dt, cntx_t* cntx ); void bli_gemmind_cntx_finalize( ind_t method, cntx_t* cntx ); void* bli_gemmind_cntx_init_get_func( ind_t method ); diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 85212ba90..96f9add60 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -36,7 +36,7 @@ // ----------------------------------------------------------------------------- -void bli_trsm3m1_cntx_init( cntx_t* cntx ) 
+void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_3M1; @@ -57,23 +57,25 @@ void bli_trsm3m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. - bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 3.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 3.0, 3.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) @@ -82,7 +84,7 @@ void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsm4m1_cntx_init( cntx_t* cntx ) +void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) { const ind_t method = BLIS_4M1A; @@ -103,23 +105,25 @@ void bli_trsm4m1_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); // Initialize the context with packm-related kernels. - bli_packm_cntx_init( cntx ); + bli_packm_cntx_init( dt, cntx ); // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. 
- bli_gks_cntx_set_blkszs( method, 6, - BLIS_NC, BLIS_NR, 1.0, - BLIS_KC, BLIS_KR, 2.0, - BLIS_MC, BLIS_MR, 1.0, - BLIS_NR, BLIS_NR, 1.0, - BLIS_MR, BLIS_MR, 1.0, - BLIS_KR, BLIS_KR, 1.0, - cntx ); + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + // Set the pack_t schemas for the current induced method. + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) @@ -128,9 +132,90 @@ void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_trsmnat_cntx_init( cntx_t* cntx ) +void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) { - bli_trsm_cntx_init( cntx ); + const ind_t method = BLIS_1M; + + // Clear the context fields. + bli_cntx_obj_clear( cntx ); + + // Initialize the context with the current architecture's native + // level-3 gemm micro-kernel, and its output preferences. + bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); + + // Initialize the context with the virtual micro-kernels associated with + // the current induced method. + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMM_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMMTRSM_L_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_GEMMTRSM_U_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_L_UKR, cntx ); + bli_gks_cntx_set_l3_vir_ukr( method, BLIS_TRSM_U_UKR, cntx ); + + // Initialize the context with packm-related kernels. 
+ bli_packm_cntx_init( dt, cntx ); + + if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 1.0, 1.0, + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 2.0, 2.0, // halve mc... + BLIS_NR, BLIS_NR, 1.0, 1.0, + BLIS_MR, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. + //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, + // BLIS_PACKED_COL_PANELS_1R, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + { + // Initialize the context with the current architecture's register + // and cache blocksizes (and multiples), and the induced method. + bli_gks_cntx_set_blkszs + ( + method, 6, + BLIS_NC, BLIS_NR, 2.0, 2.0, // halve nc... + BLIS_KC, BLIS_KR, 2.0, 2.0, // halve kc... + BLIS_MC, BLIS_MR, 1.0, 1.0, + BLIS_NR, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) + BLIS_MR, BLIS_MR, 1.0, 1.0, + BLIS_KR, BLIS_KR, 1.0, 1.0, + cntx + ); + + // Set the pack_t schemas for the current induced method. 
+ //bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, + // BLIS_PACKED_COL_PANELS_1E, + // cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); + } +} + +void bli_trsm1m_cntx_stage( dim_t stage, cntx_t* cntx ) +{ +} + +void bli_trsm1m_cntx_finalize( cntx_t* cntx ) +{ +} + +// ----------------------------------------------------------------------------- + +void bli_trsmnat_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_trsm_cntx_init( dt, cntx ); } void bli_trsmnat_cntx_finalize( cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_trsmind_cntx.h b/frame/ind/cntx/bli_trsmind_cntx.h index 3d3c883f9..49f7f0600 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.h +++ b/frame/ind/cntx/bli_trsmind_cntx.h @@ -32,29 +32,15 @@ */ -/* -// -// Generate prototypes for _cntx_init(), _cntx_stage(), and _cntx_finalize() -// for each induced method (including native execution) based on trsm. -// - -#undef GENPROT -#define GENPROT( opname, imeth ) \ -\ -void PASTEMAC2(opname,imeth,_cntx_init)( void ); \ -void PASTEMAC2(opname,imeth,_cntx_finalize)( void ); - -GENPROT( trsm, nat ) -GENPROT( trsm, 3m1 ) -GENPROT( trsm, 4m1 ) -*/ - -void bli_trsmnat_cntx_init( cntx_t* cntx ); +void bli_trsmnat_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsmnat_cntx_finalize( cntx_t* cntx ); -void bli_trsm4m1_cntx_init( cntx_t* cntx ); +void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsm4m1_cntx_finalize( cntx_t* cntx ); -void bli_trsm3m1_cntx_init( cntx_t* cntx ); +void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ); void bli_trsm3m1_cntx_finalize( cntx_t* cntx ); +void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_trsm1m_cntx_finalize( cntx_t* cntx ); + diff --git a/frame/ind/include/bli_kernel_1m_macro_defs.h b/frame/ind/include/bli_kernel_1m_macro_defs.h new file mode 100644 index 000000000..4fc0ccb06 --- /dev/null +++ b/frame/ind/include/bli_kernel_1m_macro_defs.h @@ 
-0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_1M_MACRO_DEFS_H +#define BLIS_KERNEL_1M_MACRO_DEFS_H + + +// -- Define row access bools -------------------------------------------------- + +// gemm1m micro-kernels + +#define BLIS_CGEMM1M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM1M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + + +// -- Define default 1m-specific kernel names ---------------------------------- + +// +// Level-3 +// + +// gemm1m micro-kernels + +#ifndef BLIS_CGEMM1M_UKERNEL +#define BLIS_CGEMM1M_UKERNEL BLIS_CGEMM1M_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM1M_UKERNEL +#define BLIS_ZGEMM1M_UKERNEL BLIS_ZGEMM1M_UKERNEL_REF +#endif + +// gemmtrsm1m_l micro-kernels + +#ifndef BLIS_CGEMMTRSM1M_L_UKERNEL +#define BLIS_CGEMMTRSM1M_L_UKERNEL BLIS_CGEMMTRSM1M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM1M_L_UKERNEL +#define BLIS_ZGEMMTRSM1M_L_UKERNEL BLIS_ZGEMMTRSM1M_L_UKERNEL_REF +#endif + +// gemmtrsm1m_u micro-kernels + +#ifndef BLIS_CGEMMTRSM1M_U_UKERNEL +#define BLIS_CGEMMTRSM1M_U_UKERNEL BLIS_CGEMMTRSM1M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM1M_U_UKERNEL +#define BLIS_ZGEMMTRSM1M_U_UKERNEL BLIS_ZGEMMTRSM1M_U_UKERNEL_REF +#endif + +// trsm1m_l micro-kernels + +#ifndef BLIS_CTRSM1M_L_UKERNEL +#define BLIS_CTRSM1M_L_UKERNEL BLIS_CTRSM1M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM1M_L_UKERNEL +#define BLIS_ZTRSM1M_L_UKERNEL BLIS_ZTRSM1M_L_UKERNEL_REF +#endif + +// trsm1m_u micro-kernels + +#ifndef BLIS_CTRSM1M_U_UKERNEL +#define BLIS_CTRSM1M_U_UKERNEL BLIS_CTRSM1M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM1M_U_UKERNEL +#define BLIS_ZTRSM1M_U_UKERNEL BLIS_ZTRSM1M_U_UKERNEL_REF +#endif + + + +#endif diff --git a/frame/ind/include/bli_kernel_ind_macro_defs.h b/frame/ind/include/bli_kernel_ind_macro_defs.h index 7f43857f0..55eeb010b 100644 --- a/frame/ind/include/bli_kernel_ind_macro_defs.h +++ b/frame/ind/include/bli_kernel_ind_macro_defs.h @@ -41,9 +41,11 @@ #include
"bli_kernel_4mh_macro_defs.h" #include "bli_kernel_4mb_macro_defs.h" #include "bli_kernel_4m1_macro_defs.h" +#include "bli_kernel_1m_macro_defs.h" // Storage format headers #include "bli_packm_3mis_macro_defs.h" #include "bli_packm_4mi_macro_defs.h" #include "bli_packm_rih_macro_defs.h" +#include "bli_packm_1er_macro_defs.h" diff --git a/frame/ind/include/bli_kernel_ind_pre_macro_defs.h b/frame/ind/include/bli_kernel_ind_pre_macro_defs.h index b6020489e..47fbb4a28 100644 --- a/frame/ind/include/bli_kernel_ind_pre_macro_defs.h +++ b/frame/ind/include/bli_kernel_ind_pre_macro_defs.h @@ -140,6 +140,35 @@ #define BLIS_CTRSM4M1_U_UKERNEL_REF bli_ctrsm4m1_u_ukr_ref #define BLIS_ZTRSM4M1_U_UKERNEL_REF bli_ztrsm4m1_u_ukr_ref +// +// Level-3 1m +// + +// gemm1m micro-kernels + +#define BLIS_CGEMM1M_UKERNEL_REF bli_cgemm1m_ukr_ref +#define BLIS_ZGEMM1M_UKERNEL_REF bli_zgemm1m_ukr_ref + +// gemmtrsm1m_l micro-kernels + +#define BLIS_CGEMMTRSM1M_L_UKERNEL_REF bli_cgemmtrsm1m_l_ukr_ref +#define BLIS_ZGEMMTRSM1M_L_UKERNEL_REF bli_zgemmtrsm1m_l_ukr_ref + +// gemmtrsm1m_u micro-kernels + +#define BLIS_CGEMMTRSM1M_U_UKERNEL_REF bli_cgemmtrsm1m_u_ukr_ref +#define BLIS_ZGEMMTRSM1M_U_UKERNEL_REF bli_zgemmtrsm1m_u_ukr_ref + +// trsm1m_l micro-kernels + +#define BLIS_CTRSM1M_L_UKERNEL_REF bli_ctrsm1m_l_ukr_ref +#define BLIS_ZTRSM1M_L_UKERNEL_REF bli_ztrsm1m_l_ukr_ref + +// trsm1m_u micro-kernels + +#define BLIS_CTRSM1M_U_UKERNEL_REF bli_ctrsm1m_u_ukr_ref +#define BLIS_ZTRSM1M_U_UKERNEL_REF bli_ztrsm1m_u_ukr_ref + #endif diff --git a/frame/ind/include/bli_packm_1er_macro_defs.h b/frame/ind/include/bli_packm_1er_macro_defs.h new file mode 100644 index 000000000..fe550d1c5 --- /dev/null +++ b/frame/ind/include/bli_packm_1er_macro_defs.h @@ -0,0 +1,241 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_1ER_MACRO_DEFS_H +#define BLIS_KERNEL_1ER_MACRO_DEFS_H + + +// -- Define default 1e/1r-specific kernel names ------------------------------- + +// +// 1e +// + +// packm_2xk_1e kernels + +#ifndef BLIS_CPACKM_2XK_1E_KERNEL +#define BLIS_CPACKM_2XK_1E_KERNEL BLIS_CPACKM_2XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_1E_KERNEL +#define BLIS_ZPACKM_2XK_1E_KERNEL BLIS_ZPACKM_2XK_1E_KERNEL_REF +#endif + +// packm_4xk_1e kernels + +#ifndef BLIS_CPACKM_4XK_1E_KERNEL +#define BLIS_CPACKM_4XK_1E_KERNEL BLIS_CPACKM_4XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_1E_KERNEL +#define BLIS_ZPACKM_4XK_1E_KERNEL BLIS_ZPACKM_4XK_1E_KERNEL_REF +#endif + +// packm_6xk_1e kernels + +#ifndef BLIS_CPACKM_6XK_1E_KERNEL +#define BLIS_CPACKM_6XK_1E_KERNEL BLIS_CPACKM_6XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_1E_KERNEL +#define BLIS_ZPACKM_6XK_1E_KERNEL BLIS_ZPACKM_6XK_1E_KERNEL_REF +#endif + +// packm_8xk_1e kernels + +#ifndef BLIS_CPACKM_8XK_1E_KERNEL +#define BLIS_CPACKM_8XK_1E_KERNEL BLIS_CPACKM_8XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_1E_KERNEL +#define BLIS_ZPACKM_8XK_1E_KERNEL BLIS_ZPACKM_8XK_1E_KERNEL_REF +#endif + +// packm_10xk_1e kernels + +#ifndef BLIS_CPACKM_10XK_1E_KERNEL +#define BLIS_CPACKM_10XK_1E_KERNEL BLIS_CPACKM_10XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_1E_KERNEL +#define BLIS_ZPACKM_10XK_1E_KERNEL BLIS_ZPACKM_10XK_1E_KERNEL_REF +#endif + +// packm_12xk_1e kernels + +#ifndef BLIS_CPACKM_12XK_1E_KERNEL +#define BLIS_CPACKM_12XK_1E_KERNEL BLIS_CPACKM_12XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_1E_KERNEL +#define BLIS_ZPACKM_12XK_1E_KERNEL BLIS_ZPACKM_12XK_1E_KERNEL_REF +#endif + +// packm_14xk_1e kernels + +#ifndef BLIS_CPACKM_14XK_1E_KERNEL +#define BLIS_CPACKM_14XK_1E_KERNEL BLIS_CPACKM_14XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_1E_KERNEL +#define BLIS_ZPACKM_14XK_1E_KERNEL BLIS_ZPACKM_14XK_1E_KERNEL_REF +#endif + +// packm_16xk_1e kernels + +#ifndef 
BLIS_CPACKM_16XK_1E_KERNEL +#define BLIS_CPACKM_16XK_1E_KERNEL BLIS_CPACKM_16XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_1E_KERNEL +#define BLIS_ZPACKM_16XK_1E_KERNEL BLIS_ZPACKM_16XK_1E_KERNEL_REF +#endif + +// packm_30xk_1e kernels + +#ifndef BLIS_CPACKM_30XK_1E_KERNEL +#define BLIS_CPACKM_30XK_1E_KERNEL BLIS_CPACKM_30XK_1E_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_30XK_1E_KERNEL +#define BLIS_ZPACKM_30XK_1E_KERNEL BLIS_ZPACKM_30XK_1E_KERNEL_REF +#endif + +// +// 1r +// + +// packm_2xk_1r kernels + +#ifndef BLIS_CPACKM_2XK_1R_KERNEL +#define BLIS_CPACKM_2XK_1R_KERNEL BLIS_CPACKM_2XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_1R_KERNEL +#define BLIS_ZPACKM_2XK_1R_KERNEL BLIS_ZPACKM_2XK_1R_KERNEL_REF +#endif + +// packm_3xk_1r kernels + +#ifndef BLIS_CPACKM_3XK_1R_KERNEL +#define BLIS_CPACKM_3XK_1R_KERNEL BLIS_CPACKM_3XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_3XK_1R_KERNEL +#define BLIS_ZPACKM_3XK_1R_KERNEL BLIS_ZPACKM_3XK_1R_KERNEL_REF +#endif + +// packm_4xk_1r kernels + +#ifndef BLIS_CPACKM_4XK_1R_KERNEL +#define BLIS_CPACKM_4XK_1R_KERNEL BLIS_CPACKM_4XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_1R_KERNEL +#define BLIS_ZPACKM_4XK_1R_KERNEL BLIS_ZPACKM_4XK_1R_KERNEL_REF +#endif + +// packm_6xk_1r kernels + +#ifndef BLIS_CPACKM_6XK_1R_KERNEL +#define BLIS_CPACKM_6XK_1R_KERNEL BLIS_CPACKM_6XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_1R_KERNEL +#define BLIS_ZPACKM_6XK_1R_KERNEL BLIS_ZPACKM_6XK_1R_KERNEL_REF +#endif + +// packm_8xk_1r kernels + +#ifndef BLIS_CPACKM_8XK_1R_KERNEL +#define BLIS_CPACKM_8XK_1R_KERNEL BLIS_CPACKM_8XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_1R_KERNEL +#define BLIS_ZPACKM_8XK_1R_KERNEL BLIS_ZPACKM_8XK_1R_KERNEL_REF +#endif + +// packm_10xk_1r kernels + +#ifndef BLIS_CPACKM_10XK_1R_KERNEL +#define BLIS_CPACKM_10XK_1R_KERNEL BLIS_CPACKM_10XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_1R_KERNEL +#define BLIS_ZPACKM_10XK_1R_KERNEL BLIS_ZPACKM_10XK_1R_KERNEL_REF +#endif + +// packm_12xk_1r 
kernels + +#ifndef BLIS_CPACKM_12XK_1R_KERNEL +#define BLIS_CPACKM_12XK_1R_KERNEL BLIS_CPACKM_12XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_1R_KERNEL +#define BLIS_ZPACKM_12XK_1R_KERNEL BLIS_ZPACKM_12XK_1R_KERNEL_REF +#endif + +// packm_14xk_1r kernels + +#ifndef BLIS_CPACKM_14XK_1R_KERNEL +#define BLIS_CPACKM_14XK_1R_KERNEL BLIS_CPACKM_14XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_1R_KERNEL +#define BLIS_ZPACKM_14XK_1R_KERNEL BLIS_ZPACKM_14XK_1R_KERNEL_REF +#endif + +// packm_16xk_1r kernels + +#ifndef BLIS_CPACKM_16XK_1R_KERNEL +#define BLIS_CPACKM_16XK_1R_KERNEL BLIS_CPACKM_16XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_1R_KERNEL +#define BLIS_ZPACKM_16XK_1R_KERNEL BLIS_ZPACKM_16XK_1R_KERNEL_REF +#endif + +// packm_30xk_1r kernels + +#ifndef BLIS_CPACKM_30XK_1R_KERNEL +#define BLIS_CPACKM_30XK_1R_KERNEL BLIS_CPACKM_30XK_1R_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_30XK_1R_KERNEL +#define BLIS_ZPACKM_30XK_1R_KERNEL BLIS_ZPACKM_30XK_1R_KERNEL_REF +#endif + + + +#endif diff --git a/frame/ind/include/bli_packm_3mis_macro_defs.h b/frame/ind/include/bli_packm_3mis_macro_defs.h index 3abe40218..654172467 100644 --- a/frame/ind/include/bli_packm_3mis_macro_defs.h +++ b/frame/ind/include/bli_packm_3mis_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default 3mis-specific kernel names -------------------------------- -// -// Level-1m -// // packm_2xk_3mis kernels diff --git a/frame/ind/include/bli_packm_4mi_macro_defs.h b/frame/ind/include/bli_packm_4mi_macro_defs.h index 2f36de349..f5a617737 100644 --- a/frame/ind/include/bli_packm_4mi_macro_defs.h +++ b/frame/ind/include/bli_packm_4mi_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default 4mi-specific kernel names --------------------------------- -// -// Level-1m -// // packm_2xk_4mi kernels diff --git a/frame/ind/include/bli_packm_ind_pre_macro_defs.h b/frame/ind/include/bli_packm_ind_pre_macro_defs.h index ee5070e49..1bec1c5fd 100644 --- a/frame/ind/include/bli_packm_ind_pre_macro_defs.h +++ 
b/frame/ind/include/bli_packm_ind_pre_macro_defs.h @@ -177,5 +177,102 @@ +// packm_2xk_1e kernels + +#define BLIS_CPACKM_2XK_1E_KERNEL_REF bli_cpackm_2xk_1e_ref +#define BLIS_ZPACKM_2XK_1E_KERNEL_REF bli_zpackm_2xk_1e_ref + +// packm_4xk_1e kernels + +#define BLIS_CPACKM_4XK_1E_KERNEL_REF bli_cpackm_4xk_1e_ref +#define BLIS_ZPACKM_4XK_1E_KERNEL_REF bli_zpackm_4xk_1e_ref + +// packm_6xk_1e kernels + +#define BLIS_CPACKM_6XK_1E_KERNEL_REF bli_cpackm_6xk_1e_ref +#define BLIS_ZPACKM_6XK_1E_KERNEL_REF bli_zpackm_6xk_1e_ref + +// packm_8xk_1e kernels + +#define BLIS_CPACKM_8XK_1E_KERNEL_REF bli_cpackm_8xk_1e_ref +#define BLIS_ZPACKM_8XK_1E_KERNEL_REF bli_zpackm_8xk_1e_ref + +// packm_10xk_1e kernels + +#define BLIS_CPACKM_10XK_1E_KERNEL_REF bli_cpackm_10xk_1e_ref +#define BLIS_ZPACKM_10XK_1E_KERNEL_REF bli_zpackm_10xk_1e_ref + +// packm_12xk_1e kernels + +#define BLIS_CPACKM_12XK_1E_KERNEL_REF bli_cpackm_12xk_1e_ref +#define BLIS_ZPACKM_12XK_1E_KERNEL_REF bli_zpackm_12xk_1e_ref + +// packm_14xk_1e kernels + +#define BLIS_CPACKM_14XK_1E_KERNEL_REF bli_cpackm_14xk_1e_ref +#define BLIS_ZPACKM_14XK_1E_KERNEL_REF bli_zpackm_14xk_1e_ref + +// packm_16xk_1e kernels + +#define BLIS_CPACKM_16XK_1E_KERNEL_REF bli_cpackm_16xk_1e_ref +#define BLIS_ZPACKM_16XK_1E_KERNEL_REF bli_zpackm_16xk_1e_ref + +// packm_30xk_1e kernels + +#define BLIS_CPACKM_30XK_1E_KERNEL_REF bli_cpackm_30xk_1e_ref +#define BLIS_ZPACKM_30XK_1E_KERNEL_REF bli_zpackm_30xk_1e_ref + +// packm_2xk_1r kernels + +#define BLIS_CPACKM_2XK_1R_KERNEL_REF bli_cpackm_2xk_1r_ref +#define BLIS_ZPACKM_2XK_1R_KERNEL_REF bli_zpackm_2xk_1r_ref + +// packm_3xk_1r kernels + +#define BLIS_CPACKM_3XK_1R_KERNEL_REF bli_cpackm_3xk_1r_ref +#define BLIS_ZPACKM_3XK_1R_KERNEL_REF bli_zpackm_3xk_1r_ref + +// packm_4xk_1r kernels + +#define BLIS_CPACKM_4XK_1R_KERNEL_REF bli_cpackm_4xk_1r_ref +#define BLIS_ZPACKM_4XK_1R_KERNEL_REF bli_zpackm_4xk_1r_ref + +// packm_6xk_1r kernels + +#define BLIS_CPACKM_6XK_1R_KERNEL_REF bli_cpackm_6xk_1r_ref 
+#define BLIS_ZPACKM_6XK_1R_KERNEL_REF bli_zpackm_6xk_1r_ref + +// packm_8xk_1r kernels + +#define BLIS_CPACKM_8XK_1R_KERNEL_REF bli_cpackm_8xk_1r_ref +#define BLIS_ZPACKM_8XK_1R_KERNEL_REF bli_zpackm_8xk_1r_ref + +// packm_10xk_1r kernels + +#define BLIS_CPACKM_10XK_1R_KERNEL_REF bli_cpackm_10xk_1r_ref +#define BLIS_ZPACKM_10XK_1R_KERNEL_REF bli_zpackm_10xk_1r_ref + +// packm_12xk_1r kernels + +#define BLIS_CPACKM_12XK_1R_KERNEL_REF bli_cpackm_12xk_1r_ref +#define BLIS_ZPACKM_12XK_1R_KERNEL_REF bli_zpackm_12xk_1r_ref + +// packm_14xk_1r kernels + +#define BLIS_CPACKM_14XK_1R_KERNEL_REF bli_cpackm_14xk_1r_ref +#define BLIS_ZPACKM_14XK_1R_KERNEL_REF bli_zpackm_14xk_1r_ref + +// packm_16xk_1r kernels + +#define BLIS_CPACKM_16XK_1R_KERNEL_REF bli_cpackm_16xk_1r_ref +#define BLIS_ZPACKM_16XK_1R_KERNEL_REF bli_zpackm_16xk_1r_ref + +// packm_30xk_1r kernels + +#define BLIS_CPACKM_30XK_1R_KERNEL_REF bli_cpackm_30xk_1r_ref +#define BLIS_ZPACKM_30XK_1R_KERNEL_REF bli_zpackm_30xk_1r_ref + + + #endif diff --git a/frame/ind/include/bli_packm_rih_macro_defs.h b/frame/ind/include/bli_packm_rih_macro_defs.h index 543d197a0..c5c883e7d 100644 --- a/frame/ind/include/bli_packm_rih_macro_defs.h +++ b/frame/ind/include/bli_packm_rih_macro_defs.h @@ -38,9 +38,6 @@ // -- Define default rih-specific kernel names --------------------------------- -// -// Level-1m -// // packm_2xk_rih kernels diff --git a/frame/ind/misc/bli_l3_ind_opt.h b/frame/ind/misc/bli_l3_ind_opt.h new file mode 100644 index 000000000..6a0be1885 --- /dev/null +++ b/frame/ind/misc/bli_l3_ind_opt.h @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_IND_OPT_H +#define BLIS_L3_IND_OPT_H + +#define bli_l3_ind_recast_1m_params( dt_exec, schema_a, c, \ + m, n, k, \ + pd_a, ps_a, \ + pd_b, ps_b, \ + rs_c, cs_c ) \ +{ \ + obj_t beta; \ +\ + /* Detach the beta scalar from c so that we can test its imaginary + component. */ \ + bli_obj_scalar_detach( c, &beta ); \ +\ + /* If beta is in the real domain, and c is row- or column-stored, + then we may proceed with the optimization. 
*/ \ + if ( bli_obj_imag_equals( &beta, &BLIS_ZERO ) && \ + !bli_is_gen_stored( rs_c, cs_c ) ) \ + { \ + dt_exec = bli_datatype_proj_to_real( dt_exec ); \ +\ + if ( bli_is_1e_packed( schema_a ) ) \ + { \ + m *= 2; \ + n *= 1; \ + k *= 2; \ + pd_a *= 2; ps_a *= 2; \ + pd_b *= 1; ps_b *= 2; \ + rs_c *= 1; cs_c *= 2; \ + } \ + else /* if ( bli_is_1r_packed( schema_a ) ) */ \ + { \ + m *= 1; \ + n *= 2; \ + k *= 2; \ + pd_a *= 1; ps_a *= 2; \ + pd_b *= 2; ps_b *= 2; \ + rs_c *= 2; cs_c *= 1; \ + } \ + } \ +} + +#endif diff --git a/frame/ind/oapi/bli_l3_1mbppb_oapi.c b/frame/ind/oapi/bli_l3_1mbppb_oapi.c new file mode 100644 index 000000000..e91f27ea2 --- /dev/null +++ b/frame/ind/oapi/bli_l3_1mbppb_oapi.c @@ -0,0 +1,85 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// -- gemmbp/gemmpb ------------------------------------------------------------ + +#undef GENFRONT +#define GENFRONT( opname, imeth, alg ) \ +\ +void PASTEMAC2(opname,imeth,alg) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *c ); \ + cntx_t cntx; \ + cntl_t* cntl_p; \ +\ + /* If the objects are in the real domain, execute the native + implementation. */ \ + if ( bli_obj_is_real( *c ) ) \ + { \ + PASTEMAC(opname,nat)( alpha, a, b, beta, c, NULL ); \ + return; \ + } \ +\ + /* Initialize a local 1m context for the current algorithm (bp or pb). */ \ + PASTEMAC3(opname,imeth,alg,_cntx_init)( dt, &cntx ); \ +\ + /* Create a control tree for the current algorithm (bp or pb). */ \ + cntl_p = PASTEMAC2(opname,alg,_cntl_create)( BLIS_GEMM ); \ +\ + /* Invoke the operation's front end using the context and control + tree we just created. */ \ + PASTEMAC(opname,_front)( alpha, a, b, beta, c, &cntx, cntl_p ); \ +\ + /* Free the control tree. Since the implementation will only make + copies of it (and not use it directly) we do not need to supply + a thread object. */ \ + bli_cntl_free( cntl_p, NULL ); \ +\ + /* Finalize the local context. 
*/ \ + PASTEMAC2(opname,imeth,_cntx_finalize)( &cntx ); \ +} + +// gemm +GENFRONT( gemm, 1m, bp ) +GENFRONT( gemm, 1m, pb ) + diff --git a/frame/ind/oapi/bli_l3_3m4m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c similarity index 89% rename from frame/ind/oapi/bli_l3_3m4m_oapi.c rename to frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 40348e627..b99ebda39 100644 --- a/frame/ind/oapi/bli_l3_3m4m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -49,10 +49,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -61,9 +62,24 @@ void PASTEMAC(opname,imeth) \ PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \ return; \ } \ +\ + /* A temporary hack to easily specify the 1m algorithm (block-panel or + panel-block). */ \ +/* + if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \ + { \ + bli_gemm1mbp( alpha, a, b, beta, c ); \ + return; \ + } \ + else if ( PASTEMAC(opname,imeth) == bli_gemm3m1 ) \ + { \ + bli_gemm1mpb( alpha, a, b, beta, c ); \ + return; \ + } \ +*/ \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -92,6 +108,7 @@ GENFRONT( gemm, gemm, 3m1, 1 ) GENFRONT( gemm, gemm, 4mh, 4 ) GENFRONT( gemm, gemm, 4mb, 1 ) GENFRONT( gemm, gemm, 4m1, 1 ) +GENFRONT( gemm, gemm, 1m, 1 ) // her2k GENFRONT( her2k, gemm, 3mh, 3 ) @@ -101,6 +118,7 @@ GENFRONT( her2k, gemm, 3m1, 1 ) GENFRONT( her2k, gemm, 4mh, 4 ) //GENFRONT( her2k, gemm, 4mb, 1 ) // Not implemented. 
GENFRONT( her2k, gemm, 4m1, 1 ) +GENFRONT( her2k, gemm, 1m, 1 ) // syr2k GENFRONT( syr2k, gemm, 3mh, 3 ) @@ -110,6 +128,7 @@ GENFRONT( syr2k, gemm, 3m1, 1 ) GENFRONT( syr2k, gemm, 4mh, 4 ) //GENFRONT( syr2k, gemm, 4mb, 1 ) // Not implemented. GENFRONT( syr2k, gemm, 4m1, 1 ) +GENFRONT( syr2k, gemm, 1m, 1 ) // -- hemm/symm/trmm3 ---------------------------------------------------------- @@ -128,10 +147,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -142,7 +162,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -171,6 +191,7 @@ GENFRONT( hemm, gemm, 3m1, 1 ) GENFRONT( hemm, gemm, 4mh, 4 ) //GENFRONT( hemm, gemm, 4mb, 1 ) // Not implemented. GENFRONT( hemm, gemm, 4m1, 1 ) +GENFRONT( hemm, gemm, 1m, 1 ) // symm GENFRONT( symm, gemm, 3mh, 3 ) @@ -180,6 +201,7 @@ GENFRONT( symm, gemm, 3m1, 1 ) GENFRONT( symm, gemm, 4mh, 4 ) //GENFRONT( symm, gemm, 4mb, 1 ) // Not implemented. GENFRONT( symm, gemm, 4m1, 1 ) +GENFRONT( symm, gemm, 1m, 1 ) // trmm3 GENFRONT( trmm3, gemm, 3mh, 3 ) @@ -189,6 +211,7 @@ GENFRONT( trmm3, gemm, 3m1, 1 ) GENFRONT( trmm3, gemm, 4mh, 4 ) //GENFRONT( trmm3, gemm, 4mb, 1 ) // Not implemented. 
GENFRONT( trmm3, gemm, 4m1, 1 ) +GENFRONT( trmm3, gemm, 1m, 1 ) // -- herk/syrk ---------------------------------------------------------------- @@ -205,10 +228,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ + obj_t* beta_use = beta; \ +\ cntx_t* cntx_p; \ dim_t i; \ -\ - obj_t* beta_use = beta; \ \ /* If the objects are in the real domain, execute the native implementation. */ \ @@ -219,7 +243,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -248,6 +272,7 @@ GENFRONT( herk, gemm, 3m1, 1 ) GENFRONT( herk, gemm, 4mh, 4 ) //GENFRONT( herk, gemm, 4mb, 1 ) // Not implemented. GENFRONT( herk, gemm, 4m1, 1 ) +GENFRONT( herk, gemm, 1m, 1 ) // syrk GENFRONT( syrk, gemm, 3mh, 3 ) @@ -257,6 +282,7 @@ GENFRONT( syrk, gemm, 3m1, 1 ) GENFRONT( syrk, gemm, 4mh, 4 ) //GENFRONT( syrk, gemm, 4mb, 1 ) // Not implemented. GENFRONT( syrk, gemm, 4m1, 1 ) +GENFRONT( syrk, gemm, 1m, 1 ) // -- trmm --------------------------------------------------------------------- @@ -273,6 +299,8 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ +\ cntx_t* cntx_p; \ dim_t i; \ \ @@ -285,7 +313,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -310,6 +338,7 @@ GENFRONT( trmm, gemm, 3m1, 1 ) //GENFRONT( trmm, gemm, 4mh, 4 ) // Unimplementable. //GENFRONT( trmm, gemm, 4mb, 1 ) // Unimplementable. 
GENFRONT( trmm, gemm, 4m1, 1 ) +GENFRONT( trmm, gemm, 1m, 1 ) // -- trsm --------------------------------------------------------------------- @@ -326,6 +355,8 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ +\ cntx_t* cntx_p; \ \ /* If the objects are in the real domain, execute the native @@ -337,7 +368,7 @@ void PASTEMAC(opname,imeth) \ } \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ { \ /* NOTE: trsm cannot be implemented via any induced method that @@ -360,4 +391,5 @@ GENFRONT( trsm, trsm, 3m1, 1 ) //GENFRONT( trmm, trsm, 4mh, 4 ) // Unimplementable. //GENFRONT( trmm, trsm, 4mb, 1 ) // Unimplementable. GENFRONT( trsm, trsm, 4m1, 1 ) +GENFRONT( trsm, trsm, 1m, 1 ) diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 62fa794fa..f5907d414 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -55,6 +55,7 @@ GENPROT( nat ) GENPROT( ind ) GENPROT( 3m1 ) GENPROT( 4m1 ) +GENPROT( 1m ) // @@ -79,3 +80,17 @@ GENPROT_NO2OP( 3m2 ) GENPROT_NO2OP( 4mh ) GENPROT_NO2OP( 4mb ) + +// +// Generate object-based prototypes for 1m methods that specify an algorithm +// (e.g., block-panel or panel-block). +// + +#undef GENPROT +#define GENPROT( imeth, alg ) \ +\ +void PASTEMAC2(gemm,imeth,alg) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); \ + +GENPROT( 1m, bp ) +GENPROT( 1m, pb ) + diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 68b664d65..c783714fe 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -55,10 +55,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. 
*/ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -92,10 +93,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -127,10 +129,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *c ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -161,10 +164,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. */ \ @@ -194,10 +198,11 @@ void PASTEMAC(opname,imeth) \ cntx_t* cntx \ ) \ { \ + num_t dt = bli_obj_datatype( *b ); \ cntx_t* cntx_p; \ \ /* Initialize a local context if the one provided is NULL. */ \ - bli_cntx_init_local_if2( cname, imeth, cntx, cntx_p ); \ + bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ \ /* Invoke the operation's front end with the appropriate control tree. 
*/ \ diff --git a/frame/ind/tapi/bli_l3_ind_tapi.c b/frame/ind/tapi/bli_l3_ind_tapi.c index 1c4ba3ba9..d4425b5f6 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.c +++ b/frame/ind/tapi/bli_l3_ind_tapi.c @@ -90,6 +90,7 @@ INSERT_GENTFUNC_BASIC0( gemm3m1 ) INSERT_GENTFUNC_BASIC0( gemm4mh ) INSERT_GENTFUNC_BASIC0( gemm4mb ) INSERT_GENTFUNC_BASIC0( gemm4m1 ) +INSERT_GENTFUNC_BASIC0( gemm1m ) // -- hemm --------------------------------------------------------------------- @@ -149,6 +150,7 @@ INSERT_GENTFUNC_BASIC0( hemm3mh ) INSERT_GENTFUNC_BASIC0( hemm3m1 ) INSERT_GENTFUNC_BASIC0( hemm4mh ) INSERT_GENTFUNC_BASIC0( hemm4m1 ) +INSERT_GENTFUNC_BASIC0( hemm1m ) // -- herk --------------------------------------------------------------------- @@ -200,6 +202,7 @@ INSERT_GENTFUNCR_BASIC0( herk3mh ) INSERT_GENTFUNCR_BASIC0( herk3m1 ) INSERT_GENTFUNCR_BASIC0( herk4mh ) INSERT_GENTFUNCR_BASIC0( herk4m1 ) +INSERT_GENTFUNCR_BASIC0( herk1m ) // -- her2k -------------------------------------------------------------------- @@ -258,6 +261,7 @@ INSERT_GENTFUNCR_BASIC0( her2k3mh ) INSERT_GENTFUNCR_BASIC0( her2k3m1 ) INSERT_GENTFUNCR_BASIC0( her2k4mh ) INSERT_GENTFUNCR_BASIC0( her2k4m1 ) +INSERT_GENTFUNCR_BASIC0( her2k1m ) // -- symm --------------------------------------------------------------------- @@ -317,6 +321,7 @@ INSERT_GENTFUNC_BASIC0( symm3mh ) INSERT_GENTFUNC_BASIC0( symm3m1 ) INSERT_GENTFUNC_BASIC0( symm4mh ) INSERT_GENTFUNC_BASIC0( symm4m1 ) +INSERT_GENTFUNC_BASIC0( symm1m ) // -- syrk --------------------------------------------------------------------- @@ -367,6 +372,7 @@ INSERT_GENTFUNC_BASIC0( syrk3mh ) INSERT_GENTFUNC_BASIC0( syrk3m1 ) INSERT_GENTFUNC_BASIC0( syrk4mh ) INSERT_GENTFUNC_BASIC0( syrk4m1 ) +INSERT_GENTFUNC_BASIC0( syrk1m ) // -- syr2k -------------------------------------------------------------------- @@ -424,6 +430,7 @@ INSERT_GENTFUNC_BASIC0( syr2k3mh ) INSERT_GENTFUNC_BASIC0( syr2k3m1 ) INSERT_GENTFUNC_BASIC0( syr2k4mh ) INSERT_GENTFUNC_BASIC0( syr2k4m1 ) 
+INSERT_GENTFUNC_BASIC0( syr2k1m ) // -- trmm3 -------------------------------------------------------------------- @@ -485,6 +492,7 @@ INSERT_GENTFUNC_BASIC0( trmm33mh ) INSERT_GENTFUNC_BASIC0( trmm33m1 ) INSERT_GENTFUNC_BASIC0( trmm34mh ) INSERT_GENTFUNC_BASIC0( trmm34m1 ) +INSERT_GENTFUNC_BASIC0( trmm31m ) // -- trmm --------------------------------------------------------------------- @@ -534,6 +542,7 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( trmm3m1 ) INSERT_GENTFUNC_BASIC0( trmm4m1 ) +INSERT_GENTFUNC_BASIC0( trmm1m ) // -- trsm --------------------------------------------------------------------- @@ -583,4 +592,5 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNC_BASIC0( trsm3m1 ) INSERT_GENTFUNC_BASIC0( trsm4m1 ) +INSERT_GENTFUNC_BASIC0( trsm1m ) diff --git a/frame/ind/tapi/bli_l3_ind_tapi.h b/frame/ind/tapi/bli_l3_ind_tapi.h index 029166c6c..7aa886b3d 100644 --- a/frame/ind/tapi/bli_l3_ind_tapi.h +++ b/frame/ind/tapi/bli_l3_ind_tapi.h @@ -58,6 +58,7 @@ INSERT_GENTPROT_BASIC( gemm3m1 ) INSERT_GENTPROT_BASIC( gemm4mh ) INSERT_GENTPROT_BASIC( gemm4mb ) INSERT_GENTPROT_BASIC( gemm4m1 ) +INSERT_GENTPROT_BASIC( gemm1m ) #undef GENTPROT @@ -83,6 +84,7 @@ INSERT_GENTPROT_BASIC( hemm3mh ) INSERT_GENTPROT_BASIC( hemm3m1 ) INSERT_GENTPROT_BASIC( hemm4mh ) INSERT_GENTPROT_BASIC( hemm4m1 ) +INSERT_GENTPROT_BASIC( hemm1m ) #undef GENTPROTR @@ -107,6 +109,7 @@ INSERT_GENTPROTR_BASIC( her2k3mh ) INSERT_GENTPROTR_BASIC( her2k3m1 ) INSERT_GENTPROTR_BASIC( her2k4mh ) INSERT_GENTPROTR_BASIC( her2k4m1 ) +INSERT_GENTPROTR_BASIC( her2k1m ) #undef GENTPROTR @@ -129,6 +132,7 @@ INSERT_GENTPROTR_BASIC( herk3mh ) INSERT_GENTPROTR_BASIC( herk3m1 ) INSERT_GENTPROTR_BASIC( herk4mh ) INSERT_GENTPROTR_BASIC( herk4m1 ) +INSERT_GENTPROTR_BASIC( herk1m ) #undef GENTPROT @@ -154,6 +158,7 @@ INSERT_GENTPROT_BASIC( symm3mh ) INSERT_GENTPROT_BASIC( symm3m1 ) INSERT_GENTPROT_BASIC( symm4mh ) INSERT_GENTPROT_BASIC( symm4m1 ) +INSERT_GENTPROT_BASIC( symm1m ) #undef GENTPROT @@ -178,6 +183,7 
@@ INSERT_GENTPROT_BASIC( syr2k3mh ) INSERT_GENTPROT_BASIC( syr2k3m1 ) INSERT_GENTPROT_BASIC( syr2k4mh ) INSERT_GENTPROT_BASIC( syr2k4m1 ) +INSERT_GENTPROT_BASIC( syr2k1m ) #undef GENTPROT @@ -200,6 +206,7 @@ INSERT_GENTPROT_BASIC( syrk3mh ) INSERT_GENTPROT_BASIC( syrk3m1 ) INSERT_GENTPROT_BASIC( syrk4mh ) INSERT_GENTPROT_BASIC( syrk4m1 ) +INSERT_GENTPROT_BASIC( syrk1m ) #undef GENTPROT @@ -226,6 +233,7 @@ INSERT_GENTPROT_BASIC( trmm33mh ) INSERT_GENTPROT_BASIC( trmm33m1 ) INSERT_GENTPROT_BASIC( trmm34mh ) INSERT_GENTPROT_BASIC( trmm34m1 ) +INSERT_GENTPROT_BASIC( trmm31m ) #undef GENTPROT @@ -247,6 +255,7 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( trmm3m1 ) INSERT_GENTPROT_BASIC( trmm4m1 ) +INSERT_GENTPROT_BASIC( trmm1m ) #undef GENTPROT @@ -268,4 +277,5 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROT_BASIC( trsm3m1 ) INSERT_GENTPROT_BASIC( trsm4m1 ) +INSERT_GENTPROT_BASIC( trsm1m ) diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c new file mode 100644 index 000000000..6279ab762 --- /dev/null +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -0,0 +1,202 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool_t row_pref = !col_pref; \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r =
&PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ + ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r* c_use; \ + inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should ONLY occur in + the context of trsm, whereby this virtual micro-kernel is called + directly from the trsm macro-kernel to update the micro-tile b11 + that exists within the packed row-panel of B. Indeed that is the + reason those cases MUST be explicitly handled. */ \ + if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ + else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* In the atypical cases, we compute the result into temporary + workspace ct and then accumulate it back to c at the end. */ \ +\ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements.
*/ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* ct = alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + zero_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ + dim_t i, j; \ +\ + /* Accumulate the final result in ct back to c. */ \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ + else \ + { \ + /* In the typical cases, we use the real part of beta and + accumulate directly into the output matrix c. */ \ +\ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides.
*/ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + beta_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) + diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev new file mode 100644 index 000000000..3760bdd7c --- /dev/null +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev @@ -0,0 +1,188 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + /*const bool_t row_pref = !col_pref;*/ \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r beta_use; \ +\ + ctype_r* c_use; \ +
inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* Sanity check: These should never occur because storage/preference + agreement is handled at a higher level. */ \ + /* + if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ + */ \ +\ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should never occur + (though we could handle them if they did occur). */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ + /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements. */ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + beta_use = *zero_r; \ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ + else \ + { \ + /* In a typical case, we use the real part of beta and accumulate + directly into the output matrix c. 
*/ \ + beta_use = beta_r; \ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ + } \ +\ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + &beta_use, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ +\ + /* If necessary, accumulate the final result in ct back to c. */ \ + if ( using_ct ) \ + { \ + dim_t i, j; \ +\ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) + diff --git a/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h b/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h index d7d5a258f..9b2dd5e5a 100644 --- a/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h +++ b/frame/ind/ukernels/gemm/bli_gemmind_ukr_ref.h @@ -55,4 +55,5 @@ INSERT_GENTPROTCO_BASIC( gemm3m1_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4mh_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4mb_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemm4m1_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemm1m_ukr_ref ) diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c new file mode 100644 index 000000000..c4ec44b54 --- /dev/null +++ b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c @@ -0,0 +1,244 @@ +/* + + BLIS + An 
object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a1x, \ + ctype* restrict a11, \ + ctype* restrict bx1, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ +\ + PASTECH(ch,trsm_ukr_ft) \ + ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ +\ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t mr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ + const dim_t nr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ +\ + ctype bt[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_bt; \ + inc_t cs_bt; \ +\ + inc_t rs_bt_r; \ + inc_t cs_bt_r; \ +\ + const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype_r* restrict a1x_r = ( ctype_r* )a1x; \ +\ + ctype_r* restrict bx1_r = ( ctype_r* )bx1; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + const ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ + const ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ +\ + ctype_r* b_use; \ + inc_t rs_b_use; \ + inc_t cs_b_use; \ +\ +\ + /* Handle alphas with non-zero imaginary components. 
*/ \ + /* NOTE: This branch should never execute because alphas with + non-zero imaginary components should be applied during + packing, and so the only alphas we should see here are + those exclusively in the real domain, either because the + value originally had no imaginary component (e.g. 4.0) or + because a 1.0 was sent in as a placeholder since the alpha + was applied during packing. */ \ + if ( 0 ) \ + if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ + { \ + bli_abort(); \ +\ +/* + ctype_r* restrict one_r = PASTEMAC(chr,1); \ +\ + const inc_t ld_b = rs_b; \ +\ + PASTEMAC(ch,scal1ms_mxn)( schema_b, \ + mr, \ + nr, \ + alpha, \ + b11, rs_b, cs_b, ld_b ); \ +\ + alpha_r = *one_r; \ +*/ \ + } \ +\ +\ + { \ + /* Set the strides for the temporary bt matrix based on the native + real domain micro-kernel storage preferences. */ \ + if ( col_pref ) { rs_bt = 1; cs_bt = mr; \ + rs_bt_r = 1; cs_bt_r = mr_r; } \ + else { rs_bt = nr; cs_bt = 1; \ + rs_bt_r = nr_r; cs_bt_r = 1; } \ +\ + b_use = ( ctype_r* )bt; \ + rs_b_use = rs_bt_r; \ + cs_b_use = cs_bt_r; \ + } \ +\ +\ + /* Since b11 is stored in the 1e or 1r schema, we cannot update it + directly, and instead must compute the matrix product in a local + temporary microtile and then accumulate it into b11 according to + its schema. 
*/ \ +\ +\ + /* lower: bt = -1.0 * a10 * b01; + upper: bt = -1.0 * a12 * b21; */ \ + rgemm_ukr \ + ( \ + k2, \ + minus_one_r, \ + a1x_r, \ + bx1_r, \ + zero_r, \ + b_use, rs_b_use, cs_b_use, \ + data, \ + cntx \ + ); \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t ld_b = rs_b; \ +\ + ctype* restrict b11_ri = ( ctype* )b11; \ + ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \ +\ + dim_t i, j; \ +\ + /* b11 = alpha * b11 + bt; */ \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype* restrict beta11_ri = b11_ri + i*rs_b + j*cs_b; \ + ctype_r* restrict beta11_r = &PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r* restrict beta11_i = &PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype* restrict beta11_ir = b11_ir + i*rs_b + j*cs_b; \ +\ + PASTEMAC2(chr,ch,xpbyris)( *beta11t_r, \ + *beta11t_i, \ + alpha_r, \ + alpha_i, /* alpha_i not referenced */ \ + *beta11_r, \ + *beta11_i ); \ +\ + PASTEMAC(ch,sets)( -*beta11_i, \ + *beta11_r, *beta11_ir ); \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t ld_b = rs_b; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = cs_b; \ +\ + ctype_r* restrict b11_r = ( ctype_r* )b11; \ + ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \ +\ + dim_t i, j; \ +\ + /* b11 = alpha * b11 + bt; */ \ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2; \ + ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2; \ +\ + PASTEMAC2(chr,ch,xpbyris)( *beta11t_r, \ + *beta11t_i, \ + alpha_r, \ + alpha_i, /* alpha_i not referenced */ \ + 
*beta11_r, \ + *beta11_i ); \ + } \ + } \ +\ +\ + /* b11 = inv(a11) * b11; + c11 = b11; */ \ + ctrsm_vir_ukr \ + ( \ + a11, \ + b11, \ + c11, rs_c, cs_c, \ + data, \ + cntx \ + ); \ +} + +INSERT_GENTFUNCCO_BASIC2( gemmtrsm1m_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) +INSERT_GENTFUNCCO_BASIC2( gemmtrsm1m_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) + diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h b/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h index 7ec51ad8d..615482e41 100644 --- a/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h +++ b/frame/ind/ukernels/trsm/bli_gemmtrsmind_x_ukr_ref.h @@ -55,3 +55,6 @@ INSERT_GENTPROTCO_BASIC( gemmtrsm4m1_u_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemmtrsm3m1_l_ukr_ref ) INSERT_GENTPROTCO_BASIC( gemmtrsm3m1_u_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemmtrsm1m_l_ukr_ref ) +INSERT_GENTPROTCO_BASIC( gemmtrsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c new file mode 100644 index 000000000..ab5617795 --- /dev/null +++ b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c @@ -0,0 +1,448 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t m = mr; \ + const dim_t n = nr; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + const inc_t ld_a = cs_a; \ + const inc_t ld_b = rs_b; \ +\ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t rs_a2 = 1 * rs_a; \ + const inc_t cs_a2 = 2 * cs_a; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ +\ + ctype* restrict 
b_ri = ( ctype* )b; \ + ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ + ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ + ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \ + ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \ + ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \ + ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \ + ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. 
*/ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ + PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + } \ + } \ + } \ + else /* ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 1 * cs_b; \ +\ + ctype* restrict a_ri = ( ctype* )a; \ + /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ + ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ + ctype* restrict a10t_ri = a_ri + (i )*rs_a + (0 )*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_r = b_r + (0 )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_i = b_i + (0 )*rs_b2 + (0 )*cs_b2; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b01_r = B0_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b01_i = B0_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype* restrict alpha10_ri = a10t_ri + (l )*cs_a; \ + 
ctype_r* restrict alpha10_r = &PASTEMAC(ch,real)( *alpha10_ri ); \ + ctype_r* restrict alpha10_i = &PASTEMAC(ch,imag)( *alpha10_ri ); \ + ctype_r* restrict beta01_r = b01_r + (l )*rs_b2; \ + ctype_r* restrict beta01_i = b01_i + (l )*rs_b2; \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *beta11_r, \ + *beta11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm1m_l_ukr_ref ) + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t m = mr; \ + const dim_t n = nr; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ + const inc_t cs_b = 1; \ +\ + const inc_t ld_a = cs_a; \ + const inc_t ld_b = rs_b; \ +\ + const pack_t schema_b = 
bli_cntx_schema_b_panel( cntx ); \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + if ( bli_is_1e_packed( schema_b ) ) \ + { \ + const inc_t rs_a2 = 1 * rs_a; \ + const inc_t cs_a2 = 2 * cs_a; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ +\ + ctype* restrict b_ri = ( ctype* )b; \ + ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ + ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \ + ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \ + ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ + ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ + ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ + ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \ + ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \ + ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, 
\ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ + PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + } \ + } \ + } \ + else /* if ( bli_is_1r_packed( schema_b ) ) */ \ + { \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 1 * cs_b; \ +\ + ctype* restrict a_ri = ( ctype* )a; \ + /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \ + ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \ + ctype* restrict a12t_ri = a_ri + (i )*rs_a + (i+1)*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_r = b_r + (i+1)*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_i = b_i + (i+1)*rs_b2 + (0 )*cs_b2; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b21_r = B2_r + (0 )*rs_b2 + (j )*cs_b2; \ + ctype_r* restrict b21_i = B2_i + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c 
+ (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(ch,set0ris)( rho11_r, \ + rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype* restrict alpha12_ri = a12t_ri + (l )*cs_a; \ + ctype_r* restrict alpha12_r = &PASTEMAC(ch,real)( *alpha12_ri ); \ + ctype_r* restrict alpha12_i = &PASTEMAC(ch,imag)( *alpha12_ri ); \ + ctype_r* restrict beta21_r = b21_r + (l )*rs_b2; \ + ctype_r* restrict beta21_i = b21_i + (l )*rs_b2; \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. 
*/ \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *beta11_r, \ + *beta11_i ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h b/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h index abad11caf..77d502a3c 100644 --- a/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h +++ b/frame/ind/ukernels/trsm/bli_trsmind_x_ukr_ref.h @@ -51,3 +51,6 @@ INSERT_GENTPROTCO_BASIC( trsm4m1_u_ukr_ref ) INSERT_GENTPROTCO_BASIC( trsm3m1_l_ukr_ref ) INSERT_GENTPROTCO_BASIC( trsm3m1_u_ukr_ref ) +INSERT_GENTPROTCO_BASIC( trsm1m_l_ukr_ref ) +INSERT_GENTPROTCO_BASIC( trsm1m_u_ukr_ref ) + diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c deleted file mode 100644 index 5fc8e012c..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_l_ukr_ref.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a10, \ - ctype* restrict a11, \ - ctype* restrict b01, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - ctype_r ab_r[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - 
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a10_r = ( ctype_r* )a10; \ - ctype_r* restrict a10_i = ( ctype_r* )a10 + is_a; \ - ctype_r* restrict a10_ri = ( ctype_r* )a10 + 2*is_a; \ -\ - ctype_r* restrict b01_r = ( ctype_r* )b01; \ - ctype_r* restrict b01_i = ( ctype_r* )b01 + is_b; \ - ctype_r* restrict b01_ri = ( ctype_r* )b01 + 2*is_b; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ - ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( + a10.r * b01.r - a10.i * b01.i ); - b11.i = alpha.r * b11.i - ( a10.ri * b01.ri - a10.r * b01.r - a10.i * b01.i ); */ \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ -\ - /* ab.r = a10.r * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_r, \ - b01_r, \ - zero_r, \ - ab_r, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_ri, b01_ri, *data ); \ -\ - /* ab.i = a10.i * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_i, \ - b01_i, \ - zero_r, \ - ab_i, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a10.ri * b01.ri; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_ri, \ - b01_ri, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11.r = alpha.r * b11.r - ab.r; - b11.r = b11.r + ab.i; - b11.i = b11.i + ab.r; - b11.i = b11.i + ab.i; */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ - ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ -\ - PASTEMAC(chr,scals)( alpha_r, beta11_r ); \ -\ - PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11_r, \ - beta11_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Update the ri part of b11. 
*/ \ - PASTEMAC(chr,add3s)( beta11_r, \ - beta11_i, \ - *(b11_ri + i*rs_b + j*cs_b) ); \ - } \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -\ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r after", m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i after", m, n, \ - b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_r", k, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_i", k, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r", m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i", m, n, \ - b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m1_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c deleted file mode 100644 index 9d82ba8c9..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm3m1_u_ukr_ref.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a12, \ - ctype* restrict a11, \ - ctype* restrict b21, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - ctype_r ab_r[ 
BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype_r ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a12_r = ( ctype_r* )a12; \ - ctype_r* restrict a12_i = ( ctype_r* )a12 + is_a; \ - ctype_r* restrict a12_ri = ( ctype_r* )a12 + 2*is_a; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ - ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*is_b; \ -\ - ctype_r* restrict b21_r = ( ctype_r* )b21; \ - ctype_r* restrict b21_i = ( ctype_r* )b21 + is_b; \ - ctype_r* restrict b21_ri = ( ctype_r* )b21 + 2*is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( + a12.r * b21.r - a12.i * b21.i ); - b11.i = alpha.r * b11.i - ( a12.ri * b21.ri - a12.r * b21.r - a12.i * b21.i ); */ \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ -\ - /* ab.r = a12.r * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_r, \ - b21_r, \ - zero_r, \ - ab_r, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_ri, b21_ri, *data ); \ -\ - /* ab.i = a12.i * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_i, \ - b21_i, \ - zero_r, \ - ab_i, rs_ab, cs_ab, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a12.ri * b21.ri; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_ri, \ - b21_ri, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11.r = alpha.r * b11.r - ab.r; - b11.r = b11.r + ab.i; - b11.i = b11.i + ab.r; - b11.i = b11.i + ab.i; */ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ - ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ - ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ - ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ -\ - PASTEMAC(chr,scals)( alpha_r, beta11_r ); \ -\ - PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ - PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ - PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11_r, \ - beta11_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Update the ri part of b11. 
*/ \ - PASTEMAC(chr,add3s)( beta11_r, \ - beta11_i, \ - *(b11_ri + i*rs_b + j*cs_b) ); \ - } \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m1_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c deleted file mode 100644 index c979d5cbf..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_l_ukr_ref.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a10, \ - ctype* restrict a11, \ - ctype* restrict b01, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a10_r = ( ctype_r* )a10; \ - ctype_r* restrict a10_i = ( ctype_r* )a10 + is_a; \ -\ - ctype_r* restrict b01_r = ( ctype_r* )b01; \ - ctype_r* restrict b01_i = ( ctype_r* )b01 + is_b; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - 
ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -/* -printf( "gemmtrsm4m1_l_ukr: is_a = %lu is_b = %lu\n", is_a, is_b ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_r", m, k+m, \ - a10_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1011p_i", m, k+m, \ - a10_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. 
*/ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( a10.r * b01.r - a10.i * b01.i ); - b11.i = alpha.r * b11.i - ( a10.r * b01.i + a10.i * b01.r ); */ \ -\ - bli_auxinfo_set_next_ab( a10_r, b01_i, *data ); \ -\ - /* b11.r = alpha.r * b11.r - a10.r * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_r, \ - b01_r, \ - &alpha_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_r, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a10.r * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_r, \ - b01_i, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ -\ - /* b11.i = 1.0 * b11.i - a10.i * b01.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a10_i, \ - b01_r, \ - one_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.r = 1.0 * b11.r + a10.i * b01.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a10_i, \ - b01_i, \ - one_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r post-gemm", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i post-gemm", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_r after", k+m, n, \ - b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: b0111p_i after", k+m, n, \ - b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m1_l_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c 
b/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c deleted file mode 100644 index 9d1d1927e..000000000 --- a/frame/ind/ukernels/trsm/old/bli_gemmtrsm4m1_u_ukr_ref.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid, trsmkerid ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a12, \ - ctype* restrict a11, \ - ctype* restrict b21, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ -\ - PASTECH(ch,trsm_ukr_ft) \ - ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a12_r = ( ctype_r* )a12; \ - ctype_r* restrict a12_i = ( ctype_r* )a12 + is_a; \ -\ - ctype_r* restrict b11_r = ( ctype_r* )b11; \ - ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \ -\ - ctype_r* restrict b21_r = ( ctype_r* )b21; \ - ctype_r* restrict b21_i = ( ctype_r* )b21 + is_b; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - ctype_r* restrict one_r = PASTEMAC(chr,1); \ - ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ -\ - ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ - ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - void* a_next = bli_auxinfo_next_a( data ); \ - void* b_next = bli_auxinfo_next_b( data ); \ -\ - dim_t i, j; \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_r", m, k+m, \ - a11_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: a1112p_i", m, k+m, \ - a11_r+is_a, 1, 
PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_r", k+m, n, \ - b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_ukr: b1121p_i", k+m, n, \ - b11_r+is_b, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ - \ -\ - /* Copy the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on alpha, but to do - so we have to manually scale b and then use alpha == 1 for the - micro-kernel calls. */ \ - for ( i = 0; i < m; ++i ) \ - for ( j = 0; j < n; ++j ) \ - PASTEMAC(ch,scalris)( alpha_r, \ - alpha_i, \ - *(b11_r + i*rs_b + j*cs_b), \ - *(b11_i + i*rs_b + j*cs_b) ); \ -\ - /* Use alpha.r == 1.0. */ \ - alpha_r = *one_r; \ - } \ -\ -\ - /* b11.r = alpha.r * b11.r - ( a12.r * b21.r - a12.i * b21.i ); - b11.i = alpha.r * b11.r - ( a12.r * b21.i + a12.i * b21.r ); */ \ -\ - bli_auxinfo_set_next_ab( a12_r, b21_i, *data ); \ -\ - /* b11.r = alpha.r * b11.r - a12.r * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_r, \ - b21_r, \ - &alpha_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_r, *data ); \ -\ - /* b11.i = alpha.r * b11.i - a12.r * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_r, \ - b21_i, \ - &alpha_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ -\ - /* b11.i = 1.0 * b11.i - a12.i * b21.r; */ \ - rgemm_ukr \ - ( \ - k, \ - minus_one_r, \ - a12_i, \ - b21_r, \ - one_r, \ - b11_i, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ - bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ -\ - /* b11.r = 1.0 * b11.r + a12.i * b21.i; */ \ - rgemm_ukr \ - ( \ - k, \ - one_r, \ - a12_i, \ - b21_i, \ - one_r, \ - b11_r, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -\ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - ctrsm_vir_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, 
\ - cntx \ - ); \ -} - -INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m1_u_ukr_ref, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c deleted file mode 100644 index 62fff68e0..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm3m1_l_ukr_ref.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a10t_r = a_r + (i )*rs_a + 
(0 )*cs_a; \ - ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ - ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ - ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ - ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha10_r, \ - *alpha10_i, \ - *beta01_r, \ - *beta01_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ -\ - /* Update the ri part of the packed panel. */ \ - PASTEMAC(chr,add3s)( beta11c_r, \ - beta11c_i, \ - *beta11_ri ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm3m1_l_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c deleted file mode 100644 index af916ed33..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm3m1_u_ukr_ref.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ - ctype_r* restrict b_ri = ( ctype_r* )b + 2*is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - iter - 1; \ - n_behind = iter; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a12t_r = a_r + 
(i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ - ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ - ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha12_r, \ - *alpha12_i, \ - *beta21_r, \ - *beta21_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ -\ - /* Update the ri part of the packed panel. */ \ - PASTEMAC(chr,add3s)( beta11c_r, \ - beta11c_i, \ - *beta11_ri ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm3m1_u_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c deleted file mode 100644 index 06274d95c..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm4m1_l_ukr_ref.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_r", m, m, \ - a_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: a11p_i", m, m, \ - a_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r", m, n, \ - b_r, 
PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i", m, n, \ - b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a10t_r = a_r + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ - ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ - ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ - ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha10_r, \ - *alpha10_i, \ - *beta01_r, \ - *beta01_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. 
We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ - } \ - } \ -\ -/* -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_r after", m, n, \ - b_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -PASTEMAC(chr,fprintm)( stdout, "trsm4m1_l_ukr: b11p_i after", m, n, \ - b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm4m1_l_ukr_ref ) - diff --git a/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c b/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c deleted file mode 100644 index 5711dc8ce..000000000 --- a/frame/ind/ukernels/trsm/old/bli_trsm4m1_u_ukr_ref.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt_r = PASTEMAC(chr,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t is_a = bli_auxinfo_is_a( data ); \ - const inc_t is_b = bli_auxinfo_is_b( data ); \ -\ - ctype_r* restrict a_r = ( ctype_r* )a; \ - ctype_r* restrict a_i = ( ctype_r* )a + is_a; \ -\ - ctype_r* restrict b_r = ( ctype_r* )b; \ - ctype_r* restrict b_i = ( ctype_r* )b + is_b; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - 
iter - 1; \ - n_behind = iter; \ -\ - ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ - ctype_r* restrict a12t_r = a_r + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ - ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ - ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ - ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = *beta11_r; \ - ctype_r beta11c_i = *beta11_i; \ - ctype_r rho11_r; \ - ctype_r rho11_i; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(chr,set0s)( rho11_r ); \ - PASTEMAC(chr,set0s)( rho11_i ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ - ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ - ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ -\ - PASTEMAC(ch,axpyris)( *alpha12_r, \ - *alpha12_i, \ - *beta21_r, \ - *beta21_i, \ - rho11_r, \ - rho11_i ); \ - } \ - PASTEMAC(ch,subris)( rho11_r, \ - rho11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead - of alpha11, so we can multiply rather than divide. We store - the inverse of alpha11 intentionally to avoid expensive - division instructions within the micro-kernel. 
*/ \ - PASTEMAC(ch,scalris)( *alpha11_r, \ - *alpha11_i, \ - beta11c_r, \ - beta11c_i ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,sets)( beta11c_r, \ - beta11c_i, *gamma11 ); \ -\ - /* Store the local values back to b11. */ \ - PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ - PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ - } \ - } \ -} - -INSERT_GENTFUNCCO_BASIC0( trsm4m1_u_ukr_ref ) - diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index ad2bb0b40..9cccce228 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -60,7 +60,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -96,7 +96,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim2( m, m ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -139,7 +139,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -186,7 +186,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. 
*/ \ @@ -281,7 +281,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -319,7 +319,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ @@ -359,7 +359,7 @@ void PASTEMAC(ch,opname) \ if ( bli_zero_dim1( n ) ) return; \ \ /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ + /*bli_cntx_init_local_if( opname, dt, cntx, cntx_p );*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ diff --git a/mpi_test/Makefile b/mpi_test/Makefile index 1bb965b4a..2d2df10b7 100644 --- a/mpi_test/Makefile +++ b/mpi_test/Makefile @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 7e1fd33bb..433e745a7 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -36,7 +36,7 @@ # Makefile # # Field G. Van Zee -# +# # Makefile for standalone BLIS test drivers. # @@ -107,8 +107,9 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a # BLAS library path(s). This is where the BLAS libraries reside. 
HOME_LIB_PATH := $(HOME)/flame/lib #MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64 -MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 -ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 +#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +MKL_LIB_PATH := ${MKLROOT}/lib/intel64 +#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 ACML_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_int64/lib ACMLP_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_mp_int64/lib @@ -168,7 +169,7 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) #-I$(ACML_INC_PATH) LINKER := $(CC) LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64 -LDFLAGS += -lgfortran -lm -lpthread -fopenmp +LDFLAGS += -lgfortran -lm -lrt -lpthread -fopenmp # Datatype @@ -189,6 +190,7 @@ D3M1 := -DIND=BLIS_3M1 D4MHW := -DIND=BLIS_4MH D4M1B := -DIND=BLIS_4M1B D4M1A := -DIND=BLIS_4M1A +D1M := -DIND=BLIS_1M DNAT := -DIND=BLIS_NAT # Implementation string @@ -199,6 +201,7 @@ STR_3M1 := -DSTR=\"3m1\" STR_4MHW := -DSTR=\"4mhw\" STR_4M1B := -DSTR=\"4m1b\" STR_4M1A := -DSTR=\"4m1a\" +STR_1M := -DSTR=\"1m\" STR_NAT := -DSTR=\"asm\" STR_OBL := -DSTR=\"openblas\" STR_MKL := -DSTR=\"mkl\" @@ -213,9 +216,9 @@ PDEF_ST := -DP_BEGIN=40 \ -DP_END=2000 \ -DP_INC=40 -PDEF_MT := -DP_BEGIN=80 \ - -DP_END=4000 \ - -DP_INC=80 +PDEF_MT := -DP_BEGIN=200 \ + -DP_END=10000 \ + -DP_INC=200 @@ -259,6 +262,8 @@ blis-gemm-st: \ test_zgemm_4m1b_blis_st.x \ test_cgemm_4m1a_blis_st.x \ test_zgemm_4m1a_blis_st.x \ + test_cgemm_1m_blis_st.x \ + test_zgemm_1m_blis_st.x \ test_cgemm_asm_blis_st.x \ test_zgemm_asm_blis_st.x @@ -280,6 +285,8 @@ blis-gemm-mt: \ test_zgemm_4m1b_blis_mt.x \ test_cgemm_4m1a_blis_mt.x \ test_zgemm_4m1a_blis_mt.x \ + test_cgemm_1m_blis_mt.x \ + test_zgemm_1m_blis_mt.x \ test_cgemm_asm_blis_mt.x \ test_zgemm_asm_blis_mt.x @@ -290,6 +297,8 @@ openblas-gemm-st: \ test_zgemm_openblas_st.x openblas-gemm-mt: \ + test_sgemm_openblas_mt.x \ + 
test_dgemm_openblas_mt.x \ test_cgemm_openblas_mt.x \ test_zgemm_openblas_mt.x @@ -300,6 +309,8 @@ mkl-gemm-st: \ test_zgemm_mkl_st.x mkl-gemm-mt: \ + test_sgemm_mkl_mt.x \ + test_dgemm_mkl_mt.x \ test_cgemm_mkl_mt.x \ test_zgemm_mkl_mt.x @@ -310,6 +321,8 @@ acml-gemm-st: \ test_zgemm_acml_st.x acml-gemm-mt: \ + test_sgemm_acml_mt.x \ + test_dgemm_acml_mt.x \ test_cgemm_acml_mt.x \ test_zgemm_acml_mt.x @@ -411,6 +424,19 @@ test_z%_4m1a_blis_mt.o: test_%.c test_c%_4m1a_blis_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D4M1A) $(STR_4M1A) $(STR_MT) -c $< -o $@ +# blis 1m +test_z%_1m_blis_st.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@ + +test_c%_1m_blis_st.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@ + +test_z%_1m_blis_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ + +test_c%_1m_blis_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@ + # blis asm test_d%_asm_blis_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@ @@ -449,6 +475,12 @@ test_z%_openblas_st.o: test_%.c test_c%_openblas_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@ +test_d%_openblas_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + +test_s%_openblas_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + test_z%_openblas_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ @@ -468,6 +500,12 @@ test_z%_mkl_st.o: test_%.c test_c%_mkl_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@ +test_d%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) 
$(STR_MKL) $(STR_MT) -c $< -o $@ + +test_s%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ + test_z%_mkl_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ @@ -487,6 +525,12 @@ test_z%_acml_st.o: test_%.c test_c%_acml_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ +test_d%_acml_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + +test_s%_acml_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + test_z%_acml_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ diff --git a/test/3m4m/runme.sh b/test/3m4m/runme.sh index bb65a5db5..3f5d89023 100755 --- a/test/3m4m/runme.sh +++ b/test/3m4m/runme.sh @@ -4,17 +4,21 @@ exec_root="test" out_root="output" -sys="blis" +#sys="blis" #sys="stampede" +sys="lonestar" #sys="wahlberg" # Bind threads to processors. #export OMP_PROC_BIND=true #export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15" #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" -export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" #export GOMP_CPU_AFFINITY="0 2 4 6 1 3 5 7" #export GOMP_CPU_AFFINITY="0 4 1 5 2 6 3 7" +#export GOMP_CPU_AFFINITY="0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29 32 33 36 37 40 41 44 45" +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" # Modify LD_LIBRARY_PATH. if [ ${sys} = "blis" ]; then @@ -26,6 +30,11 @@ elif [ ${sys} = "stampede" ]; then # A hack to use libiomp5 with gcc. export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64" +elif [ ${sys} = "lonestar" ]; then + + # A hack to use libiomp5 with gcc. 
+ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" + elif [ ${sys} = "wahlberg" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HOME/flame/lib/acml/5.3.1/gfortran64_int64/lib" @@ -49,6 +58,14 @@ elif [ ${sys} = "stampede" ]; then ir_nt=1 # 1st loop nt=16 +elif [ ${sys} = "lonestar" ]; then + + jc_nt=2 # 5th loop + ic_nt=12 # 3rd loop + jr_nt=1 # 2nd loop + ir_nt=1 # 1st loop + nt=24 + elif [ ${sys} = "wahlberg" ]; then jc_nt=1 # 5th loop @@ -59,8 +76,10 @@ elif [ ${sys} = "wahlberg" ]; then fi # Threadedness to test. -threads="st mt" # st mt" -threads_r="st mt" # mt" +#threads="mt" +#threads_r="mt" +threads="st" +threads_r="st" # Datatypes to test. dts="z c" @@ -75,22 +94,31 @@ test_ops_r="${l3_ops}" if [ ${sys} = "blis" ]; then #test_impls="openblas mkl 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" - test_impls="openblas 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" elif [ ${sys} = "stampede" ]; then - test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" + #test_impls="openblas mkl asm_blis" + +elif [ ${sys} = "lonestar" ]; then + + test_impls="asm_blis 4mhw_blis 4m1a_blis 1m_blis 3m1_blis" + #test_impls="1m_blis 3m1_blis" + #test_impls="4m1a_blis" + #test_impls="mkl" #test_impls="openblas mkl asm_blis" elif [ ${sys} = "wahlberg" ]; then - test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis" + test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" test_impls="openblas acml asm_blis" fi # Real domain implementations to test. 
#test_impls_r="openblas mkl asm_blis" -test_impls_r="openblas asm_blis" +test_impls_r="asm_blis" +#test_impls_r="" # First perform real test cases. for th in ${threads_r}; do @@ -112,10 +140,11 @@ for th in ${threads_r}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. #if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" + # export MKL_NUM_THREADS=${nt} #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -124,7 +153,6 @@ for th in ${threads_r}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. @@ -165,10 +193,10 @@ for th in ${threads}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. #if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -177,7 +205,6 @@ for th in ${threads}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index c00ca4e25..1f9ea036c 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -49,6 +49,7 @@ int main( int argc, char** argv ) dim_t p; dim_t p_begin, p_end, p_inc; int m_input, n_input, k_input; + ind_t ind; num_t dt; char dt_ch; int r, n_repeats; @@ -70,6 +71,8 @@ int main( int argc, char** argv ) dt = DT; + ind = IND; + p_begin = P_BEGIN; p_end = P_END; p_inc = P_INC; @@ -78,19 +81,28 @@ int main( int argc, char** argv ) n_input = -1; k_input = -1; + + // Supress compiler warnings about unused variable 'ind'. 
+ ( void )ind; + #if 1 - cntx_t cntx; + cntx_t cntx; - // Initialize a context for the current induced method and datatype. - bli_gemm_cntx_init( &cntx ); + ind_t ind_mod = ind; - // Set k to the kc blocksize for the current datatype. - k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + bli_gemmind_cntx_init( ind_mod, dt, &cntx ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); #elif 0 - k_input = 256; + k_input = 256; #endif @@ -150,14 +162,14 @@ int main( int argc, char** argv ) bli_obj_set_conjtrans( transb, b ); bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( -(1.0/1.0), 0.0, &beta ); + bli_setsc( (1.0/1.0), 0.0, &beta ); bli_copym( &c, &c_save ); #ifdef BLIS bli_ind_disable_all_dt( dt ); - bli_ind_enable_dt( IND, dt ); + bli_ind_enable_dt( ind, dt ); #endif dtime_save = DBL_MAX; diff --git a/test/Makefile b/test/Makefile index aaf7381a2..0a7faf4e7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/testsuite/Makefile b/testsuite/Makefile index 6a1954d8c..acbdd7bf3 100644 --- a/testsuite/Makefile +++ b/testsuite/Makefile @@ -1,4 +1,4 @@ -#!/bin/bash +# # # BLIS # An object-based framework for developing high-performance BLAS-like diff --git a/testsuite/input.general b/testsuite/input.general index 0bf9053bd..b7fbd6b58 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -9,7 +9,7 @@ # 1 # Number of repeats per experiment (best result is reported) -c # Matrix storage scheme(s) to test: +rc # Matrix storage scheme(s) to test: # 'c' = col-major storage; 'g' = general stride storage; # 'r' = row-major storage c # Vector storage scheme(s) to test: @@ -26,16 +26,17 @@ sdcz # Datatype(s) 
to test: # 's' = single real; 'c' = single complex; # 'd' = double real; 'z' = double complex 100 # Problem size: first to test -400 # Problem size: maximum to test +500 # Problem size: maximum to test 100 # Problem size: increment between experiments # Complex level-3 implementations to test -1 # 3mh ('1' = enable; '0' = disable) -1 # 3m3 ('1' = enable; '0' = disable) -1 # 3m2 ('1' = enable; '0' = disable) -1 # 3m1 ('1' = enable; '0' = disable) -1 # 4mh ('1' = enable; '0' = disable) -1 # 4m1b ('1' = enable; '0' = disable) -1 # 4m1a ('1' = enable; '0' = disable) +0 # 3mh ('1' = enable; '0' = disable) +0 # 3m3 ('1' = enable; '0' = disable) +0 # 3m2 ('1' = enable; '0' = disable) +0 # 3m1 ('1' = enable; '0' = disable) +0 # 4mh ('1' = enable; '0' = disable) +0 # 4m1b ('1' = enable; '0' = disable) +0 # 4m1a ('1' = enable; '0' = disable) +1 # 1m ('1' = enable; '0' = disable) 1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index 6f5515127..41c0b9160 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -168,7 +168,7 @@ void libblis_test_axpy2v_experiment cntx_t cntx; // Initialize a context. - bli_axpy2v_cntx_init( &cntx ); + bli_axpy2v_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 706359ca4..8da15c315 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -166,7 +166,7 @@ void libblis_test_axpyf_experiment cntx_t cntx; // Initialize a context. - bli_axpyf_cntx_init( &cntx ); + bli_axpyf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index 36b88cc2f..6c1440e95 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -171,7 +171,7 @@ void libblis_test_dotaxpyv_experiment cntx_t cntx; // Initialize a context. - bli_dotaxpyv_cntx_init( &cntx ); + bli_dotaxpyv_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index dd83dc49e..a7abdba87 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -176,7 +176,7 @@ void libblis_test_dotxaxpyf_experiment cntx_t cntx; // Initialize a context. - bli_dotxaxpyf_cntx_init( &cntx ); + bli_dotxaxpyf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 3a29b41b7..8adec7c1d 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -168,7 +168,7 @@ void libblis_test_dotxf_experiment cntx_t cntx; // Initialize a context. - bli_dotxf_cntx_init( &cntx ); + bli_dotxf_cntx_init( datatype, &cntx ); // Map the dimension specifier to an actual dimension. 
m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index 222dca395..89a8bd7c3 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -259,8 +259,6 @@ void libblis_test_gemm_impl { case BLIS_TEST_SEQ_FRONT_END: bli_gemm( alpha, a, b, beta, c ); - //bli_gemm4m( alpha, a, b, beta, c ); - //bli_gemm3m( alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 514fdf66a..f418ac6e5 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -173,7 +173,7 @@ void libblis_test_gemm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_gemm_cntx_init( &cntx ); + bli_gemm_cntx_init( datatype, &cntx ); // Map the dimension specifier to actual dimensions. k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index afd436d7f..172ff053a 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -198,7 +198,7 @@ void libblis_test_gemmtrsm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_trsm_cntx_init( &cntx ); + bli_trsm_cntx_init( datatype, &cntx ); // Map the dimension specifier to actual dimensions. k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index bd14d13b4..993c134b4 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -427,6 +427,10 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_4M1A ]) ); + // Read whether to enable 1m. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_1M ]) ); + // Read whether to native (complex) execution. 
libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_NAT ]) ); @@ -597,8 +601,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) //char int_type_size_str[8]; gint_t int_type_size; ind_t im; - cntx_t cntx_s; - cntx_t* cntx = &cntx_s; + cntx_t cntx_local; + cntx_t cntx_local_c; + cntx_t cntx_local_z; + cntx_t* cntx = &cntx_local; + cntx_t* cntx_c = &cntx_local_c; + cntx_t* cntx_z = &cntx_local_z; // If bli_info_get_int_type_size() returns 32 or 64, the size is forced. // Otherwise, the size is chosen automatically. We query the result of @@ -721,7 +729,10 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmnat_cntx_init( cntx ); + // Initialize a context for the gemm family, assuming native execution. + // We use BLIS_DOUBLE for the datatype, but the dt argument is actually + // only used when initializing contexts for induced methods. 
+ bli_gemmnat_cntx_init( BLIS_DOUBLE, cntx ); libblis_test_fprintf_c( os, "level-3 blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d %7d %7d\n", @@ -825,42 +836,43 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_ind_oper_get_avail_impl_string( BLIS_GEMM, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmind_cntx_init( im, cntx ); + bli_gemmind_cntx_init( im, BLIS_SCOMPLEX, cntx_c ); + bli_gemmind_cntx_init( im, BLIS_DCOMPLEX, cntx_z ); libblis_test_fprintf_c( os, "level-3 blocksizes c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MC, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) ); libblis_test_fprintf_c( os, " kc %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_KC, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_KC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) ); libblis_test_fprintf_c( os, " nc %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MC, cntx_z ) ); libblis_test_fprintf_c( os, " kc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( 
BLIS_SCOMPLEX, BLIS_KC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_KC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_KC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_KC, cntx_z ) ); libblis_test_fprintf_c( os, " nc maximum %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NC, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NC, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mr %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) ); libblis_test_fprintf_c( os, " nr %7d %7d\n", - ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NR, cntx ), - ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NR, cntx ) ); + ( int )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ), + ( int )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, " mr packdim %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MR, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MR, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_MR, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_MR, cntx_z ) ); libblis_test_fprintf_c( os, " nr packdim %7d %7d\n", - ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NR, cntx ), - ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NR, cntx ) ); + ( int )bli_cntx_get_blksz_max_dt( BLIS_SCOMPLEX, BLIS_NR, cntx_c ), + ( int )bli_cntx_get_blksz_max_dt( BLIS_DCOMPLEX, BLIS_NR, cntx_z ) ); libblis_test_fprintf_c( os, "\n" 
); libblis_test_fprintf_c( os, "micro-kernel types c z\n" ); libblis_test_fprintf_c( os, " gemm %7s %7s\n", @@ -880,14 +892,17 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) bli_info_get_trsm_u_ukr_impl_string( im, BLIS_DCOMPLEX ) ); libblis_test_fprintf_c( os, "\n" ); - bli_gemmind_cntx_finalize( im, cntx ); + bli_gemmind_cntx_finalize( im, cntx_c ); + bli_gemmind_cntx_finalize( im, cntx_z ); } bli_ind_disable_all(); // We use hemv's context because we know it is initialized with all of the fields // we will be outputing. - bli_hemv_cntx_init( cntx ); + // We use BLIS_DOUBLE for the datatype, but the dt argument is actually + // only used when initializing contexts for induced methods. + bli_hemv_cntx_init( BLIS_DOUBLE, cntx ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "--- BLIS misc. other info ---\n" ); @@ -955,6 +970,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, " 4mh? %u\n", params->ind_enable[ BLIS_4MH ] ); libblis_test_fprintf_c( os, " 4m1b (4mb)? %u\n", params->ind_enable[ BLIS_4M1B ] ); libblis_test_fprintf_c( os, " 4m1a (4m1)? %u\n", params->ind_enable[ BLIS_4M1A ] ); + libblis_test_fprintf_c( os, " 1m? %u\n", params->ind_enable[ BLIS_1M ] ); libblis_test_fprintf_c( os, " native? %u\n", params->ind_enable[ BLIS_NAT ] ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index bf5f2d6bd..e7ccb4b43 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -178,7 +178,7 @@ void libblis_test_trsm_ukr_experiment cntx_t cntx; // Initialize a context. - bli_trsm_cntx_init( &cntx ); + bli_trsm_cntx_init( datatype, &cntx ); // Fix m and n to MR and NR, respectively. 
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, &cntx ); diff --git a/version b/version index 0c62199f1..ee1372d33 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.1 +0.2.2