diff --git a/CHANGELOG b/CHANGELOG index a5aae8601..14a619026 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,918 @@ -commit 089048d5895a30221b6b1976c9be93ad6443420d (HEAD, tag: 0.1.0, origin/master, master) +commit fde5f1fdece19881f50b142e8611b772a647e6d2 (HEAD, tag: 0.1.1, origin/master, origin/HEAD, master) +Author: Field G. Van Zee +Date: Tue Feb 25 13:34:56 2014 -0600 + + Added extensive support for configuration defaults. + + Details: + - Standard names for reference kernels (levels-1v, -1f and 3) are now + macro constants. Examples: + BLIS_SAXPYV_KERNEL_REF + BLIS_DDOTXF_KERNEL_REF + BLIS_ZGEMM_UKERNEL_REF + - Developers no longer have to name all datatype instances of a kernel + with a common base name; [sdcz] datatype flavors of each kernel or + micro-kernel (level-1v, -1f, or 3) may now be named independently. + This means you can now, if you wish, encode the datatype-specific + register blocksizes in the name of the micro-kernel functions. + - Any datatype instance of any kernel (1v, 1f, or 3) that is left + undefined in bli_kernel.h will default to the corresponding reference + implementation. For example, if BLIS_DGEMM_UKERNEL is left undefined, + it will be defined to be BLIS_DGEMM_UKERNEL_REF. + - Developers no longer need to name level-1v/-1f kernels with multiple + datatype chars to match the number of types the kernel WOULD take in + a mixed type environment, as in bli_dddaxpyv_opt(). Now, one char is + sufficient, as in bli_daxpyv_opt(). + - There is no longer a need to define an obj_t wrapper to go along with + your level-1v/-1f kernels. The framework now provides a _kernel() + function which serves as the obj_t wrapper for whatever kernels are + specified (or defaulted to) via bli_kernel.h. + - Developers no longer need to prototype their kernels, and thus no + longer need to include any prototyping headers from within + bli_kernel.h. 
The framework now generates kernel prototypes, with the + proper type signature, based on the kernel names defined (or defaulted + to) via bli_kernel.h. + - If the complex datatype x (of [cz]) implementation of the gemm micro- + kernel is left undefined by bli_kernel.h, but its same-precision real + domain equivalent IS defined, BLIS will use a 4m-based implementation + for the datatype x implementations of all level-3 operations, using + only the real gemm micro-kernel. + +commit 15b51e990f1d21333b5f7af97c211756247336e5 +Merge: 6363a9f fc04b5e +Author: Field G. Van Zee +Date: Fri Feb 21 09:04:32 2014 -0600 + + Merge branch 'master' of github.com:fgvanzee/blis + +commit fc04b5eb69868c341ce03f5ef1f02de4b8c121b0 +Merge: b29e1c2 d1813c9 +Author: Field G. Van Zee +Date: Fri Feb 21 09:04:13 2014 -0600 + + Merge pull request #3 from figual/master + + New ARM armv7a kernels and Assembly file consideration in Makefile + +commit d1813c9dee34410833db5061e6588ec1a6c9ecd4 +Author: Francisco Igual +Date: Fri Feb 21 15:14:31 2014 +0100 + + Added new armv7a micro-kernels and configuration files from Werner Saar. + +commit 0cd098c03a000ed9426a7e9135190696da8cadbc +Author: Francisco Igual +Date: Fri Feb 21 15:12:30 2014 +0100 + + o Modified Makefile to consider .S assembly microkernels. + +commit 6363a9f658257fe3d814a3dce5308f807adb54a2 +Author: Field G. Van Zee +Date: Wed Feb 19 17:00:52 2014 -0600 + + Added level-3 support for complex via 4m-/3m. + + Details: + - Added the ability to induce complex domain level-3 operations via new + virtual complex micro-kernels which are implemented via only real + domain micro-kernels. Two new implementations are provided: 4m and 3m. + 4m implements complex matrix multiplication in terms of four real + matrix multiplications, whereas 3m uses only three and thus is + capable of even higher (than peak) performance. However, the 3m method + has somewhat weaker numerical properties, making it less desirable + in general. 
+ - Further refined packing routines, which were recently revamped, and + added packing functionality for 4m and 3m. + - Some modifications to trmm and trsm macro-kernels to facilitate indexing + into micro-panels which were packed for 4m/3m virtual kernels. + - Added 4m and 3m interfaces for each level-3 operation. + - Various other minor changes to facilitate 4m/3m methods. + +commit b29e1c2b278c177e104c84ba462820ee8296df6c +Merge: ee60377 bd3c7ec +Author: Field G. Van Zee +Date: Fri Feb 14 14:11:54 2014 -0600 + + Merge pull request #2 from tlrmchlsmth/master + + Fixes and improvements to xeon phi implementation. + +commit bd3c7ecfb54a9b9851c7d364f41c21e4cff52f6f +Author: Tyler Smith +Date: Fri Feb 14 14:05:57 2014 -0600 + + Removing changes to input.general and input.operations + +commit ce066863683cb4e910270cf8ab8e138b01ff3358 +Author: Tyler Smith +Date: Fri Feb 14 13:40:24 2014 -0600 + + Fixed more Xeon Phi bugs, especially with scattered update + +commit 31134b5c7076423aee1b4f494e925f27171d97e6 +Author: Tyler Smith +Date: Fri Feb 14 11:19:44 2014 -0600 + + Some fixes, changes, and improvements to the microkernel to the Xeon Phi + +commit ee60377e467862b9d8a7205c45dce5cf66c78c46 +Author: Field G. Van Zee +Date: Thu Feb 13 14:03:31 2014 -0600 + + Shifted some fields in info_t. + + Details: + - Shifted the pack order, pack buffer type, and structure type fields + to make room for an extra bit in the pack type/status field. + +commit bd3ab1ad4cf42f8bc30ab262acf8eccb49bb1a08 +Author: Field G. Van Zee +Date: Thu Feb 13 09:29:55 2014 -0600 + + Minor fixes to trsm consistent with prev on trmm. + + Details: + - Removed use of bli_min() and bli_max() that were only being used to + try to support situations where the diagonal would intersect the + short end of some micro-panels, which is a situation that is disallowed + at a higher level by various constraints on the register and cache + blocksize. This only affected trsm_ll and trsm_lu. 
+ - Use panel stride as passed into the macro-kernel rather than compute + it via k and PACKMR/PACKNR. This affects all macro-kernels of trsm. + +commit 6260b0b5f8bd248f3f66e5a1c6854bdbd9d02ad0 +Author: Field G. Van Zee +Date: Thu Feb 13 09:19:56 2014 -0600 + + Fixed obscure bug in trmm_ll, trmm_lu. + + Details: + - Fixed an obscure bug in left-hand trmm that would only manifest when + non-zero register blocksize extensions (PACKMR > MR or PACKNR > NR) + are used. + - Removed use of bli_min() and bli_max() that were only being used to + try to support situations where the diagonal would intersect the + short end of some micro-panels, which is a situation that is disallowed + at a higher level by various constraints on the register and cache + blocksize. This only affected trmm_ll and trmm_lu. + - Use panel stride as passed into the macro-kernel rather than compute + it via k and PACKMR/PACKNR. This affects all macro-kernels of trmm. + +commit 16915c1c1e55c660bf82141cdadf7c0860d5b464 +Author: Field G. Van Zee +Date: Tue Feb 11 10:54:19 2014 -0600 + + Fixed an obscure bug in packm_cxk(). + + Details: + - Fixed a bug in packm_cxk() whereby the packm ukernel was being chosen + from ldp, which is always equal to PACKMR or PACKNR. The problem with + this is that the pack ukernels were implicitly assuming that the + panel dimension of the panel being packed was equal to ldp, which + is not the case when the register blocksizes extensions are non-zero + (ie: when PACKMR > MR or PACKNR > NR, whichever is applicable). This + problem has been fixed by passing ldp into the pack ukernels, which + now walk through the packed micro-panel region by incrementing by this + value, rather than incrementing by the inherent panel dimension value + assumed by each packm ukernel (e.g. 4 in the case of packm_ref_4xk). 
+ - Also fixed a very minor edge case inefficiency whereby pack ukernels + smaller than the default were not being used in edge cases, and instead + those situations were being handled by scal2m. This is related to the + issue above, because the pack ukernel itself was being chosen based on + ldp instead of the panel dimension. + +commit b7da57b282c5a5e2208946e60309d2352f55351d +Author: Field G. Van Zee +Date: Tue Feb 11 10:28:23 2014 -0600 + + Updated calls to packm_blk_var2() in testsuite. + + Details: + - In ukernel testsuite modules, replaced calls to packm_blk_var2() with + _var1(). Meant to include this in previous commit. + +commit c255a293e25b2223c88e8800267cd06ad2a90041 +Author: Field G. Van Zee +Date: Mon Feb 10 14:31:24 2014 -0600 + + Consolidated packm_blk_var2 and var3. + + Details: + - Consolidated the functionality previously supported by packm_blk_var2() + and packm_blk_var3() into a new variant, packm_blk_var1(). + - Updates to packm_gen_cxk(), packm_herm_cxk.c(), and packm_tri_cxk() + to accommodate above changes. + - Removed packm_blk_var3() and retired packm_blk_var2() to + frame/1m/packm/old. + - Updated all level-3 _cntl_init() functions so that the new, more + versatile packm_blk_var1 is used for all level-3 matrix packing. + +commit 32d8f264ae7b28155f5d7b21dcc5ecb78da2e0ab +Author: Field G. Van Zee +Date: Sun Feb 9 10:07:37 2014 -0600 + + Refactored packm variants. + + Details: + - Revised packm_blk_var2() and _var3() by encapsulating the general, + hermitian/symmetric, and triangular panel-packing subproblems into + separate functions: packm_gen_cxk(), packm_herm_cxk(), and + packm_tri_cxk(), respectively. Also, homogenized the packm code as + well as the new specialized packm_*_cxk() code to further improve + readability. + +commit 6c8067028707947fcdf4f856a272e15bb9ed91e3 +Author: Field G. Van Zee +Date: Fri Feb 7 11:27:15 2014 -0600 + + Renamed enumerated type in testsuite and modules. 
+ + Details: + - Renamed the test suite's "mt_impl_t" enumerated type to "iface_t", and + renamed all corresponding "impl" variables to "iface". + +commit 6c12598b1bc567f0b08f58aebdc753a1c1390378 +Author: Field G. Van Zee +Date: Thu Feb 6 18:26:35 2014 -0600 + + Employ simpler INSERT_ macro for ref ukernels. + + Details: + - Defined a new macro, INSERT_GENTFUNC_BASIC0, which takes only one + argument--the base name of the function--and employed this macro + in the reference micro-kernel files instead of the _BASIC macro, + which takes one auxiliary argument. That argument was not being + used and probably just acted to unnecessarily obfuscate. + +commit 32cae66326b68706d0e695cfd60c9ca5bc32c534 +Author: Field G. Van Zee +Date: Thu Feb 6 18:06:42 2014 -0600 + + Fixed some instances of sloppy 'restrict' usage. + + Details: + - Fixed some technical incorrectness with some usage of the 'restrict' + keyword in the reference trsm micro-kernels. + - Tweak to testsuite/Makefile that causes rebuild if libblis was + touched. + +commit 7aceef7683e2a2aff3c7ec2a73508036af2e19e2 +Author: Field G. Van Zee +Date: Thu Feb 6 17:31:19 2014 -0600 + + Updated comments in macro-kernels. + + Details: + - Updated (and fixed some errors in) the "Assumptions/assertions" comment + section of macro-kernels. + - Changed register blocksizes of reference configuration to MR = 8 and + NR = 4. It's always good for MR != NR in the reference configuration + since it may help uncover bugs related to non-square micro-kernels. + +commit 8fd292aa78950bcdf556605718f09d13f9575abc +Author: Field G. Van Zee +Date: Thu Feb 6 14:32:21 2014 -0600 + + Pass panel dimensions into macro-kernels. + + Details: + - Modified the interfaces to the datatype-specific macro-kernels so that: + - pd_a and pd_b are passed in (which contain the panel dimensions of + packed panels of a and b). + - rs_a and cs_b are no longer passed in (they were guaranteed to be 1). 
+ - Modified implementations of datatype-specific macro-kernels so pd_a, + pd_b, cs_a, and rs_b are used instead of cpp macros for MR, NR, PACKMR, + and PACKNR, respectively. + - Declare temporary c matrices (ct) as being maxmr-by-maxnr, which for now + is equivalent to being mr-by-nr. maxmr and maxnr are declared in a new + header file bli_kernel_post_macro_defs.h. + +commit 3404e6657eabb017cd1580a2f1dd8e6fb13df923 +Author: Field G. Van Zee +Date: Wed Feb 5 11:19:10 2014 -0600 + + Deprecated incremental blocksize macro const defs. + + Details: + - Removed macro constant definitions related to incremental blocksizes + from all configurations' bli_kernel.h files. This change is minor and + is mostly a cleanup related to a previous commit. + +commit 1e9afd39a63e0a58167d4439c1a0a880a4a35657 +Author: Field G. Van Zee +Date: Tue Feb 4 20:15:19 2014 -0600 + + Comment updates (removed vestiges of "bd"). + +commit 5cf58f7c2d5bc0d2d94d9576f7158d8f133b7aac +Author: Field G. Van Zee +Date: Tue Feb 4 09:15:19 2014 -0600 + + Added early returns for "object is zeros" case. + + Details: + - Added some logic to packm_init(), pack_int() and gemm_int() so that + (a) objects marked as BLIS_ZEROS are not packed, and (b) those + objects are not computed with. This functionality is not currently + needed by any existing implementations, but may be used in the + future. + +commit 6bbd4be769a9b344a55abe5ddaca1a99fd29f7b4 +Author: Field G. Van Zee +Date: Mon Feb 3 13:15:25 2014 -0600 + + Added 'f' on some gemm and trmm blocked variants. + + Details: + - Added 'f' to some block variant files/functions to be consistent with + other file/functions' naming convention. Here, the f indicates + partitioning in the "forward" direction. + +commit eb13cb2c6b182df5e2a9b88c76f50e2cee25b9e0 +Author: Field G. Van Zee +Date: Mon Feb 3 11:07:01 2014 -0600 + + Removed redundant non-gemm blksz_t creation. + + Details: + - Removed code that creates duplicate blksz_t objects for herk, trmm, + and trsm. 
Instead, the gemm blksz_t objects are accessed via extern + and used directly. This reduces the amount of code associated with + each of the three _cntl_init() and _cntl_finalize() functions. + +commit 0a023a7d9e58e53b8c204a5f49aa8ca9afeba938 +Author: Field G. Van Zee +Date: Wed Jan 29 14:02:08 2014 -0600 + + Introduced new level-3 front-end layer. + + Details: + - Added new _front() functions for each level-3 operation. This is done + so that the choosing of the control tree (and *only* the choosing of + the control tree) happens in what was previously the "front end" + (e.g. bli_gemm()). That control tree is then passed into the _front() + function, which then performs up-front tasks such as parameter + checking. + +commit 251c5d112196d37b183e554bc9d406104aed65fb +Author: Field G. Van Zee +Date: Tue Jan 28 19:40:29 2014 -0600 + + Removed redundant hemm, her2k control trees. + + Details: + - Removed code that generated a control tree specifically for hemm and + symm. Instead, the gemm control tree is now configured so that it + works for gemm, hemm, or symm. + - Retired most her2k code, as it was not being used. (Currently, her2k is + implemented as two invocations of herk.) I couldn't think of many + situations where her2k variants were needed. + - Removed some older her2k code. + +commit 5a36e5bf2f59d1e85d6dbce32a07d604c5e82d11 +Author: Field G. Van Zee +Date: Mon Jan 27 11:13:00 2014 -0600 + + Embed func_t microkernel objects in control trees. + + Details: + - Modified all control tree node definitions to include a new field of + type func_t*, which is similar to a blksz_t except that it contains + one function pointer (each typed simply as void*) for each datatype. + We use the func_t* to embed pointers to the micro-kernels to use for + the leaf-level nodes of each control tree. This change is a natural + extension of control trees and will allow more flexibility in the + future. 
+ - Modified all macro-kernel wrappers to obtain the micro-kernel pointers + from the incoming (previously ignored) control tree node and then pass + the queried pointer into the datatype-specific macro-kernel code, which + then casts the pointer to the appropriate type (new typedefs residing + in bli_kernel_type_defs.h) and then uses the pointer to call the micro- + kernel. Thus, the micro-kernel function is no longer "hard-coded" (that + is, determined when the datatype-specific macro-kernel functions are + instantiated by the C preprocessor). + - Added macros to bli_kernel_macro_defs.h that build datatype-specific + base names if they do not exist already, and then uses those to build + datatype-specific micro-kernel function names. This will allow + developers extra flexibility if they wanted to, for example, name each + of their datatype-specific micro-kernels differently (e.g. double + real might be named bli_dgemm_opt_4x4() while double complex might be + named bli_zgemm_opt_2x2()). + - Inserted appropriate code into _cntl_init() functions that allocates + and initializes a func_t object for the corresponding micro-kernels. + The gemm ukernel func_t object is created once, in bli_gemm_cntl_init(), + and then reused via extern wherever possible. + +commit 6cbd6f1c7f1915180aa28939833afde48665c5ae +Author: Field G. Van Zee +Date: Fri Jan 24 10:38:29 2014 -0600 + + Removed commented mixed domain macro-kernel code. + + Details: + - Removed commented-out code from macro-kernels that was supposed to + facilitate implementing mixed domain (complex times real) matrix + multiplication. This functionality is (probably) still possible, + but I'm getting tired of looking at the code every time I edit + a macro-kernel. Plus, there are probably ways of doing it at a + higher level, via control trees. + +commit 29778be1119f1a884330d7f8dc424a2df4101d58 +Author: Field G. Van Zee +Date: Wed Jan 22 16:03:11 2014 -0600 + + Removed b_aux field from cntl nodes. 
+ + Details: + - Removed b_aux field from all control tree node definitions. This field + was being used in certain optimizations (incremental blocking) that were + not actually being employed within BLIS, and are probably not employed + by others. + - Updated all _cntl_obj_create() function definitions and invocations + according to above change. + - Retired bli_gemm_blk_var4.c, which was one such function that employed + incremental blocking, but which was never called by BLIS itself. + +commit 06ac727a42ec9e832c7832745036702014638f99 +Author: Field G. Van Zee +Date: Wed Jan 15 16:44:52 2014 -0600 + + Updated some comments in level-3 front ends. + +commit d628bf1da1560f1f5126a1ddfed8714f0a4b8da3 +Author: Field G. Van Zee +Date: Wed Jan 15 11:40:12 2014 -0600 + + Consolidated pack_t enums; retired VECTOR value. + + Details: + - Changed the pack_t enumerations so that BLIS_PACKED_VECTOR no longer has + its own value, and instead simply aliases to BLIS_PACKED_UNSPEC. This + makes room in the three pack_t bits of the info field of obj_t so that + two values are now unused, and may be used for other future purposes. + - Updated sloppy terminology usage in comments in level-2 front-ends. + (Replaced "is contiguous" with more accurate "has unit stride".) + +commit ddc8c1c379b4787be5954802906593d7ea144452 +Author: Field G. Van Zee +Date: Mon Jan 13 14:55:43 2014 -0600 + + Suppress warning in Makefile (UNINSTALL_LIBS). + + Details: + - Redirect errors to /dev/null when using 'find' to locate libraries that + would be uninstalled upon executing "make uninstall-old". Before, if the + Makefile was read before $(INSTALL_PREFIX)/lib existed, a "No such file + or directory" message was emitted. This message was harmless, but is now + suppressed in this situation. + +commit f8f67d7251bffc05020e20527c100c8115fd5e55 +Author: Field G. Van Zee +Date: Fri Jan 10 09:06:11 2014 -0600 + + Typecast bli_getopt() return value in testsuite. 
+ + Details: + - In the test suite driver, inserted an explicit typecast of the return + value of bli_getopt() prior to parsing. The lack of typecast caused a + problem on at least one system whereby a return value of -1 was + interpreted as a garbage character. Thanks to Francisco Igual for finding + and submitting this fix. + +commit e7f154fe2ed3e10e2323cefe5d25c2c23ac902c4 +Author: Field G. Van Zee +Date: Fri Jan 10 08:48:07 2014 -0600 + + Applied edge case fix to arm/neon microkernel. + + Details: + - Applied an edge case bugfix, courtesy of Francisco Igual, to the current + double precision real gemm microkernel in kernels/arm/neon/3. + +commit 89c76a8a51d070d263c13bfa5ace65769509f2b4 +Author: Field G. Van Zee +Date: Thu Jan 9 12:08:37 2014 -0600 + + Allow building outside source distribution. + + Details: + - Modified build system (mostly configure and top-level Makefile) so that + a user can build a BLIS library outside of the top-level directory of + the source distribution. + - Added "test" target to Makefile so that the user can run "make test", + which will compile, link, and run the testsuite binary. This works even + if the build directory is externally located, thanks to the test suite + binary's new -g and -o command-line options. Also, when creating the + test suite via the top-level Makefile, the linking is against the + local archive, in lib/, rather than at /lib. + - Modified testsuite/Makefile so that it links against the library built + locally, in ../lib/. + - Added "-lm" to LDFLAGS of most configurations' make_defs.mk. + - Various other cleanups to build system. + +commit 12fa82ec12cc340ab28552997d9d50f7c98691f8 +Author: Field G. Van Zee +Date: Wed Jan 8 16:09:26 2014 -0600 + + Implemented bli_getopt(). + + Details: + - Added bli_getopt.c and .h files to frame/base. These files implement + a custom version of getopt(), which may be used to parse command line + options passed into a program via argc/argv. 
I am implementing this + function myself, as opposed to using the version available via unistd.h, + for portability reasons, as the only requirements are string.h (which + is available via the standard C library). + - Modified test suite to allow the user to specify the file name (and/or + path) to the parameters and operations input files: -g may be used to + specify the general input file and -o to specify the operations input + file). If -g or -o or both are not given, default filenames are assumed + (as well as their existence in the current directory). + +commit cafb58e86ea5cfb21b9eedc57ca8ebbf24252098 +Author: Field G. Van Zee +Date: Mon Jan 6 13:28:36 2014 -0600 + + Updated template micro-kernels to use auxinfo_t. + + Details: + - Updated template micro-kernel implementations (located in + config/template/kernels), to adhere to the new auxinfo_t interface. + Meant to include this change in a0331fb1. + - Changed template configuration to use 64-bit integers (for both BLIS + and the BLAS compatibility layer). + +commit 9ab126b499c3805045020cb89a8a5848e28d3bf5 +Author: Field G. Van Zee +Date: Mon Jan 6 12:13:26 2014 -0600 + + Removed error checks in netlib->BLIS param mapping + + Details: + - Disabled error checking in netlib-to-BLIS parameter mapping functions. + If the char value input to these functions was not one of the defined + values, bli_check_error_code() with the appropriate error code value + would be called, resulting in an abort(). This was unnecessary and + redundant since these routines are currently only used within the + BLAS compatibility layer, and they are only called AFTER parameter + checking has already been performed on the original BLAS char values. + If the application tried to override xerbla() to prevent an abort() + from being called, this error checking would still get in the way. 
+ Thus, instead of reporting the error situation to the framework (ie: + calling abort()), an arbitrary BLIS parameter value is now chosen and + the function returns normally. Thanks to Jeff Hammond for finding and + reporting this issue. + +commit 2cb13600f9f9601c60e7f96f4ca159d169ade9cb +Author: Field G. Van Zee +Date: Fri Jan 3 12:29:13 2014 -0600 + + Updated year in copyright headers to 2014. + +commit 290fa54e0083c9c837188b8321b13b1b282e7b0c +Author: Field G. Van Zee +Date: Fri Dec 20 14:10:26 2013 -0600 + + Store variable panel strides in trmm/trsm auxinfo. + + Details: + - Changed the value being stored into the auxinfo_t structure in trmm + and trsm macro-kernels. Whereas before we stored whatever value was + provided to the macro-kernel implementation via ps_a/ps_b, now we + store the stride that will advance to the next variable-length + micro-panel of the triangular matrix A (left) or B (right). + - Whitespace changes to the files affected above. + +commit e3a6c7e77667fd749248df3f75f880266c3136ec +Author: Field G. Van Zee +Date: Thu Dec 19 16:29:31 2013 -0600 + + Macroized conditionals for a2/b2 in macro-kernels. + + Details: + - Replaced conditional expressions in macro-kernels related to computing + the addresses a2 and b2 (a_next and b_next) with a preprocessor macro + invocation, bli_is_last_iter(), that tests the same condition. + - Updated gemm_ukr module to use auxinfo_t argument. + - Whitespace changes in test suite ukr modules. + +commit a0331fb10a50393e31d16339053b75b944132da1 +Author: Field G. Van Zee +Date: Thu Dec 19 14:50:11 2013 -0600 + + Introduced auxinfo_t argument to micro-kernels. + + Details: + - Removed a_next and b_next arguments to micro-kernels and replaced them + with a pointer to a new datatype, auxinfo_t, which is simply a struct + that holds a_next and b_next. The struct may hold other auxiliary + information that may be useful to a micro-kernel, such as micro-panel + stride. 
Micro-kernels may access struct fields via accessor macros + defined in bli_auxinfo_macro_defs.h. + - Updated all instances of micro-kernel definitions, micro-kernel calls, + as well as macro-kernels (for declaring and initializing the structs) + according to above change. + +commit 392428dea4001fe4384efe29f6cde32f8abeeb35 +Author: Field G. Van Zee +Date: Thu Dec 12 19:01:47 2013 -0600 + + Added "ri" scalar macros. + + Details: + - Added set of basic scalar macros that take arguments' real and + imaginary components separately, named like the previous set except + with the "ris" (instead of "s") suffix. + - Redefined the previous set of scalar macros (those that take arguments + "whole") in terms of the new "ri" set. + - Renamed setris and getris macros to sets and gets. + - Renamed setimag0 macros to seti0s. + - Use bli_?1 macro instead of a local constant in bla_trmv.c, bla_trsv.c. + +commit f60c8adc2f61eaba06b892f4e73000159de93056 +Author: Field G. Van Zee +Date: Tue Dec 10 14:39:56 2013 -0600 + + Minor updates to dunnington configuration. + + Details: + - Added commented alternatives to dunnington configuration's bli_kernel.h. + - Minor reformatting of optimization flag variables in make_defs.mk. + +commit 4ef20150492db254b5baf2368add62e19b0ac11b +Author: Field G. Van Zee +Date: Mon Dec 9 18:53:03 2013 -0600 + + Tweaks to dunnington configuration (x86_64/core2). + + Details: + - Updated BLIS_DEFAULT_KC_D from 256 to 384. + - Enabled cache blocksize extension of up to 25% for MC and KC (for + double-precision real). + +commit 5ad2ce7bf5ba3ea955e6d517bfd270e02820263b +Author: Field G. Van Zee +Date: Mon Dec 9 18:30:49 2013 -0600 + + Minor x86_64 (core2) kernel fixes. + + Details: + - Fixed copy-and-paste bug whereby [scz]gemmtrsm_u_opt_d4x4 kernels + for x86_64/core2 were calling the wrong reference code (l instead + of u). + - Fixed some unused variables in x86_64/core2 dotaxpyv and dotxaxpyf + kernels. + - Minor typecasting fix in testsuite/src/test_libblis.c. 
+ - Makefile updates. + +commit d289f5d3a9c0e1a68a17c1c32b736e282a289c4c +Author: Field G. Van Zee +Date: Thu Dec 5 10:56:13 2013 -0600 + + Whitespace changes to level-2 blocked variants. + + Details: + - Joined some lines in level-2 blocked variants to match formatting used + in level-3 blocked variants. + - Streamlined implementation of bli_obj_equals() in bli_query.c. + +commit b444489f100d218bc8ef29b01ff8489c358559f9 +Author: Field G. Van Zee +Date: Tue Dec 3 16:08:30 2013 -0600 + + Added new "attached" scalar representation. + + Details: + - Added infrastructure to support a new scalar representation, whereby + every object contains an internal scalar that defaults to 1.0. This + facilitates passing scalars around without having to house them in + separate objects. These "attached" scalars are stored in the internal + atom_t field of the obj_t struct, and are always stored to be the same + datatype as the object to which they are attached. Level-3 variants no + longer take scalar arguments, however, level-3 internal back-ends still + do; this is so that the calling function can perform subproblems such + as C := C - alpha * A * B on-the-fly without needing to change either + of the scalars attached to A or B. + - Removed scalar argument from packm_int(). + - Observe and apply attached scalars in scalm_int(), and removed scalar + from interface of scalm_unb_var1(). 
+ - Renamed the following functions (and corresponding invocations): + + bli_obj_init_scalar_copy_of() + -> bli_obj_scalar_init_detached_copy_of() + bli_obj_init_scalar() -> bli_obj_scalar_init_detached() + bli_obj_create_scalar_with_attached_buffer() + -> bli_obj_create_1x1_with_attached_buffer() + bli_obj_scalar_equals() -> bli_obj_equals() + + - Defined new functions: + + bli_obj_scalar_detach() + bli_obj_scalar_attach() + bli_obj_scalar_apply_scalar() + bli_obj_scalar_reset() + bli_obj_scalar_has_nonzero_imag() + bli_obj_scalar_equals() + + - Placed all bli_obj_scalar_* functions in a new file, bli_obj_scalar.c. + - Renamed the following macros: + + bli_obj_scalar_buffer() -> bli_obj_buffer_for_1x1() + bli_obj_is_scalar() -> bli_obj_is_1x1() + + - Defined new macros to set and copy internal scalars between objects: + + bli_obj_set_internal_scalar() + bli_obj_copy_internal_scalar() + + - In level-3 internal back-ends, added conditional blocks where alpha and + beta are checked for non-unit-ness. Those values for alpha and beta are + applied to the scalars attached to aliases of A/B/C, as appropriate, + before being passed into the variant specified by the control tree. + - In level-3 blocked variants, pass BLIS_ONE into subproblems instead of + alpha and/or beta. + - In level-3 macro-kernels, changed how scalars are obtained. Now, scalars + attached to A and B are multiplied together to obtain alpha, while beta + is obtained directly from C. + - In level-3 front-ends, removed old function calls meant to provide + future support for mixed domain/precision. These can be added back later + once that functionality is given proper treatment. Also, removed the + creating of copy-casts of alpha and beta since typecasting of scalars + is now implicitly handled in the internal back-ends when alpha and + beta are applied to the attached scalars. + +commit 992de486d6f23e69a623abd15ae77d7881d13871 +Merge: 9552e6e fd4ac63 +Author: Field G. 
Van Zee +Date: Mon Dec 2 13:58:46 2013 -0600 + + Unimplemented kernels now call reference. + + Details: + - Updated arm, bgq, loongson3a, and x86_64 kernels so that unimplemented + datatypes call the corresponding reference kernel. Previously, these + kernel functions called abort() with a "not yet implemented" error + message. + +commit fd4ac636d9a55cec1476a444bd4e70def219dc8f +Author: Field G. Van Zee +Date: Mon Dec 2 13:50:36 2013 -0600 + + Unimplemented kernels now call reference. + + Details: + - Updated micro-kernels for arm, bgq, loongson3a, and x86_64 so that + unimplemented kernel functions simply call the corresponding reference + implementation. (Previously, these unimplemented functions would + abort() with a "not yet implemented" message.) + +commit 9552e6ee824d4345d5e908e869e071d19829819a +Author: Field G. Van Zee +Date: Sun Nov 24 11:40:31 2013 -0600 + + Removed optional scaling from packm control tree. + + Details: + - Removed does_scale field from packm control tree node and + bli_packm_cntl_obj_create() interface. Adjusted all invocations of + _cntl_obj_create() accordingly. + - Redefined/renamed macros that are used in aliasing so that now, + bli_obj_alias_to() does a full alias (shallow copy) while + bli_obj_alias_for_packing() does a partial alias that preserves the + pack_mem-related fields of the aliasing (destination) object. + - Removed bli_trmm3_cntl.c, .h after realizing that the trmm control tree + will work just fine for bli_trmm3(). + - Removed some commented vestiges of the typecasting functionality needed + to support heterogeneous datatypes. + +commit e65c476284db9ef64b23191a21c2584b1083342f +Author: Field G. Van Zee +Date: Tue Nov 19 10:05:35 2013 -0600 + + Minor updates to packm_blk_var2.c and _blk_var3.c. + + Details: + - Comment updates to packm_blk_var2.c and packm_blk_var3.c. + - In packm_blk_var2(), call setm_unb_var1(), scal2m_unb_var1() directly + instead of setm(), scal2m(). 
+ +commit 9e1d0d4bca48eda54301d8976f203e2544c9df3a +Author: Field G. Van Zee +Date: Mon Nov 18 18:11:07 2013 -0600 + + Added trsm_l, trsm_u ukernels for x86_64/core2. + + Details: + - Added standalone trsm_l/trsm_u micro-kernels for x86_64 (core2). + These kernels are based on the gemmtrsm_l/gemmtrsm_u micro-kernels + that already existed in kernels/x86_64/core2-sse3/3. + +commit 85e7e02ea3a9190b6fcff5d46b00d41c79cb1242 +Merge: 67761e2 7072005 +Author: Field G. Van Zee +Date: Mon Nov 18 12:02:00 2013 -0600 + + Merge branch 'master'. Forgot to git-pull. + +commit 67761e224c92500eecf9c1540cc72bdd2fb27679 +Author: Field G. Van Zee +Date: Mon Nov 18 11:57:40 2013 -0600 + + Attempting to fix errors in bgq build. + + Details: + - Removed restrict declaration from b_cast and c_cast from + bli_trsm_lu_ker_var2.c and bli_trsm_rl_ker_var2.c. Curiously, they + are causing problems for xlc only in those two files and no other + macro-kernels. + - Fixed (hopefully) kernel function parameter type declarations in + kernels/bgq/1f/bli_axpyf_opt_var1.c and kernels/bgq/3/bli_gemm_8x8.c. + +commit 707200541d344f98cf34c9801954dbb36fbe0447 +Author: Field G. Van Zee +Date: Mon Nov 18 11:17:31 2013 -0600 + + Syntax error fix in x86_64/core2 gemmtrsm_u ukr. + +commit bbe2b84a49e7785d4d0c514cda34adfbe66478b0 +Author: Field G. Van Zee +Date: Mon Nov 18 11:11:06 2013 -0600 + + Updated Makefile in test, testsuite. + + Details: + - Updated Makefiles in test and testsuite directories to use the new + BLIS header installation directory scheme, which is to compile with + -I/include/blis instead of -I/include. + +commit 9bd7fcfd436625ca2108128086671319362f4d92 +Author: Field G. Van Zee +Date: Mon Nov 18 10:58:09 2013 -0600 + + Outer-to-inner 'restrict' fix in macro-kernels. + + Details: + - Fixed sloppy placement of 'restrict' pointer declarations in level-3 + macro-kernels. Previously, all restricted pointers were being declared + at the outer-most function scope level. 
While this violates the C99 + standard, very few of the compilers used with BLIS so far have seemed + to care. The lone exception has been IBM's xlc. Thanks to Tyler Smith + for identifying this bug (and suggesting the fix). + +commit 50549a6a31dd26cf63a013e0ede16b2c7ce835b6 +Author: Field G. Van Zee +Date: Sun Nov 17 18:31:27 2013 -0600 + + Changed header install directory to include/blis. + + Details: + - Changed top-level Makefile so that headers are installed to + $(INSTALL_PREFIX)/include/blis/. (Header directories are no longer + named by version/configuration and then symlinked.) + - Added uninstall targets, including uninstall-old to clean out old + library archives. + - Added GREP makefile definitions to all configurations' make_defs.mk. + +commit d70733abddfb9a95661897e1e4f3c1f3cfa7cbaa +Author: Field G. Van Zee +Date: Sat Nov 16 17:34:25 2013 -0600 + + Added ARM kernels, configurations. + + Details: + - Added kernels for ARM, and configurations for Cortex-A9 and Cortex-A15. + Thanks to Francisco Igual for contributing these kernels and + configurations. + +commit d37c2cff62089c86983c2f79762f4b5329037373 +Author: Field G. Van Zee +Date: Wed Nov 13 10:47:11 2013 -0600 + + Minor comment and Makefile changes. + + Details: + - Added missing 'check-config' and 'check-make-defs' targets to + testsuite/Makefile. + - Removed unused 'test' target from top-level Makefile. + - Comment changes to testsuite input files. + +commit 19885f893a17b91ee79bead0620d0f913392d4c5 +Author: Field G. Van Zee +Date: Mon Nov 11 12:09:21 2013 -0600 + + Updated some kernel comment headers. + + Details: + - Updated bgq and piledriver comment headers to use BLIS copyright header + instead of libflame. + +commit 1a4d698f42981d74fe5f29b980031e1ee7dc42d5 +Author: Field G. Van Zee +Date: Mon Nov 11 10:15:40 2013 -0600 + + CHANGELOG update (for 0.1.0). + +commit 089048d5895a30221b6b1976c9be93ad6443420d (tag: 0.1.0) Author: Field G. 
Van Zee Date: Sat Nov 9 17:18:00 2013 -0600 diff --git a/Makefile b/Makefile index d0dbeaf09..c6636932a 100644 --- a/Makefile +++ b/Makefile @@ -317,18 +317,26 @@ CFLAGS_KERNELS := $(CFLAGS_KERNELS) $(VERS_DEF) # Convert source file paths to object file paths by replacing the base source # directories with the base object directories, and also replacing the source # file suffix (eg: '.c') with '.o'. -MK_BLIS_CONFIG_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \ +MK_BLIS_FRAME_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \ $(filter %.c, $(MK_FRAME_SRC))) -MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \ +MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \ $(filter %.c, $(MK_FRAME_NOOPT_SRC))) -MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \ +MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \ $(filter %.c, $(MK_FRAME_KERNELS_SRC))) -MK_BLIS_FRAME_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \ +MK_BLIS_CONFIG_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \ + $(filter %.S, $(MK_CONFIG_SRC))) +MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \ $(filter %.c, $(MK_CONFIG_SRC))) -MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \ + +MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \ + $(filter %.S, $(MK_CONFIG_NOOPT_SRC))) +MK_BLIS_CONFIG_NOOPT_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \ $(filter %.c, $(MK_CONFIG_NOOPT_SRC))) -MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \ + +MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \ + $(filter %.S, $(MK_CONFIG_KERNELS_SRC))) +MK_BLIS_CONFIG_KERNELS_OBJS += $(patsubst 
$(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \ $(filter %.c, $(MK_CONFIG_KERNELS_SRC))) # Combine all of the object files into some readily-accessible variables. @@ -427,7 +435,7 @@ else @$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@ endif -$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.c $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH) +$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.[cS] $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH) ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) $(CC) $(call get_cflags_for_obj,$@) -c $< -o $@ else diff --git a/config/armv7a/bli_config.h b/config/armv7a/bli_config.h new file mode 100644 index 000000000..bf01caefe --- /dev/null +++ b/config/armv7a/bli_config.h @@ -0,0 +1,165 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_CONFIG_H +#define BLIS_CONFIG_H + + +// -- OPERATING SYSTEM --------------------------------------------------------- + + + +// -- INTEGER PROPERTIES ------------------------------------------------------- + +// The bit size of the integer type used to track values such as dimensions, +// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed +// integers while 64 results in 64-bit integers. Any other value results in use +// of the C99 type "long int". Note that this ONLY affects integers used +// internally within BLIS as well as those exposed in the native BLAS-like BLIS +// interface. +#define BLIS_INT_TYPE_SIZE 32 + + + +// -- FLOATING-POINT PROPERTIES ------------------------------------------------ + +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + +// Enable use of built-in C99 "float complex" and "double complex" types and +// associated overloaded operations and functions? Disabling results in +// scomplex and dcomplex being defined in terms of simple structs. +//#define BLIS_ENABLE_C99_COMPLEX + + + +// -- MULTITHREADING ----------------------------------------------------------- + +// The maximum number of BLIS threads that will run concurrently. 
+#define BLIS_MAX_NUM_THREADS 1 + + + +// -- MEMORY ALLOCATION -------------------------------------------------------- + +// -- Contiguous (static) memory allocator -- + +// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the +// contiguous memory pools. +#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS +#define BLIS_NUM_MC_X_NC_BLOCKS 0 + +// The maximum preload byte offset is used to pad the end of the contiguous +// memory pools so that the micro-kernel, when computing with the end of the +// last block, can exceed the bounds of the usable portion of the memory +// region without causing a segmentation fault. +#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128 + +// -- Memory alignment -- + +// It is sometimes useful to define the various memory alignments in terms +// of some other characteristics of the system, such as the cache line size +// and the page size. +#define BLIS_CACHE_LINE_SIZE 32 +#define BLIS_PAGE_SIZE 4096 + +// Alignment size needed by the instruction set for aligned SIMD/vector +// instructions. +#define BLIS_SIMD_ALIGN_SIZE 32 + +// Alignment size used to align local stack buffers within macro-kernel +// functions. +#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when allocating memory dynamically from the operating +// system (eg: posix_memalign()). To disable heap alignment and just use +// malloc() instead, set this to 1. +#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE + +// Alignment size used when sizing leading dimensions of dynamically +// allocated memory. +#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE + +// Alignment size used when allocating entire blocks of contiguous memory +// from the contiguous memory allocator. +#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE + + + +// -- MIXED DATATYPE SUPPORT --------------------------------------------------- + +// Basic (homogeneous) datatype support always enabled. 
+ +// Enable mixed domain operations? +//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT + +// Enable extra mixed precision operations? +//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT + + + +// -- MISCELLANEOUS OPTIONS ---------------------------------------------------- + +// Stay initialized after auto-initialization, unless and until the user +// explicitly calls bli_finalize(). +#define BLIS_ENABLE_STAY_AUTO_INITIALIZED + + + +// -- BLAS-to-BLIS COMPATIBILITY LAYER ----------------------------------------- + +// Enable the BLAS compatibility layer? +#define BLIS_ENABLE_BLAS2BLIS + +// The bit size of the integer type used to track values such as dimensions and +// leading dimensions (ie: column strides) within the BLAS compatibility layer. +// A value of 32 results in the compatibility layer using 32-bit signed integers +// while 64 results in 64-bit integers. Any other value results in use of the +// C99 type "long int". Note that this ONLY affects integers used within the +// BLAS compatibility layer. +#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32 + +// Fortran-77 name-mangling macros. +#define PASTEF770(name) name ## _ +#define PASTEF77(ch1,name) ch1 ## name ## _ +#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ + + + + +#endif + diff --git a/config/armv7a/bli_kernel.h b/config/armv7a/bli_kernel.h new file mode 100644 index 000000000..74f77b950 --- /dev/null +++ b/config/armv7a/bli_kernel.h @@ -0,0 +1,216 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_KERNEL_H +#define BLIS_KERNEL_H + + +// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- + +// -- Cache blocksizes -- + +// +// Constraints: +// +// (1) MC must be a multiple of: +// (a) MR (for zero-padding purposes) +// (b) NR (for zero-padding purposes when MR and NR are "swapped") +// (2) NC must be a multiple of +// (a) NR (for zero-padding purposes) +// (b) MR (for zero-padding purposes when MR and NR are "swapped") +// (3) KC must be a multiple of +// (a) MR and +// (b) NR (for triangular operations such as trmm and trsm). 
+// + +#define BLIS_DEFAULT_MC_S 432 +#define BLIS_DEFAULT_KC_S 352 +#define BLIS_DEFAULT_NC_S 4096 + +#define BLIS_DEFAULT_MC_D 192 +#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_NC_D 4096 + +#define BLIS_DEFAULT_MC_C 64 +#define BLIS_DEFAULT_KC_C 128 +#define BLIS_DEFAULT_NC_C 4096 + +#define BLIS_DEFAULT_MC_Z 64 +#define BLIS_DEFAULT_KC_Z 128 +#define BLIS_DEFAULT_NC_Z 4096 + +// -- Register blocksizes -- + +#define BLIS_DEFAULT_MR_S 4 +#define BLIS_DEFAULT_NR_S 4 + +#define BLIS_DEFAULT_MR_D 4 +#define BLIS_DEFAULT_NR_D 4 + +#define BLIS_DEFAULT_MR_C 2 +#define BLIS_DEFAULT_NR_C 2 + +#define BLIS_DEFAULT_MR_Z 2 +#define BLIS_DEFAULT_NR_Z 2 + +// NOTE: If the micro-kernel, which is typically unrolled to a factor +// of f, handles leftover edge cases (ie: when k % f > 0) then these +// register blocksizes in the k dimension can be defined to 1. + +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) + +// -- Register blocksize extensions (for packed micro-panels) -- + +// NOTE: These register blocksize "extensions" determine whether the +// leading dimensions used within the packed micro-panels are equal to +// or greater than their corresponding register blocksizes above. + +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 + +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 + +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 + +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 + + + + +// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- + + + + +// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ + + + + +// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- + +// -- gemm -- + +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 +#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4 +#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4 + +// -- trsm-related -- + + + + +// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- + +// -- packm -- + +// -- unpackm -- + + + + +// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- + +// -- axpy2v -- + +// -- dotaxpyv -- + +// -- axpyf -- + +// -- dotxf -- + +// -- dotxaxpyf -- + 
+ + + +// -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- + +// -- addv -- + +// -- axpyv -- + +// -- copyv -- + +// -- dotv -- + +// -- dotxv -- + +// -- invertv -- + +// -- scal2v -- + +// -- scalv -- + +// -- setv -- + +// -- subv -- + +// -- swapv -- + + + +#endif + diff --git a/config/armv7a/kernels b/config/armv7a/kernels new file mode 120000 index 000000000..c40c02857 --- /dev/null +++ b/config/armv7a/kernels @@ -0,0 +1 @@ +../../kernels/armv7a \ No newline at end of file diff --git a/config/armv7a/make_defs.mk b/config/armv7a/make_defs.mk new file mode 100644 index 000000000..0b13d31e1 --- /dev/null +++ b/config/armv7a/make_defs.mk @@ -0,0 +1,108 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# Only include this block of code once. +ifndef MAKE_DEFS_MK_INCLUDED +MAKE_DEFS_MK_INCLUDED := yes + + + +# +# --- Build definitions -------------------------------------------------------- +# + +# Variables corresponding to other configure-time options. +BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes +BLIS_ENABLE_STATIC_BUILD := yes +BLIS_ENABLE_DYNAMIC_BUILD := no + + + +# +# --- Utility program definitions ---------------------------------------------- +# + +SH := /bin/sh +MV := mv +MKDIR := mkdir -p +RM_F := rm -f +RM_RF := rm -rf +SYMLINK := ln -sf +FIND := find +GREP := grep +XARGS := xargs +RANLIB := ranlib +INSTALL := install -c + +# Used to refresh CHANGELOG. +GIT := git +GIT_LOG := $(GIT) log --decorate + + + +# +# --- Development tools definitions -------------------------------------------- +# + +# --- Determine the C compiler and related flags --- +CC := gcc +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). +CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g +CDBGFLAGS := #-g +CWARNFLAGS := -Wall +COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g +CKOPTFLAGS := $(COPTFLAGS) +CVECFLAGS := #-msse3 # -mfpmath=sse + +# Aggregate all of the flags into multiple groups: one for standard +# compilation, and one for each of the supported "special" compilation +# modes. CMISCFLAGS is shared by all groups, so keep -O flags out of it.
+CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) +CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS) + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cru + +# --- Determine the linker and related flags --- +LINKER := $(CC) +LDFLAGS := -lm + + + +# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block +endif diff --git a/config/bgq/bli_config.h b/config/bgq/bli_config.h index b486d86e3..39a627a6a 100644 --- a/config/bgq/bli_config.h +++ b/config/bgq/bli_config.h @@ -36,6 +36,9 @@ #define BLIS_CONFIG_H +#undef restrict + + // -- OPERATING SYSTEM --------------------------------------------------------- @@ -118,10 +121,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 32 - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/bgq/bli_kernel.h b/config/bgq/bli_kernel.h index 602073aae..02d9c89b5 100644 --- a/config/bgq/bli_kernel.h +++ b/config/bgq/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -76,35 +76,7 @@ #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 2048 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. 
- -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). +// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 @@ -122,10 +94,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -133,48 +129,22 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 - -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. 
(Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 @@ -182,44 +152,26 @@ // -- Default fusing factors for level-1f operations -- -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. +#define BLIS_L1F_FUSE_FAC_S 8 +#define BLIS_L1F_FUSE_FAC_D 4 +#define BLIS_L1F_FUSE_FAC_C 4 +#define BLIS_L1F_FUSE_FAC_Z 2 -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 +#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z +#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z +#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z -#define 
BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. - -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 @@ -229,16 +181,11 @@ #include "bli_gemm_8x8.h" -#define GEMM_UKERNEL gemm_8x8 -#define GEMM_UKERNEL_MT gemm_8x8_mt +#define BLIS_DGEMM_UKERNEL bli_dgemm_8x8 +#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -246,25 +193,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -272,25 +202,16 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#include "bli_axpyf_opt_var1.h" - -#define AXPYF_KERNEL 
axpyf_opt_var1 +#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -298,52 +219,30 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#include "bli_axpyv_opt_var1.h" - -#define AXPYV_KERNEL axpyv_opt_var1 +#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#include "bli_dotv_opt_var1.h" - -#define DOTV_KERNEL dotv_opt_var1 +#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define SCALV_KERNEL scalv_unb_var1 - // -- setv -- -#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/cortex-a15/bli_config.h b/config/cortex-a15/bli_config.h index b779d59df..f6be2e573 100644 --- a/config/cortex-a15/bli_config.h +++ b/config/cortex-a15/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. 
-#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/cortex-a15/bli_kernel.h b/config/cortex-a15/bli_kernel.h index 6db9e2e61..d5f97b26f 100644 --- a/config/cortex-a15/bli_kernel.h +++ b/config/cortex-a15/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -70,35 +70,7 @@ #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. - -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). 
+// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +88,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. + +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,93 +123,27 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. 
-#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 - -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. (Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. 
- -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 @@ -221,16 +151,11 @@ // -- gemm -- -#include "bli_gemm_opt_4x4.h" -#define GEMM_UKERNEL gemm_opt_4x4 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -238,25 +163,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -264,23 +172,14 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#define AXPYF_KERNEL axpyf_unb_var1 - // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -288,48 +187,26 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 - // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 - // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define 
SCALV_KERNEL scalv_unb_var1 - // -- setv -- -#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/cortex-a9/bli_config.h b/config/cortex-a9/bli_config.h index b779d59df..f6be2e573 100644 --- a/config/cortex-a9/bli_config.h +++ b/config/cortex-a9/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/cortex-a9/bli_kernel.h b/config/cortex-a9/bli_kernel.h index 8facfe24f..bf3ce7a59 100644 --- a/config/cortex-a9/bli_kernel.h +++ b/config/cortex-a9/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -70,35 +70,7 @@ #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. 
- -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). +// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +88,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,93 +123,27 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 - -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. 
(Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. - -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 @@ -221,16 +151,11 @@ // -- gemm -- -#include "bli_gemm_opt_4x4.h" -#define GEMM_UKERNEL gemm_opt_4x4 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -238,25 +163,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -264,23 +172,14 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#define AXPYF_KERNEL axpyf_unb_var1 - // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -288,48 +187,26 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 - // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 - // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define 
SCALV_KERNEL scalv_unb_var1 - // -- setv -- -#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/dunnington/bli_config.h b/config/dunnington/bli_config.h index badab1d5d..22fc0a412 100644 --- a/config/dunnington/bli_config.h +++ b/config/dunnington/bli_config.h @@ -69,7 +69,7 @@ // -- MULTITHREADING ----------------------------------------------------------- // The maximum number of BLIS threads that will run concurrently. -#define BLIS_MAX_NUM_THREADS 1 +#define BLIS_MAX_NUM_THREADS 24 @@ -80,7 +80,7 @@ // The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the // contiguous memory pools. #define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS -#define BLIS_NUM_KC_X_NC_BLOCKS 1 +#define BLIS_NUM_KC_X_NC_BLOCKS 4 #define BLIS_NUM_MC_X_NC_BLOCKS 0 // The maximum preload byte offset is used to pad the end of the contiguous @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. 
-#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16 - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/dunnington/bli_kernel.h b/config/dunnington/bli_kernel.h index 0ec647d64..0c5ebbdb9 100644 --- a/config/dunnington/bli_kernel.h +++ b/config/dunnington/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -55,20 +55,63 @@ // #define BLIS_DEFAULT_MC_S 768 -#define BLIS_DEFAULT_KC_S 256 -#define BLIS_DEFAULT_NC_S 8192 +#define BLIS_DEFAULT_KC_S 384 +#define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 -#define BLIS_DEFAULT_MC_C 128 -#define BLIS_DEFAULT_KC_C 256 -#define BLIS_DEFAULT_NC_C 4096 +//#define BLIS_DEFAULT_MC_C 384 +//#define BLIS_DEFAULT_KC_C 384 +//#define BLIS_DEFAULT_NC_C 4096 -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 256 -#define BLIS_DEFAULT_NC_Z 2048 +//#define BLIS_DEFAULT_MC_Z 192 +//#define BLIS_DEFAULT_KC_Z 384 +//#define BLIS_DEFAULT_NC_Z 4096 + +// NOTE: If 4m blocksizes are not defined here, they will be determined +// from the corresponding real domain blocksizes. +#define BLIS_DEFAULT_4M_MC_C 384 +#define BLIS_DEFAULT_4M_KC_C 512 +#define BLIS_DEFAULT_4M_NC_C 4096 + +#define BLIS_DEFAULT_4M_MC_Z 192 +#define BLIS_DEFAULT_4M_KC_Z 256 +#define BLIS_DEFAULT_4M_NC_Z 4096 + +// NOTE: If 3m blocksizes are not defined here, they will be determined +// from the corresponding real domain blocksizes. 
+#define BLIS_DEFAULT_3M_MC_C 384 +#define BLIS_DEFAULT_3M_KC_C 512 +#define BLIS_DEFAULT_3M_NC_C 4096 + +#define BLIS_DEFAULT_3M_MC_Z 192 +#define BLIS_DEFAULT_3M_KC_Z 256 +#define BLIS_DEFAULT_3M_NC_Z 4096 + +// -- Register blocksizes -- + +#define BLIS_DEFAULT_MR_S 8 +#define BLIS_DEFAULT_NR_S 4 + +#define BLIS_DEFAULT_MR_D 4 +#define BLIS_DEFAULT_NR_D 4 + +#define BLIS_DEFAULT_MR_C 4 +#define BLIS_DEFAULT_NR_C 2 + +#define BLIS_DEFAULT_MR_Z 2 +#define BLIS_DEFAULT_NR_Z 2 + +// NOTE: If the micro-kernel, which is typically unrolled to a factor +// of f, handles leftover edge cases (ie: when k % f > 0) then these +// register blocksizes in the k dimension can be defined to 1. + +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 // -- Cache blocksize extensions (for optimizing edge cases) -- @@ -78,48 +121,21 @@ // such an extension would encompass the remaining portion of the // matrix dimension. -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) -#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) +//#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define 
BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). - -#define BLIS_DEFAULT_MR_S 8 -#define BLIS_DEFAULT_NR_S 4 - -#define BLIS_DEFAULT_MR_D 4 -#define BLIS_DEFAULT_NR_D 4 - -#define BLIS_DEFAULT_MR_C 4 -#define BLIS_DEFAULT_NR_C 4 - -#define BLIS_DEFAULT_MR_Z 4 -#define BLIS_DEFAULT_NR_Z 4 - -// NOTE: If the micro-kernel, which is typically unrolled to a factor -// of f, handles leftover edge cases (ie: when k % f > 0) then these -// register blocksizes in the k dimension can be defined to 1. - -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,234 +143,99 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 -// Register blocksize extensions in the k dimension are not used. 
- -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. (Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. 
- -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 - // -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- -#include "bli_gemm_opt_d4x4.h" - -#include "bli_gemmtrsm_l_opt_d4x4.h" -#include "bli_gemmtrsm_u_opt_d4x4.h" -//#include "bli_gemmtrsm_l_ref_mxn.h" -//#include "bli_gemmtrsm_u_ref_mxn.h" - -//#include "bli_trsm_l_ref_4x4.h" -//#include "bli_trsm_u_ref_4x4.h" -#include "bli_trsm_l_ref_mxn.h" -#include "bli_trsm_u_ref_mxn.h" - // -- gemm -- -#define GEMM_UKERNEL gemm_opt_d4x4 +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x4 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x4 -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x4 -//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn +#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_4x4 +#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_4x4 -//#define TRSM_L_UKERNEL trsm_l_ref_4x4 -//#define TRSM_U_UKERNEL trsm_u_ref_4x4 -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn - - - -// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- - -// -- packm -- - -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - -// -- unpackm -- - -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk // -- LEVEL-1F 
KERNEL DEFINITIONS ---------------------------------------------- -#include "bli_axpy2v_opt_var1.h" -#include "bli_dotaxpyv_opt_var1.h" -#include "bli_axpyf_opt_var1.h" -#include "bli_dotxf_opt_var1.h" -#include "bli_dotxaxpyf_opt_var1.h" - // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_opt_var1 +#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1 // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_opt_var1 +#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1 // -- axpyf -- -#define AXPYF_KERNEL axpyf_opt_var1 +#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 // -- dotxf -- -#define DOTXF_KERNEL dotxf_opt_var1 +#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1 // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1 +#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1 + // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- -#include "bli_axpyv_opt_var1.h" -#include "bli_dotv_opt_var1.h" - // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_opt_var1 +#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_opt_var1 +#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define SCALV_KERNEL scalv_unb_var1 - // -- setv -- -#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/dunnington/make_defs.mk b/config/dunnington/make_defs.mk index 56e57a7b3..b8af82d5d 100644 --- a/config/dunnington/make_defs.mk +++ b/config/dunnington/make_defs.mk @@ -80,10 +80,10 @@ CC := gcc # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -CMISCFLAGS := -std=c99 # -fopenmp -pg +CMISCFLAGS := -std=c99 -fopenmp #-pg CDBGFLAGS := #-g CWARNFLAGS := -Wall -COPTFLAGS := -O2 -mfpmath=sse #-fomit-frame-pointer +COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer CKOPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer CVECFLAGS := -msse3 -march=native @@ -100,7 +100,7 @@ ARFLAGS := cru # --- Determine the linker and related flags --- LINKER := $(CC) -LDFLAGS := -lm +LDFLAGS := -lm -fopenmp diff --git a/config/loongson3a/bli_config.h b/config/loongson3a/bli_config.h index 849557543..23e32e10a 100644 --- a/config/loongson3a/bli_config.h +++ b/config/loongson3a/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16 - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/loongson3a/bli_kernel.h b/config/loongson3a/bli_kernel.h index f6ff97b1c..ac18cd71c 100644 --- a/config/loongson3a/bli_kernel.h +++ b/config/loongson3a/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -70,35 +70,7 @@ #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 2048 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. 
- -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). +// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +88,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,111 +123,39 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. 
(Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. - -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 // -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- -#include "bli_gemm_opt_d4x4.h" - // -- gemm -- -#define GEMM_UKERNEL gemm_opt_d4x4 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_d4x4 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -239,25 +163,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -265,23 +172,14 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#define AXPYF_KERNEL axpyf_unb_var1 - // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -289,48 +187,26 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 - // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 - // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv 
-- -#define SCALV_KERNEL scalv_unb_var1 - // -- setv -- -#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/mic/bli_config.h b/config/mic/bli_config.h index 3c8250292..637e71f74 100644 --- a/config/mic/bli_config.h +++ b/config/mic/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/mic/bli_kernel.h b/config/mic/bli_kernel.h index 1962b7eb0..7114adf52 100644 --- a/config/mic/bli_kernel.h +++ b/config/mic/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -70,35 +70,7 @@ #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 2048 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. 
- -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). +// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +88,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4) +#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4) +#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,93 +123,28 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 #define BLIS_EXTEND_MR_D 2 #define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. (Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) 
- -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. - -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 @@ -221,17 +152,10 @@ // -- gemm -- -#include "bli_gemm_opt_30x8.h" - -#define GEMM_UKERNEL gemm_opt_30x8 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -239,25 +163,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -265,23 +172,14 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#define AXPYF_KERNEL axpyf_unb_var1 - // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -289,48 +187,26 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 - // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 - // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define SCALV_KERNEL scalv_unb_var1 - // -- setv -- 
-#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/piledriver/bli_config.h b/config/piledriver/bli_config.h index 1115eb930..57ace8ba1 100644 --- a/config/piledriver/bli_config.h +++ b/config/piledriver/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/piledriver/bli_kernel.h b/config/piledriver/bli_kernel.h index 073445862..e904f61d7 100644 --- a/config/piledriver/bli_kernel.h +++ b/config/piledriver/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -70,35 +70,7 @@ #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 2048 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. 
- -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). +// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +88,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,93 +123,28 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. 
(Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. - -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 @@ -221,17 +152,10 @@ // -- gemm -- -#include "bli_gemm_4x6.h" - -#define GEMM_UKERNEL gemm_4x6 +#define BLIS_DGEMM_UKERNEL bli_dgemm_4x6 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -239,25 +163,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -265,23 +172,14 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#define AXPYF_KERNEL axpyf_unb_var1 - // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -289,48 +187,26 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 - // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 - // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define SCALV_KERNEL scalv_unb_var1 - // -- setv -- -#define 
SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/power7/bli_config.h b/config/power7/bli_config.h index 388828027..3e32f5367 100644 --- a/config/power7/bli_config.h +++ b/config/power7/bli_config.h @@ -116,10 +116,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16 - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/power7/bli_kernel.h b/config/power7/bli_kernel.h index 3a6d28d44..f63ad4690 100644 --- a/config/power7/bli_kernel.h +++ b/config/power7/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -70,35 +70,7 @@ #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 2048 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. 
- -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). +// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +88,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,93 +123,28 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. 
(Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. - -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 @@ -221,18 +152,12 @@ // -- gemm -- -//#define GEMM_UKERNEL gemm_ref_mxn - #include "bli_gemm_opt_8x4.h" -#define GEMM_UKERNEL gemm_opt_8x4 + +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -240,25 +165,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -266,23 +174,14 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#define AXPYF_KERNEL axpyf_unb_var1 - // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -290,48 +189,26 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 - // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 - // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define SCALV_KERNEL 
scalv_unb_var1 - // -- setv -- -#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/reference/bli_config.h b/config/reference/bli_config.h index d643a1f41..2078a080d 100644 --- a/config/reference/bli_config.h +++ b/config/reference/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/reference/bli_kernel.h b/config/reference/bli_kernel.h index f42d7385d..f6dc1dc24 100644 --- a/config/reference/bli_kernel.h +++ b/config/reference/bli_kernel.h @@ -35,300 +35,8 @@ #ifndef BLIS_KERNEL_H #define BLIS_KERNEL_H - -// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- - -// -- Default cache blocksizes -- - -// -// Constraints: -// -// (1) MC must be a multiple of: -// (a) MR (for zero-padding purposes) -// (b) NR (for zero-padding purposes when MR and NR are "swapped") -// (2) NC must be a multiple of -// (a) NR (for zero-padding purposes) -// (b) MR (for zero-padding purposes when MR and NR are "swapped") -// (3) KC must be a multiple of -// (a) MR and -// (b) NR (for triangular operations such as trmm and trsm). 
-// - -#define BLIS_DEFAULT_MC_S 64 -#define BLIS_DEFAULT_KC_S 128 -#define BLIS_DEFAULT_NC_S 4096 - -#define BLIS_DEFAULT_MC_D 64 -#define BLIS_DEFAULT_KC_D 128 -#define BLIS_DEFAULT_NC_D 4096 - -#define BLIS_DEFAULT_MC_C 64 -#define BLIS_DEFAULT_KC_C 128 -#define BLIS_DEFAULT_NC_C 4096 - -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 128 -#define BLIS_DEFAULT_NC_Z 4096 - -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. - -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). 
- -#define BLIS_DEFAULT_MR_S 8 -#define BLIS_DEFAULT_NR_S 4 - -#define BLIS_DEFAULT_MR_D 8 -#define BLIS_DEFAULT_NR_D 4 - -#define BLIS_DEFAULT_MR_C 8 -#define BLIS_DEFAULT_NR_C 4 - -#define BLIS_DEFAULT_MR_Z 8 -#define BLIS_DEFAULT_NR_Z 4 - -// NOTE: If the micro-kernel, which is typically unrolled to a factor -// of f, handles leftover edge cases (ie: when k % f > 0) then these -// register blocksizes in the k dimension can be defined to 1. - -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 - -// -- Register blocksize extensions (for packed micro-panels) -- - -// NOTE: These register blocksize "extensions" determine whether the -// leading dimensions used within the packed micro-panels are equal to -// or greater than their corresponding register blocksizes above. - -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 - -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 - -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 - -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 - -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 - - - -// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- - -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. (Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) 
- -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 - - - -// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ - -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. - -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 - - - -// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- - -// -- gemm -- - -#define GEMM_UKERNEL gemm_ref_mxn - -// -- trsm-related -- - -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn - - - -// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- - -// -- packm -- - -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - -// -- unpackm -- - -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk - - - -// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- - -// -- axpy2v -- - -#define AXPY2V_KERNEL axpy2v_unb_var1 - -// -- dotaxpyv -- - -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - -// -- axpyf -- - -#define AXPYF_KERNEL axpyf_unb_var1 - -// -- dotxf -- - -#define DOTXF_KERNEL dotxf_unb_var1 - -// -- dotxaxpyf -- - -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 - - - -// -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- - -// -- addv -- - -#define ADDV_KERNEL addv_unb_var1 - -// -- axpyv -- - -#define AXPYV_KERNEL axpyv_unb_var1 - -// -- copyv -- - -#define COPYV_KERNEL copyv_unb_var1 - -// -- dotv -- - -#define DOTV_KERNEL dotv_unb_var1 - -// -- 
dotxv -- - -#define DOTXV_KERNEL dotxv_unb_var1 - -// -- invertv -- - -#define INVERTV_KERNEL invertv_unb_var1 - -// -- scal2v -- - -#define SCAL2V_KERNEL scal2v_unb_var1 - -// -- scalv -- - -#define SCALV_KERNEL scalv_unb_var1 - -// -- setv -- - -#define SETV_KERNEL setv_unb_var1 - -// -- subv -- - -#define SUBV_KERNEL subv_unb_var1 - -// -- swapv -- - -#define SWAPV_KERNEL swapv_unb_var1 - +// In the reference configuration, we let all of the defaults take +// effect. Thus, no definitions are needed. #endif diff --git a/config/sandybridge/bli_config.h b/config/sandybridge/bli_config.h index e721c60c1..5816b5728 100644 --- a/config/sandybridge/bli_config.h +++ b/config/sandybridge/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/sandybridge/bli_kernel.h b/config/sandybridge/bli_kernel.h index f0797452a..b3634538a 100644 --- a/config/sandybridge/bli_kernel.h +++ b/config/sandybridge/bli_kernel.h @@ -38,7 +38,7 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- // // Constraints: @@ -70,35 +70,7 @@ #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. 
- -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). +// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +88,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. 
+ +//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) +//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) +//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) + +//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) +//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) +//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) + +//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) +//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) +//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) + +//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) +//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) +//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,111 +123,39 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. 
(Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. - -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 // -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- -#include "bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h" - // -- gemm -- -#define GEMM_UKERNEL gemm_opt_8x4_ref_u4_nodupl_avx1 +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4_ref_u4_nodupl_avx1 // -- trsm-related -- -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn @@ -239,25 +163,8 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - // -- unpackm -- -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk @@ -265,23 +172,14 @@ // -- axpy2v -- -#define AXPY2V_KERNEL axpy2v_unb_var1 - // -- dotaxpyv -- -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - // -- axpyf -- -#define AXPYF_KERNEL axpyf_unb_var1 - // -- dotxf -- -#define DOTXF_KERNEL dotxf_unb_var1 - // -- dotxaxpyf -- -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 @@ -289,48 +187,26 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 - // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 - // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 - // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 - // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 - // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 - // -- scal2v -- 
-#define SCAL2V_KERNEL scal2v_unb_var1 - // -- scalv -- -#define SCALV_KERNEL scalv_unb_var1 - // -- setv -- -#define SETV_KERNEL setv_unb_var1 - // -- subv -- -#define SUBV_KERNEL subv_unb_var1 - // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 - #endif diff --git a/config/template/bli_config.h b/config/template/bli_config.h index 01010091d..5c2734f5f 100644 --- a/config/template/bli_config.h +++ b/config/template/bli_config.h @@ -118,10 +118,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/config/template/bli_kernel.h b/config/template/bli_kernel.h index 7aeb2241d..3d84ee9e7 100644 --- a/config/template/bli_kernel.h +++ b/config/template/bli_kernel.h @@ -38,9 +38,8 @@ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- -// -- Default cache blocksizes -- +// -- Cache blocksizes -- -// // Constraints: // // (1) MC must be a multiple of: @@ -52,53 +51,24 @@ // (3) KC must be a multiple of // (a) MR and // (b) NR (for triangular operations such as trmm and trsm). 
-// -#define BLIS_DEFAULT_MC_S 64 -#define BLIS_DEFAULT_KC_S 128 -#define BLIS_DEFAULT_NC_S 4096 +#define BLIS_DEFAULT_MC_S 128 +#define BLIS_DEFAULT_KC_S 256 +#define BLIS_DEFAULT_NC_S 2048 -#define BLIS_DEFAULT_MC_D 64 -#define BLIS_DEFAULT_KC_D 128 -#define BLIS_DEFAULT_NC_D 4096 +#define BLIS_DEFAULT_MC_D 128 +#define BLIS_DEFAULT_KC_D 256 +#define BLIS_DEFAULT_NC_D 2048 -#define BLIS_DEFAULT_MC_C 64 -#define BLIS_DEFAULT_KC_C 128 -#define BLIS_DEFAULT_NC_C 4096 +#define BLIS_DEFAULT_MC_C 128 +#define BLIS_DEFAULT_KC_C 256 +#define BLIS_DEFAULT_NC_C 2048 -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 128 -#define BLIS_DEFAULT_NC_Z 4096 +#define BLIS_DEFAULT_MC_Z 128 +#define BLIS_DEFAULT_KC_Z 256 +#define BLIS_DEFAULT_NC_Z 2048 -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. - -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). 
+// -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 @@ -116,10 +86,34 @@ // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 +//#define BLIS_DEFAULT_KR_S 1 +//#define BLIS_DEFAULT_KR_D 1 +//#define BLIS_DEFAULT_KR_C 1 +//#define BLIS_DEFAULT_KR_Z 1 + +// -- Cache blocksize extensions (for optimizing edge cases) -- + +// NOTE: These cache blocksize "extensions" have the same constraints as +// the corresponding default blocksizes above. When these values are +// non-zero, blocksizes used at edge cases are extended (enlarged) if +// such an extension would encompass the remaining portion of the +// matrix dimension. + +//#define BLIS_EXTEND_MC_S 0 +//#define BLIS_EXTEND_KC_S 0 +//#define BLIS_EXTEND_NC_S 0 + +//#define BLIS_EXTEND_MC_D 0 +//#define BLIS_EXTEND_KC_D 0 +//#define BLIS_EXTEND_NC_D 0 + +//#define BLIS_EXTEND_MC_C 0 +//#define BLIS_EXTEND_KC_C 0 +//#define BLIS_EXTEND_NC_C 0 + +//#define BLIS_EXTEND_MC_Z 0 +//#define BLIS_EXTEND_KC_Z 0 +//#define BLIS_EXTEND_NC_Z 0 // -- Register blocksize extensions (for packed micro-panels) -- @@ -127,24 +121,52 @@ // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 +//#define BLIS_EXTEND_MR_S 0 +//#define BLIS_EXTEND_NR_S 0 -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 +//#define BLIS_EXTEND_MR_D 0 +//#define BLIS_EXTEND_NR_D 0 -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 +//#define BLIS_EXTEND_MR_C 0 +//#define BLIS_EXTEND_NR_C 0 -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 +//#define BLIS_EXTEND_MR_Z 0 +//#define BLIS_EXTEND_NR_Z 0 -// Register blocksize extensions in the k dimension are not used. 
-#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 + + +// -- LEVEL-3 MICRO-KERNELS --------------------------------------------------- + +// -- gemm -- + +#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_mxn +#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_mxn +#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_mxn +#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_mxn + +// -- trsm-related -- + +#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_opt_mxn +#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_mxn +#define BLIS_CGEMMTRSM_L_UKERNEL bli_cgemmtrsm_l_opt_mxn +#define BLIS_ZGEMMTRSM_L_UKERNEL bli_zgemmtrsm_l_opt_mxn + +#define BLIS_SGEMMTRSM_U_UKERNEL bli_sgemmtrsm_u_opt_mxn +#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_mxn +#define BLIS_CGEMMTRSM_U_UKERNEL bli_cgemmtrsm_u_opt_mxn +#define BLIS_ZGEMMTRSM_U_UKERNEL bli_zgemmtrsm_u_opt_mxn + +#define BLIS_STRSM_L_UKERNEL bli_strsm_l_opt_mxn +#define BLIS_DTRSM_L_UKERNEL bli_dtrsm_l_opt_mxn +#define BLIS_CTRSM_L_UKERNEL bli_ctrsm_l_opt_mxn +#define BLIS_ZTRSM_L_UKERNEL bli_ztrsm_l_opt_mxn + +#define BLIS_STRSM_U_UKERNEL bli_strsm_u_opt_mxn +#define BLIS_DTRSM_U_UKERNEL bli_dtrsm_u_opt_mxn +#define BLIS_CTRSM_U_UKERNEL bli_ctrsm_u_opt_mxn +#define BLIS_ZTRSM_U_UKERNEL bli_ztrsm_u_opt_mxn + @@ -158,17 +180,18 @@ // be packed here, but this tends to be much too expensive in practice to // actually employ.) 
-#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 +//#define BLIS_DEFAULT_L2_MC_S 1000 +//#define BLIS_DEFAULT_L2_NC_S 1000 -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 +//#define BLIS_DEFAULT_L2_MC_D 1000 +//#define BLIS_DEFAULT_L2_NC_D 1000 -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 +//#define BLIS_DEFAULT_L2_MC_C 1000 +//#define BLIS_DEFAULT_L2_NC_C 1000 + +//#define BLIS_DEFAULT_L2_MC_Z 1000 +//#define BLIS_DEFAULT_L2_NC_Z 1000 -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 @@ -176,66 +199,67 @@ // -- Default fusing factors for level-1f operations -- -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. +//#define BLIS_L1F_FUSE_FAC_S 8 +//#define BLIS_L1F_FUSE_FAC_D 4 +//#define BLIS_L1F_FUSE_FAC_C 4 +//#define BLIS_L1F_FUSE_FAC_Z 2 -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 +//#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +//#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +//#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +//#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z +//#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +//#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +//#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +//#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define 
BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z +//#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +//#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +//#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +//#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ -// -- Default register blocksizes for vectors -- +// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. +// -- axpy2v -- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 +#define BLIS_SAXPY2V_KERNEL bli_saxpy2v_opt_var1 +#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1 +#define BLIS_CAXPY2V_KERNEL bli_caxpy2v_opt_var1 +#define BLIS_ZAXPY2V_KERNEL bli_zaxpy2v_opt_var1 + +// -- dotaxpyv -- + +#define BLIS_SDOTAXPYV_KERNEL bli_sdotaxpyv_opt_var1 +#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1 +#define BLIS_CDOTAXPYV_KERNEL bli_cdotaxpyv_opt_var1 +#define BLIS_ZDOTAXPYV_KERNEL bli_zdotaxpyv_opt_var1 + +// -- axpyf -- + +#define BLIS_SAXPYF_KERNEL bli_saxpyf_opt_var1 +#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 +#define BLIS_CAXPYF_KERNEL bli_caxpyf_opt_var1 +#define BLIS_ZAXPYF_KERNEL bli_zaxpyf_opt_var1 + +// -- dotxf -- + +#define BLIS_SDOTXF_KERNEL bli_sdotxf_opt_var1 +#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1 +#define BLIS_CDOTXF_KERNEL bli_cdotxf_opt_var1 +#define BLIS_ZDOTXF_KERNEL bli_zdotxf_opt_var1 +// -- dotxaxpyf -- -// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- +#define BLIS_SDOTXAXPYF_KERNEL bli_sdotxaxpyf_opt_var1 +#define 
BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1 +#define BLIS_CDOTXAXPYF_KERNEL bli_cdotxaxpyf_opt_var1 +#define BLIS_ZDOTXAXPYF_KERNEL bli_zdotxaxpyf_opt_var1 -#include "bli_gemm_opt_mxn.h" -#include "bli_trsm_l_opt_mxn.h" -#include "bli_trsm_u_opt_mxn.h" -#include "bli_gemmtrsm_l_opt_mxn.h" -#include "bli_gemmtrsm_u_opt_mxn.h" - -// -- gemm -- - -#define GEMM_UKERNEL gemm_opt_mxn - -// -- trsm-related -- - -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_mxn - -#define TRSM_L_UKERNEL trsm_l_opt_mxn -#define TRSM_U_UKERNEL trsm_u_opt_mxn @@ -243,55 +267,30 @@ // -- packm -- -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk +//#define BLIS_SPACKM_2XK_KERNEL bli_spackm_ref_2xk +//#define BLIS_DPACKM_2XK_KERNEL bli_dpackm_ref_2xk +//#define BLIS_CPACKM_2XK_KERNEL bli_cpackm_ref_2xk +//#define BLIS_ZPACKM_2XK_KERNEL bli_zpackm_ref_2xk -// -- unpackm -- +//#define BLIS_SPACKM_4XK_KERNEL bli_spackm_ref_4xk +//#define BLIS_DPACKM_4XK_KERNEL bli_dpackm_ref_4xk +//#define BLIS_CPACKM_4XK_KERNEL bli_cpackm_ref_4xk +//#define BLIS_ZPACKM_4XK_KERNEL bli_zpackm_ref_4xk -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk +//#define BLIS_SPACKM_6XK_KERNEL bli_spackm_ref_6xk +//#define BLIS_DPACKM_6XK_KERNEL bli_dpackm_ref_6xk +//#define BLIS_CPACKM_6XK_KERNEL bli_cpackm_ref_6xk +//#define BLIS_ZPACKM_6XK_KERNEL bli_zpackm_ref_6xk +//#define BLIS_SPACKM_8XK_KERNEL 
bli_spackm_ref_8xk +//#define BLIS_DPACKM_8XK_KERNEL bli_dpackm_ref_8xk +//#define BLIS_CPACKM_8XK_KERNEL bli_cpackm_ref_8xk +//#define BLIS_ZPACKM_8XK_KERNEL bli_zpackm_ref_8xk +// ... -// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- +// (Commented definitions for 10, 12, 14, and 16 not shown). -#include "bli_axpy2v_opt_var1.h" -#include "bli_dotaxpyv_opt_var1.h" -#include "bli_axpyf_opt_var1.h" -#include "bli_dotxf_opt_var1.h" -#include "bli_dotxaxpyf_opt_var1.h" - -// -- axpy2v -- - -#define AXPY2V_KERNEL axpy2v_opt_var1 - -// -- dotaxpyv -- - -#define DOTAXPYV_KERNEL dotaxpyv_opt_var1 - -// -- axpyf -- - -#define AXPYF_KERNEL axpyf_opt_var1 - -// -- dotxf -- - -#define DOTXF_KERNEL dotxf_opt_var1 - -// -- dotxaxpyf -- - -#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1 @@ -299,47 +298,81 @@ // -- addv -- -#define ADDV_KERNEL addv_unb_var1 +//#define BLIS_SADDV_KERNEL bli_saddv_unb_var1 +//#define BLIS_DADDV_KERNEL bli_daddv_unb_var1 +//#define BLIS_CADDV_KERNEL bli_caddv_unb_var1 +//#define BLIS_ZADDV_KERNEL bli_zaddv_unb_var1 // -- axpyv -- -#define AXPYV_KERNEL axpyv_unb_var1 +#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt_var1 +#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 +#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt_var1 +#define BLIS_ZAXPYV_KERNEL bli_zaxpyv_opt_var1 // -- copyv -- -#define COPYV_KERNEL copyv_unb_var1 +//#define BLIS_SCOPYV_KERNEL bli_scopyv_unb_var1 +//#define BLIS_DCOPYV_KERNEL bli_dcopyv_unb_var1 +//#define BLIS_CCOPYV_KERNEL bli_ccopyv_unb_var1 +//#define BLIS_ZCOPYV_KERNEL bli_zcopyv_unb_var1 // -- dotv -- -#define DOTV_KERNEL dotv_unb_var1 +#define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1 +#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 +#define BLIS_CDOTV_KERNEL bli_cdotv_opt_var1 +#define BLIS_ZDOTV_KERNEL bli_zdotv_opt_var1 // -- dotxv -- -#define DOTXV_KERNEL dotxv_unb_var1 +//#define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1 +//#define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1 +//#define BLIS_CDOTXV_KERNEL 
bli_cdotxv_unb_var1 +//#define BLIS_ZDOTXV_KERNEL bli_zdotxv_unb_var1 // -- invertv -- -#define INVERTV_KERNEL invertv_unb_var1 +//#define BLIS_SINVERTV_KERNEL bli_sinvertv_unb_var1 +//#define BLIS_DINVERTV_KERNEL bli_dinvertv_unb_var1 +//#define BLIS_CINVERTV_KERNEL bli_cinvertv_unb_var1 +//#define BLIS_ZINVERTV_KERNEL bli_zinvertv_unb_var1 // -- scal2v -- -#define SCAL2V_KERNEL scal2v_unb_var1 +//#define BLIS_SSCAL2V_KERNEL bli_sscal2v_unb_var1 +//#define BLIS_DSCAL2V_KERNEL bli_dscal2v_unb_var1 +//#define BLIS_CSCAL2V_KERNEL bli_cscal2v_unb_var1 +//#define BLIS_ZSCAL2V_KERNEL bli_zscal2v_unb_var1 // -- scalv -- -#define SCALV_KERNEL scalv_unb_var1 +//#define BLIS_SSCALV_KERNEL bli_sscalv_unb_var1 +//#define BLIS_DSCALV_KERNEL bli_dscalv_unb_var1 +//#define BLIS_CSCALV_KERNEL bli_cscalv_unb_var1 +//#define BLIS_ZSCALV_KERNEL bli_zscalv_unb_var1 // -- setv -- -#define SETV_KERNEL setv_unb_var1 +//#define BLIS_SSETV_KERNEL bli_ssetv_unb_var1 +//#define BLIS_DSETV_KERNEL bli_dsetv_unb_var1 +//#define BLIS_CSETV_KERNEL bli_csetv_unb_var1 +//#define BLIS_ZSETV_KERNEL bli_zsetv_unb_var1 // -- subv -- -#define SUBV_KERNEL subv_unb_var1 +//#define BLIS_SSUBV_KERNEL bli_ssubv_unb_var1 +//#define BLIS_DSUBV_KERNEL bli_dsubv_unb_var1 +//#define BLIS_CSUBV_KERNEL bli_csubv_unb_var1 +//#define BLIS_ZSUBV_KERNEL bli_zsubv_unb_var1 // -- swapv -- -#define SWAPV_KERNEL swapv_unb_var1 +//#define BLIS_SSWAPV_KERNEL bli_sswapv_unb_var1 +//#define BLIS_DSWAPV_KERNEL bli_dswapv_unb_var1 +//#define BLIS_CSWAPV_KERNEL bli_cswapv_unb_var1 +//#define BLIS_ZSWAPV_KERNEL bli_zswapv_unb_var1 + diff --git a/config/template/kernels/1/bli_axpyv_opt_var1.c b/config/template/kernels/1/bli_axpyv_opt_var1.c index be9b11071..42f842483 100644 --- a/config/template/kernels/1/bli_axpyv_opt_var1.c +++ b/config/template/kernels/1/bli_axpyv_opt_var1.c @@ -36,59 +36,59 @@ -void bli_sssaxpyv_opt_var1( conj_t conjx, - dim_t n, - float* restrict alpha, - float* restrict x, inc_t incx, - float* restrict y, 
inc_t incy ) +void bli_saxpyv_opt_var1( conj_t conjx, + dim_t n, + float* restrict alpha, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy ) { /* Just call the reference implementation. */ - bli_sssaxpyv_unb_var1( conjx, - n, - alpha, - x, incx, - y, incy ); + BLIS_SAXPYV_KERNEL_REF( conjx, + n, + alpha, + x, incx, + y, incy ); } -void bli_dddaxpyv_opt_var1( conj_t conjx, - dim_t n, - double* restrict alpha, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy ) +void bli_daxpyv_opt_var1( conj_t conjx, + dim_t n, + double* restrict alpha, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy ) { /* Just call the reference implementation. */ - bli_dddaxpyv_unb_var1( conjx, - n, - alpha, - x, incx, - y, incy ); + BLIS_DAXPYV_KERNEL_REF( conjx, + n, + alpha, + x, incx, + y, incy ); } -void bli_cccaxpyv_opt_var1( conj_t conjx, - dim_t n, - scomplex* restrict alpha, - scomplex* restrict x, inc_t incx, - scomplex* restrict y, inc_t incy ) +void bli_caxpyv_opt_var1( conj_t conjx, + dim_t n, + scomplex* restrict alpha, + scomplex* restrict x, inc_t incx, + scomplex* restrict y, inc_t incy ) { /* Just call the reference implementation. */ - bli_cccaxpyv_unb_var1( conjx, - n, - alpha, - x, incx, - y, incy ); + BLIS_CAXPYV_KERNEL_REF( conjx, + n, + alpha, + x, incx, + y, incy ); } -void bli_zzzaxpyv_opt_var1( conj_t conjx, - dim_t n, - dcomplex* restrict alpha, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict y, inc_t incy ) +void bli_zaxpyv_opt_var1( conj_t conjx, + dim_t n, + dcomplex* restrict alpha, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict y, inc_t incy ) { /* Template axpyv kernel implementation @@ -193,11 +193,11 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx, // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - bli_zzzaxpyv_unb_var1( conjx, - n, - alpha, - x, incx, - y, incy ); + BLIS_ZAXPYV_KERNEL_REF( conjx, + n, + alpha, + x, incx, + y, incy ); return; } @@ -272,37 +272,3 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx, } } - - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. -// -#undef GENTFUNC3 -#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,opname)( \ - conj_t conjx, \ - dim_t n, \ - ctype_a* restrict alpha, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy \ - ) \ -{ \ - /* Just call the reference implementation. */ \ - PASTEMAC3(cha,chx,chy,varname)( conjx, \ - n, \ - alpha, \ - x, incx, \ - y, incy ); \ -} - - - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( axpyv_opt_var1, axpyv_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( axpyv_opt_var1, axpyv_unb_var1 ) -#endif - diff --git a/config/template/kernels/1/bli_dotv_opt_var1.c b/config/template/kernels/1/bli_dotv_opt_var1.c index ff4e8c5bf..bc16282cb 100644 --- a/config/template/kernels/1/bli_dotv_opt_var1.c +++ b/config/template/kernels/1/bli_dotv_opt_var1.c @@ -36,66 +36,66 @@ -void bli_sssdotv_opt_var1( conj_t conjx, - conj_t conjy, - dim_t n, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy, - float* restrict rho ) +void bli_sdotv_opt_var1( conj_t conjx, + conj_t conjy, + dim_t n, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + float* restrict rho ) { /* Just call the reference implementation. 
*/ - bli_sssdotv_unb_var1( conjx, - conjy, - n, - x, incx, - y, incy, - rho ); + BLIS_SDOTV_KERNEL_REF( conjx, + conjy, + n, + x, incx, + y, incy, + rho ); } -void bli_ddddotv_opt_var1( conj_t conjx, - conj_t conjy, - dim_t n, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy, - double* restrict rho ) +void bli_ddotv_opt_var1( conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + double* restrict rho ) { /* Just call the reference implementation. */ - bli_ddddotv_unb_var1( conjx, - conjy, - n, - x, incx, - y, incy, - rho ); + BLIS_DDOTV_KERNEL_REF( conjx, + conjy, + n, + x, incx, + y, incy, + rho ); } -void bli_cccdotv_opt_var1( conj_t conjx, - conj_t conjy, - dim_t n, - scomplex* restrict x, inc_t incx, - scomplex* restrict y, inc_t incy, - scomplex* restrict rho ) +void bli_cdotv_opt_var1( conj_t conjx, + conj_t conjy, + dim_t n, + scomplex* restrict x, inc_t incx, + scomplex* restrict y, inc_t incy, + scomplex* restrict rho ) { /* Just call the reference implementation. */ - bli_cccdotv_unb_var1( conjx, - conjy, - n, - x, incx, - y, incy, - rho ); + BLIS_CDOTV_KERNEL_REF( conjx, + conjy, + n, + x, incx, + y, incy, + rho ); } -void bli_zzzdotv_opt_var1( conj_t conjx, - conj_t conjy, - dim_t n, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict y, inc_t incy, - dcomplex* restrict rho ) +void bli_zdotv_opt_var1( conj_t conjx, + conj_t conjy, + dim_t n, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict y, inc_t incy, + dcomplex* restrict rho ) { /* Template dotv kernel implementation @@ -210,12 +210,12 @@ void bli_zzzdotv_opt_var1( conj_t conjx, // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - bli_zzzdotv_unb_var1( conjx, - conjy, - n, - x, incx, - y, incy, - rho ); + BLIS_ZDOTV_KERNEL_REF( conjx, + conjy, + n, + x, incx, + y, incy, + rho ); return; } @@ -310,36 +310,3 @@ void bli_zzzdotv_opt_var1( conj_t conjx, bli_zzcopys( dotxy, *rho ); } - - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. -// -#undef GENTFUNC3 -#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \ -\ -void PASTEMAC3(chx,chy,chr,opname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy, \ - ctype_r* restrict rho \ - ) \ -{ \ - /* Just call the reference implementation. */ \ - PASTEMAC3(chx,chy,chr,varname)( conjx, \ - conjy, \ - n, \ - x, incx, \ - y, incy, \ - rho ); \ -} - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( dotv_opt_var1, dotv_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( dotv_opt_var1, dotv_unb_var1 ) -#endif diff --git a/config/template/kernels/1f/bli_axpy2v_opt_var1.c b/config/template/kernels/1f/bli_axpy2v_opt_var1.c index a10866a39..3cfb48c40 100644 --- a/config/template/kernels/1f/bli_axpy2v_opt_var1.c +++ b/config/template/kernels/1f/bli_axpy2v_opt_var1.c @@ -36,88 +36,88 @@ -void bli_sssaxpy2v_opt_var1( - conj_t conjx, - conj_t conjy, - dim_t n, - float* restrict alpha1, - float* restrict alpha2, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy, - float* restrict z, inc_t incz - ) +void bli_saxpy2v_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + float* restrict alpha1, + float* restrict alpha2, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + float* restrict z, inc_t incz + ) { /* Just call the reference implementation. 
*/ - bli_sssaxpy2v_unb_var1( conjx, - conjy, - n, - alpha1, - alpha2, - x, incx, - y, incy, - z, incz ); + BLIS_SAXPY2V_KERNEL_REF( conjx, + conjy, + n, + alpha1, + alpha2, + x, incx, + y, incy, + z, incz ); } -void bli_dddaxpy2v_opt_var1( - conj_t conjx, - conj_t conjy, - dim_t n, - double* restrict alpha1, - double* restrict alpha2, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy, - double* restrict z, inc_t incz - ) +void bli_daxpy2v_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict alpha1, + double* restrict alpha2, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + double* restrict z, inc_t incz + ) { /* Just call the reference implementation. */ - bli_dddaxpy2v_unb_var1( conjx, - conjy, - n, - alpha1, - alpha2, - x, incx, - y, incy, - z, incz ); + BLIS_DAXPY2V_KERNEL_REF( conjx, + conjy, + n, + alpha1, + alpha2, + x, incx, + y, incy, + z, incz ); } -void bli_cccaxpy2v_opt_var1( - conj_t conjx, - conj_t conjy, - dim_t n, - scomplex* restrict alpha1, - scomplex* restrict alpha2, - scomplex* restrict x, inc_t incx, - scomplex* restrict y, inc_t incy, - scomplex* restrict z, inc_t incz - ) +void bli_caxpy2v_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + scomplex* restrict alpha1, + scomplex* restrict alpha2, + scomplex* restrict x, inc_t incx, + scomplex* restrict y, inc_t incy, + scomplex* restrict z, inc_t incz + ) { /* Just call the reference implementation. 
*/ - bli_cccaxpy2v_unb_var1( conjx, - conjy, - n, - alpha1, - alpha2, - x, incx, - y, incy, - z, incz ); + BLIS_CAXPY2V_KERNEL_REF( conjx, + conjy, + n, + alpha1, + alpha2, + x, incx, + y, incy, + z, incz ); } -void bli_zzzaxpy2v_opt_var1( - conj_t conjx, - conj_t conjy, - dim_t n, - dcomplex* restrict alpha1, - dcomplex* restrict alpha2, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict y, inc_t incy, - dcomplex* restrict z, inc_t incz - ) +void bli_zaxpy2v_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + dcomplex* restrict alpha1, + dcomplex* restrict alpha2, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict y, inc_t incy, + dcomplex* restrict z, inc_t incz + ) { /* Template axpy2v kernel implementation @@ -229,14 +229,14 @@ void bli_zzzaxpy2v_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_zzzaxpy2v_unb_var1( conjx, - conjy, - n, - alpha1, - alpha2, - x, incx, - y, incy, - z, incz ); + BLIS_ZAXPY2V_KERNEL_REF( conjx, + conjy, + n, + alpha1, + alpha2, + x, incx, + y, incy, + z, incz ); return; } @@ -396,41 +396,3 @@ void bli_zzzaxpy2v_opt_var1( } } - - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. -// -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \ -\ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_xy* restrict alpha1, \ - ctype_xy* restrict alpha2, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy, \ - ctype_z* restrict z, inc_t incz \ - ) \ -{ \ - /* Just call the reference implementation. 
*/ \ - PASTEMAC3(chx,chy,chz,kername)( conjx, \ - conjy, \ - n, \ - alpha1, \ - alpha2, \ - x, incx, \ - y, incy, \ - z, incz ); \ -} - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( axpy2v_opt_var1, axpy2v_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( axpy2v_opt_var1, axpy2v_unb_var1 ) -#endif - diff --git a/config/template/kernels/1f/bli_axpyf_opt_var1.c b/config/template/kernels/1f/bli_axpyf_opt_var1.c index 28c9bff7a..64b3384bf 100644 --- a/config/template/kernels/1f/bli_axpyf_opt_var1.c +++ b/config/template/kernels/1f/bli_axpyf_opt_var1.c @@ -36,87 +36,87 @@ -void bli_sssaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy - ) +void bli_saxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + float* restrict alpha, + float* restrict a, inc_t inca, inc_t lda, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy + ) { /* Just call the reference implementation. */ - bli_sssaxpyf_unb_var1( conja, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - y, incy ); + BLIS_SAXPYF_KERNEL_REF( conja, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + y, incy ); } -void bli_dddaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy - ) +void bli_daxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy + ) { /* Just call the reference implementation. 
*/ - bli_dddaxpyf_unb_var1( conja, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - y, incy ); + BLIS_DAXPYF_KERNEL_REF( conja, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + y, incy ); } -void bli_cccaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - scomplex* restrict alpha, - scomplex* restrict a, inc_t inca, inc_t lda, - scomplex* restrict x, inc_t incx, - scomplex* restrict y, inc_t incy - ) +void bli_caxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + scomplex* restrict alpha, + scomplex* restrict a, inc_t inca, inc_t lda, + scomplex* restrict x, inc_t incx, + scomplex* restrict y, inc_t incy + ) { /* Just call the reference implementation. */ - bli_cccaxpyf_unb_var1( conja, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - y, incy ); + BLIS_CAXPYF_KERNEL_REF( conja, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + y, incy ); } -void bli_zzzaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - dcomplex* restrict alpha, - dcomplex* restrict a, inc_t inca, inc_t lda, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict y, inc_t incy - ) +void bli_zaxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + dcomplex* restrict alpha, + dcomplex* restrict a, inc_t inca, inc_t lda, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict y, inc_t incy + ) { /* Template axpyf kernel implementation @@ -243,14 +243,14 @@ void bli_zzzaxpyf_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_zzzaxpyf_unb_var1( conja, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - y, incy ); + BLIS_ZAXPYF_KERNEL_REF( conja, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + y, incy ); return; } @@ -376,41 +376,3 @@ void bli_zzzaxpyf_opt_var1( } - - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. 
-// -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy \ - ) \ -{ \ - /* Just call the reference implementation. */ \ - PASTEMAC3(cha,chx,chy,kername)( conja, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy ); \ -} - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 ) -#endif - diff --git a/config/template/kernels/1f/bli_dotaxpyv_opt_var1.c b/config/template/kernels/1f/bli_dotaxpyv_opt_var1.c index fc5d3d344..917fb7661 100644 --- a/config/template/kernels/1f/bli_dotaxpyv_opt_var1.c +++ b/config/template/kernels/1f/bli_dotaxpyv_opt_var1.c @@ -36,87 +36,87 @@ -void bli_sssdotaxpyv_opt_var1( conj_t conjxt, - conj_t conjx, - conj_t conjy, - dim_t n, - float* restrict alpha, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy, - float* restrict rho, - float* restrict z, inc_t incz ) +void bli_sdotaxpyv_opt_var1( conj_t conjxt, + conj_t conjx, + conj_t conjy, + dim_t n, + float* restrict alpha, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + float* restrict rho, + float* restrict z, inc_t incz ) { /* Just call the reference implementation. 
*/ - bli_sssdotaxpyv_unb_var1( conjxt, - conjx, - conjy, - n, - alpha, - x, incx, - y, incy, - rho, - z, incz ); + BLIS_SDOTAXPYV_KERNEL_REF( conjxt, + conjx, + conjy, + n, + alpha, + x, incx, + y, incy, + rho, + z, incz ); } -void bli_ddddotaxpyv_opt_var1( conj_t conjxt, - conj_t conjx, - conj_t conjy, - dim_t n, - double* restrict alpha, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy, - double* restrict rho, - double* restrict z, inc_t incz ) +void bli_ddotaxpyv_opt_var1( conj_t conjxt, + conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict alpha, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + double* restrict rho, + double* restrict z, inc_t incz ) { /* Just call the reference implementation. */ - bli_ddddotaxpyv_unb_var1( conjxt, - conjx, - conjy, - n, - alpha, - x, incx, - y, incy, - rho, - z, incz ); + BLIS_DDOTAXPYV_KERNEL_REF( conjxt, + conjx, + conjy, + n, + alpha, + x, incx, + y, incy, + rho, + z, incz ); } -void bli_cccdotaxpyv_opt_var1( conj_t conjxt, - conj_t conjx, - conj_t conjy, - dim_t n, - scomplex* restrict alpha, - scomplex* restrict x, inc_t incx, - scomplex* restrict y, inc_t incy, - scomplex* restrict rho, - scomplex* restrict z, inc_t incz ) +void bli_cdotaxpyv_opt_var1( conj_t conjxt, + conj_t conjx, + conj_t conjy, + dim_t n, + scomplex* restrict alpha, + scomplex* restrict x, inc_t incx, + scomplex* restrict y, inc_t incy, + scomplex* restrict rho, + scomplex* restrict z, inc_t incz ) { /* Just call the reference implementation. 
*/ - bli_cccdotaxpyv_unb_var1( conjxt, - conjx, - conjy, - n, - alpha, - x, incx, - y, incy, - rho, - z, incz ); + BLIS_CDOTAXPYV_KERNEL_REF( conjxt, + conjx, + conjy, + n, + alpha, + x, incx, + y, incy, + rho, + z, incz ); } -void bli_zzzdotaxpyv_opt_var1( conj_t conjxt, - conj_t conjx, - conj_t conjy, - dim_t n, - dcomplex* restrict alpha, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict y, inc_t incy, - dcomplex* restrict rho, - dcomplex* restrict z, inc_t incz ) +void bli_zdotaxpyv_opt_var1( conj_t conjxt, + conj_t conjx, + conj_t conjy, + dim_t n, + dcomplex* restrict alpha, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict y, inc_t incy, + dcomplex* restrict rho, + dcomplex* restrict z, inc_t incz ) { /* Template dotaxpyv kernel implementation @@ -240,15 +240,15 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt, // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_zzzdotaxpyv_unb_var1( conjxt, - conjx, - conjy, - n, - alpha, - x, incx, - y, incy, - rho, - z, incz ); + BLIS_ZDOTAXPYV_KERNEL_REF( conjxt, + conjx, + conjy, + n, + alpha, + x, incx, + y, incy, + rho, + z, incz ); return; } @@ -429,42 +429,3 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt, bli_zzcopys( dotxy, *rho ); } - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. -// -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \ -\ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_x* restrict alpha, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy, \ - ctype_xy* restrict rho, \ - ctype_z* restrict z, inc_t incz \ - ) \ -{ \ - /* Just call the reference implementation. 
*/ \ - PASTEMAC3(chx,chy,chz,kername)( conjxt, \ - conjx, \ - conjy, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - rho, \ - z, incz ); \ -} - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_opt_var1, dotaxpyv_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_opt_var1, dotaxpyv_unb_var1 ) -#endif - diff --git a/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c b/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c index 0b68f5bb0..8438fd7b7 100644 --- a/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c +++ b/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.c @@ -36,115 +36,115 @@ -void bli_sssdotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict w, inc_t incw, - float* restrict x, inc_t incx, - float* restrict beta, - float* restrict y, inc_t incy, - float* restrict z, inc_t incz ) +void bli_sdotxaxpyf_opt_var1( conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b_n, + float* restrict alpha, + float* restrict a, inc_t inca, inc_t lda, + float* restrict w, inc_t incw, + float* restrict x, inc_t incx, + float* restrict beta, + float* restrict y, inc_t incy, + float* restrict z, inc_t incz ) { /* Just call the reference implementation. 
*/ - bli_sssdotxaxpyf_unb_var1( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha, - a, inca, lda, - w, incw, - x, incx, - beta, - y, incy, - z, incz ); + BLIS_SDOTXAXPYF_KERNEL_REF( conjat, + conja, + conjw, + conjx, + m, + b_n, + alpha, + a, inca, lda, + w, incw, + x, incx, + beta, + y, incy, + z, incz ); } -void bli_ddddotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict w, inc_t incw, - double* restrict x, inc_t incx, - double* restrict beta, - double* restrict y, inc_t incy, - double* restrict z, inc_t incz ) +void bli_ddotxaxpyf_opt_var1( conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict w, inc_t incw, + double* restrict x, inc_t incx, + double* restrict beta, + double* restrict y, inc_t incy, + double* restrict z, inc_t incz ) { /* Just call the reference implementation. 
*/ - bli_ddddotxaxpyf_unb_var1( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha, - a, inca, lda, - w, incw, - x, incx, - beta, - y, incy, - z, incz ); + BLIS_DDOTXAXPYF_KERNEL_REF( conjat, + conja, + conjw, + conjx, + m, + b_n, + alpha, + a, inca, lda, + w, incw, + x, incx, + beta, + y, incy, + z, incz ); } -void bli_cccdotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - scomplex* restrict alpha, - scomplex* restrict a, inc_t inca, inc_t lda, - scomplex* restrict w, inc_t incw, - scomplex* restrict x, inc_t incx, - scomplex* restrict beta, - scomplex* restrict y, inc_t incy, - scomplex* restrict z, inc_t incz ) +void bli_cdotxaxpyf_opt_var1( conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b_n, + scomplex* restrict alpha, + scomplex* restrict a, inc_t inca, inc_t lda, + scomplex* restrict w, inc_t incw, + scomplex* restrict x, inc_t incx, + scomplex* restrict beta, + scomplex* restrict y, inc_t incy, + scomplex* restrict z, inc_t incz ) { /* Just call the reference implementation. 
*/ - bli_cccdotxaxpyf_unb_var1( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha, - a, inca, lda, - w, incw, - x, incx, - beta, - y, incy, - z, incz ); + BLIS_CDOTXAXPYF_KERNEL_REF( conjat, + conja, + conjw, + conjx, + m, + b_n, + alpha, + a, inca, lda, + w, incw, + x, incx, + beta, + y, incy, + z, incz ); } -void bli_zzzdotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - dcomplex* restrict alpha, - dcomplex* restrict a, inc_t inca, inc_t lda, - dcomplex* restrict w, inc_t incw, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict beta, - dcomplex* restrict y, inc_t incy, - dcomplex* restrict z, inc_t incz ) +void bli_zdotxaxpyf_opt_var1( conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b_n, + dcomplex* restrict alpha, + dcomplex* restrict a, inc_t inca, inc_t lda, + dcomplex* restrict w, inc_t incw, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict beta, + dcomplex* restrict y, inc_t incy, + dcomplex* restrict z, inc_t incz ) { /* @@ -289,19 +289,19 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat, // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_zzzdotxaxpyf_unb_var1( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha, - a, inca, lda, - w, incw, - x, incx, - beta, - y, incy, - z, incz ); + BLIS_ZDOTXAXPYF_KERNEL_REF( conjat, + conja, + conjw, + conjx, + m, + b_n, + alpha, + a, inca, lda, + w, incw, + x, incx, + beta, + y, incy, + z, incz ); return; } @@ -560,51 +560,3 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat, } } - - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. 
-// -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \ -\ -void PASTEMAC3(cha,chb,chc,varname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ab* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_b* restrict w, inc_t incw, \ - ctype_b* restrict x, inc_t incx, \ - ctype_c* restrict beta, \ - ctype_c* restrict y, inc_t incy, \ - ctype_c* restrict z, inc_t incz \ - ) \ -{ \ - /* Just call the reference implementation. */ \ - PASTEMAC3(cha,chx,chy,kername)( conjat, \ - conja, \ - conjw, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - w, incw, \ - x, incx, \ - beta, \ - y, incy, \ - z, incz ); \ -} - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 ) -#endif - diff --git a/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.h b/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.h deleted file mode 100644 index 8561721a0..000000000 --- a/config/template/kernels/1f/bli_dotxaxpyf_opt_var1.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype kernel interfaces. 
-// -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \ -\ -void PASTEMAC3(cha,chb,chc,varname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ab* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_b* restrict w, inc_t incw, \ - ctype_b* restrict x, inc_t incx, \ - ctype_c* restrict beta, \ - ctype_c* restrict y, inc_t incy, \ - ctype_c* restrict z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 ) -#endif - diff --git a/config/template/kernels/1f/bli_dotxf_opt_var1.c b/config/template/kernels/1f/bli_dotxf_opt_var1.c index 88c97cb32..def48d7dc 100644 --- a/config/template/kernels/1f/bli_dotxf_opt_var1.c +++ b/config/template/kernels/1f/bli_dotxf_opt_var1.c @@ -36,95 +36,95 @@ -void bli_sssdotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict x, inc_t incx, - float* restrict beta, - float* restrict y, inc_t incy - ) +void bli_sdotxf_opt_var1( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b_n, + float* restrict alpha, + float* restrict a, inc_t inca, inc_t lda, + float* restrict x, inc_t incx, + float* restrict beta, + float* restrict y, inc_t incy + ) { /* Just call the reference implementation. 
*/ - bli_sssdotxf_unb_var1( conjat, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - beta, - y, incy ); + BLIS_SDOTXF_KERNEL_REF( conjat, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + beta, + y, incy ); } -void bli_ddddotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict x, inc_t incx, - double* restrict beta, - double* restrict y, inc_t incy - ) +void bli_ddotxf_opt_var1( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict beta, + double* restrict y, inc_t incy + ) { /* Just call the reference implementation. */ - bli_ddddotxf_unb_var1( conjat, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - beta, - y, incy ); + BLIS_DDOTXF_KERNEL_REF( conjat, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + beta, + y, incy ); } -void bli_cccdotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - scomplex* restrict alpha, - scomplex* restrict a, inc_t inca, inc_t lda, - scomplex* restrict x, inc_t incx, - scomplex* restrict beta, - scomplex* restrict y, inc_t incy - ) +void bli_cdotxf_opt_var1( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b_n, + scomplex* restrict alpha, + scomplex* restrict a, inc_t inca, inc_t lda, + scomplex* restrict x, inc_t incx, + scomplex* restrict beta, + scomplex* restrict y, inc_t incy + ) { /* Just call the reference implementation. 
*/ - bli_cccdotxf_unb_var1( conjat, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - beta, - y, incy ); + BLIS_CDOTXF_KERNEL_REF( conjat, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + beta, + y, incy ); } -void bli_zzzdotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - dcomplex* restrict alpha, - dcomplex* restrict a, inc_t inca, inc_t lda, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict beta, - dcomplex* restrict y, inc_t incy - ) +void bli_zdotxf_opt_var1( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b_n, + dcomplex* restrict alpha, + dcomplex* restrict a, inc_t inca, inc_t lda, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict beta, + dcomplex* restrict y, inc_t incy + ) { /* Template dotxf kernel implementation @@ -265,15 +265,15 @@ void bli_zzzdotxf_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_zzzdotxf_unb_var1( conjat, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - beta, - y, incy ); + BLIS_ZDOTXF_KERNEL_REF( conjat, + conjx, + m, + b_n, + alpha, + a, inca, lda, + x, incx, + beta, + y, incy ); return; } @@ -414,43 +414,3 @@ void bli_zzzdotxf_opt_var1( } } - - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. -// -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \ -\ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_xy* restrict alpha, \ - ctype_x* restrict a, inc_t inca, inc_t lda, \ - ctype_y* restrict x, inc_t incx, \ - ctype_r* restrict beta, \ - ctype_r* restrict y, inc_t incy \ - ) \ -{ \ - /* Just call the reference implementation. 
*/ \ - PASTEMAC3(cha,chx,chy,kername)( conjat, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - beta, \ - y, incy ); \ -} - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( dotxf_opt_var1, dotxf_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( dotxf_opt_var1, dotxf_unb_var1 ) -#endif - diff --git a/config/template/kernels/3/bli_gemm_opt_mxn.c b/config/template/kernels/3/bli_gemm_opt_mxn.c index 7cc7f8e1b..b32227fff 100644 --- a/config/template/kernels/3/bli_gemm_opt_mxn.c +++ b/config/template/kernels/3/bli_gemm_opt_mxn.c @@ -47,7 +47,7 @@ void bli_sgemm_opt_mxn( ) { /* Just call the reference implementation. */ - bli_sgemm_ref_mxn( k, + BLIS_SGEMM_UKERNEL_REF( k, alpha, a1, b1, @@ -162,8 +162,7 @@ void bli_dgemm_opt_mxn( that exist (at the edges) is handled automatically within the macro-kernel. - Alignment of a1 and b1. The addresses a1 and b1 are aligned according - to the alignment value BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in the - bli_config.h header file of the BLIS configuration. + to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively. - Unrolling loops. As a general rule of thumb, the loop over k is sometimes moderately unrolled; for example, in our experience, an unrolling factor of u = 4 is fairly common. If unrolling is applied @@ -275,7 +274,7 @@ void bli_cgemm_opt_mxn( ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a1, b1, @@ -297,7 +296,7 @@ void bli_zgemm_opt_mxn( ) { /* Just call the reference implementation. 
*/ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a1, b1, diff --git a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c index dcdfb8475..550a42f79 100644 --- a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.c @@ -166,16 +166,8 @@ void bli_dgemmtrsm_l_opt_mxn( - Leading dimensions of a1 and b1: PACKMR and PACKNR. See Implementation Notes for gemm. - Edge cases in MR, NR dimensions. See Implementation Notes for gemm. - - Alignment of a1 and b1. Unlike with gemm, the addresses a10/a12 and a11 - are not guaranteed to be aligned according to the alignment value - BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in the bli_config.h header - file. This is because these micro-panels may vary in size due to the - triangular nature of matrix A. Instead, these addresses are aligned - to PACKMR x sizeof(type), where type is the datatype in question. To - support a somewhat obscure, higher-level optimization, we similarly - do not guarantee that b01/b21 and b11 are aligned to - BLIS_CONTIG_STRIDE_ALIGN_SIZE; instead, they are only aligned to - PACKNR x sizeof(type). + - Alignment of a1 and b1. The addresses a1 and b1 are aligned according + to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively. - Unrolling loops. Most optimized implementations should unroll all three loops within the trsm subproblem of gemmtrsm. See Implementation Notes for gemm for remarks on unrolling the gemm subproblem. diff --git a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c index fe9773186..2ad6a81ad 100644 --- a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.c @@ -164,16 +164,8 @@ void bli_dgemmtrsm_u_opt_mxn( - Leading dimensions of a1 and b1: PACKMR and PACKNR. See Implementation Notes for gemm. - Edge cases in MR, NR dimensions. 
See Implementation Notes for gemm. - - Alignment of a1 and b1. Unlike with gemm, the addresses a10/a12 and a11 - are not guaranteed to be aligned according to the alignment value - BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in the bli_config.h header - file. This is because these micro-panels may vary in size due to the - triangular nature of matrix A. Instead, these addresses are aligned - to PACKMR x sizeof(type), where type is the datatype in question. To - support a somewhat obscure, higher-level optimization, we similarly - do not guarantee that b01/b21 and b11 are aligned to - BLIS_CONTIG_STRIDE_ALIGN_SIZE; instead, they are only aligned to - PACKNR x sizeof(type). + - Alignment of a1 and b1. The addresses a1 and b1 are aligned according + to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively. - Unrolling loops. Most optimized implementations should unroll all three loops within the trsm subproblem of gemmtrsm. See Implementation Notes for gemm for remarks on unrolling the gemm subproblem. diff --git a/config/template/kernels/3/bli_trsm_l_opt_mxn.c b/config/template/kernels/3/bli_trsm_l_opt_mxn.c index 85a906f85..1fd8126f4 100644 --- a/config/template/kernels/3/bli_trsm_l_opt_mxn.c +++ b/config/template/kernels/3/bli_trsm_l_opt_mxn.c @@ -44,7 +44,7 @@ void bli_strsm_l_opt_mxn( ) { /* Just call the reference implementation. */ - bli_strsm_l_ref_mxn( a11, + BLIS_STRSM_L_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); @@ -216,7 +216,7 @@ void bli_ctrsm_l_opt_mxn( ) { /* Just call the reference implementation. */ - bli_ctrsm_l_ref_mxn( a11, + BLIS_CTRSM_L_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); @@ -232,7 +232,7 @@ void bli_ztrsm_l_opt_mxn( ) { /* Just call the reference implementation. 
*/ - bli_ztrsm_l_ref_mxn( a11, + BLIS_ZTRSM_L_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); diff --git a/config/template/kernels/3/bli_trsm_u_opt_mxn.c b/config/template/kernels/3/bli_trsm_u_opt_mxn.c index ee9f2c168..4c7b04415 100644 --- a/config/template/kernels/3/bli_trsm_u_opt_mxn.c +++ b/config/template/kernels/3/bli_trsm_u_opt_mxn.c @@ -37,25 +37,25 @@ void bli_strsm_u_opt_mxn( - float* restrict a, - float* restrict b, - float* restrict c, inc_t rs_c, inc_t cs_c, + float* restrict a11, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, auxinfo_t* data ) { /* Just call the reference implementation. */ - bli_strsm_u_ref_mxn( a, - b, - c, rs_c, cs_c, + BLIS_STRSM_U_UKERNEL_REF( a11, + b11, + c11, rs_c, cs_c, data ); } void bli_dtrsm_u_opt_mxn( - double* restrict a, - double* restrict b, - double* restrict c, inc_t rs_c, inc_t cs_c, + double* restrict a11, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, auxinfo_t* data ) { @@ -168,18 +168,18 @@ void bli_dtrsm_u_opt_mxn( { i = m - iter - 1; n_behind = iter; - alpha11 = a + (i )*rs_a + (i )*cs_a; - a12t = a + (i )*rs_a + (i+1)*cs_a; - x1 = b + (i )*rs_b + (0 )*cs_b; - X2 = b + (i+1)*rs_b + (0 )*cs_b; + alpha11 = a11 + (i )*rs_a + (i )*cs_a; + a12t = a11 + (i )*rs_a + (i+1)*cs_a; + x1 = b11 + (i )*rs_b + (0 )*cs_b; + X2 = b11 + (i+1)*rs_b + (0 )*cs_b; /* x1 = x1 - a12t * X2; */ /* x1 = x1 / alpha11; */ for ( j = 0; j < n; ++j ) { - chi11 = x1 + (0 )*rs_b + (j )*cs_b; - x21 = X2 + (0 )*rs_b + (j )*cs_b; - gamma11 = c + (i )*rs_c + (j )*cs_c; + chi11 = x1 + (0 )*rs_b + (j )*cs_b; + x21 = X2 + (0 )*rs_b + (j )*cs_b; + gamma11 = c11 + (i )*rs_c + (j )*cs_c; /* chi11 = chi11 - a12t * x21; */ bli_dset0s( rho11 ); @@ -208,32 +208,32 @@ void bli_dtrsm_u_opt_mxn( void bli_ctrsm_u_opt_mxn( - scomplex* restrict a, - scomplex* restrict b, - scomplex* restrict c, inc_t rs_c, inc_t cs_c, + scomplex* restrict a11, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t 
cs_c, auxinfo_t* data ) { /* Just call the reference implementation. */ - bli_ctrsm_u_ref_mxn( a, - b, - c, rs_c, cs_c, + BLIS_CTRSM_U_UKERNEL_REF( a11, + b11, + c11, rs_c, cs_c, data ); } void bli_ztrsm_u_opt_mxn( - dcomplex* restrict a, - dcomplex* restrict b, - dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + dcomplex* restrict a11, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, auxinfo_t* data ) { /* Just call the reference implementation. */ - bli_ztrsm_u_ref_mxn( a, - b, - c, rs_c, cs_c, + BLIS_ZTRSM_U_UKERNEL_REF( a11, + b11, + c11, rs_c, cs_c, data ); } diff --git a/frame/1/addv/bli_addv.c b/frame/1/addv/bli_addv.c index 386b3e723..8056a71ae 100644 --- a/frame/1/addv/bli_addv.c +++ b/frame/1/addv/bli_addv.c @@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( addv, ADDV_KERNEL ) +GENFRONT( addv, addv_kernel ) // diff --git a/frame/1/addv/bli_addv.h b/frame/1/addv/bli_addv.h index 4e389cfb4..8577ada71 100644 --- a/frame/1/addv/bli_addv.h +++ b/frame/1/addv/bli_addv.h @@ -33,7 +33,8 @@ */ #include "bli_addv_check.h" -#include "bli_addv_unb_var1.h" +#include "bli_addv_kernel.h" +#include "bli_addv_ref.h" // diff --git a/frame/1/addv/bli_addv_kernel.c b/frame/1/addv/bli_addv_kernel.c new file mode 100644 index 000000000..539215ac4 --- /dev/null +++ b/frame/1/addv/bli_addv_kernel.c @@ -0,0 +1,115 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T addv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + dim_t n, + void* x, inc_t incx, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_kernel_void); +#else +static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_kernel_void); +#endif +#endif + + +void bli_addv_kernel( obj_t* x, + obj_t* y ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjx = bli_obj_conj_status( *x ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + FUNCPTR_T f; + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y]; + + // Invoke the function. + f( conjx, + n, + buf_x, inc_x, + buf_y, inc_y ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \ +\ +void PASTEMAC2(chx,chy,varname)( \ + conj_t conjx, \ + dim_t n, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC2(chx,chy,kername)( conjx, \ + n, \ + x, incx, \ + y, incy ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC2_BASIC( addv_kernel_void, ADDV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC2_MIX_D( addv_kernel_void, ADDV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC2_MIX_P( addv_kernel_void, ADDV_KERNEL ) +#endif + diff --git a/frame/1/copyv/bli_copyv_unb_var1.h b/frame/1/addv/bli_addv_kernel.h similarity index 89% rename from frame/1/copyv/bli_copyv_unb_var1.h rename to frame/1/addv/bli_addv_kernel.h index 1239d5804..b7ee0336e 100644 --- a/frame/1/copyv/bli_copyv_unb_var1.h +++ b/frame/1/addv/bli_addv_kernel.h @@ -32,10 +32,14 @@ */ -void bli_copyv_unb_var1( obj_t* x, - obj_t* y ); +void bli_addv_kernel( obj_t* x, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ @@ -46,12 +50,13 @@ void PASTEMAC2(chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT2_BASIC( copyv_unb_var1 ) +INSERT_GENTPROT2_BASIC( addv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT2_MIX_D( copyv_unb_var1 ) +INSERT_GENTPROT2_MIX_D( addv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT2_MIX_P( copyv_unb_var1 ) +INSERT_GENTPROT2_MIX_P( addv_kernel_void ) #endif + diff --git a/frame/1/addv/bli_addv_unb_var1.c b/frame/1/addv/bli_addv_ref.c similarity index 85% rename from frame/1/addv/bli_addv_unb_var1.c rename to frame/1/addv/bli_addv_ref.c index e7caebc46..9107bcb20 100644 --- a/frame/1/addv/bli_addv_unb_var1.c +++ b/frame/1/addv/bli_addv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T addv_fp typedef void (*FUNCPTR_T)( @@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_unb_var1); +static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_unb_var1); +static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_ref); #else -static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_unb_var1); +static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_ref); #endif #endif -void bli_addv_unb_var1( obj_t* x, +void bli_addv_ref( obj_t* x, obj_t* y ) { num_t dt_x = bli_obj_datatype( *x ); @@ -83,17 +84,19 @@ void bli_addv_unb_var1( obj_t* x, buf_x, inc_x, buf_y, inc_y ); } +*/ #undef GENTFUNC2 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \ \ -void PASTEMAC2(chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC2(chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_x* x_cast = x; \ ctype_y* y_cast = y; \ @@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC2_BASIC( addv, addv_unb_var1 ) +INSERT_GENTFUNC2_BASIC( addv, addv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC2_MIX_D( addv, addv_unb_var1 ) +INSERT_GENTFUNC2_MIX_D( addv, addv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC2_MIX_P( addv, addv_unb_var1 ) +INSERT_GENTFUNC2_MIX_P( addv, addv_ref ) #endif diff --git a/frame/1/addv/bli_addv_ref.h b/frame/1/addv/bli_addv_ref.h new file mode 100644 index 000000000..b39c6cfd9 --- /dev/null +++ b/frame/1/addv/bli_addv_ref.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_addv_ref( obj_t* x, + obj_t* y ); +*/ + + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ +\ +void PASTEMAC2(chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( addv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT2_MIX_D( addv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT2_MIX_P( addv_ref ) +#endif diff --git a/frame/1/axpyv/bli_axpyv.c b/frame/1/axpyv/bli_axpyv.c index f0439f8d8..1fb4d8243 100644 --- a/frame/1/axpyv/bli_axpyv.c +++ b/frame/1/axpyv/bli_axpyv.c @@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( axpyv, AXPYV_KERNEL ) +GENFRONT( axpyv, axpyv_kernel ) // diff --git a/frame/1/axpyv/bli_axpyv.h b/frame/1/axpyv/bli_axpyv.h index fb1ef2985..4e1abe4b3 100644 --- a/frame/1/axpyv/bli_axpyv.h +++ b/frame/1/axpyv/bli_axpyv.h @@ -33,7 +33,8 @@ */ #include "bli_axpyv_check.h" -#include "bli_axpyv_unb_var1.h" +#include "bli_axpyv_kernel.h" +#include "bli_axpyv_ref.h" // diff --git a/frame/1/axpyv/bli_axpyv_kernel.c b/frame/1/axpyv/bli_axpyv_kernel.c new file mode 100644 index 000000000..3ccb262f9 --- /dev/null +++ b/frame/1/axpyv/bli_axpyv_kernel.c @@ -0,0 +1,128 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T axpyv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + dim_t n, + void* alpha, + void* x, inc_t incx, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_kernel_void); +#endif +#endif + + +void bli_axpyv_kernel( obj_t* alpha, + obj_t* x, + obj_t* y ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjx = bli_obj_conj_status( *x ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + num_t dt_alpha; + void* buf_alpha; + + FUNCPTR_T f; + + // If alpha is a scalar constant, use dt_x to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the alpha object and extract the buffer at the alpha offset. + bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_alpha][dt_x][dt_y]; + + // Invoke the function. + f( conjx, + n, + buf_alpha, + buf_x, inc_x, + buf_y, inc_y ); +} + + +#undef GENTFUNC3 +#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, kername ) \ +\ +void PASTEMAC3(cha,chx,chy,varname)( \ + conj_t conjx, \ + dim_t n, \ + void* alpha, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC3(cha,chx,chy,kername)( conjx, \ + n, \ + alpha, \ + x, incx, \ + y, incy ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC3_BASIC( axpyv_kernel_void, AXPYV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3_MIX_D( axpyv_kernel_void, AXPYV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3_MIX_P( axpyv_kernel_void, AXPYV_KERNEL ) +#endif + diff --git a/frame/1/axpyv/bli_axpyv_unb_var1.h b/frame/1/axpyv/bli_axpyv_kernel.h similarity index 88% rename from frame/1/axpyv/bli_axpyv_unb_var1.h rename to frame/1/axpyv/bli_axpyv_kernel.h index b88ebb6f9..b75611967 100644 --- a/frame/1/axpyv/bli_axpyv_unb_var1.h +++ b/frame/1/axpyv/bli_axpyv_kernel.h @@ -32,11 +32,15 @@ */ -void bli_axpyv_unb_var1( obj_t* alpha, - obj_t* x, - obj_t* y ); +void bli_axpyv_kernel( obj_t* alpha, + obj_t* x, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT3 #define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \ \ @@ -48,13 +52,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT3_BASIC( axpyv_unb_var1 ) +INSERT_GENTPROT3_BASIC( axpyv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( axpyv_unb_var1 ) +INSERT_GENTPROT3_MIX_D( axpyv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( axpyv_unb_var1 ) +INSERT_GENTPROT3_MIX_P( axpyv_kernel_void ) #endif diff --git a/frame/1/axpyv/bli_axpyv_unb_var1.c b/frame/1/axpyv/bli_axpyv_ref.c similarity index 86% rename from frame/1/axpyv/bli_axpyv_unb_var1.c rename to frame/1/axpyv/bli_axpyv_ref.c index e21744b78..67323e223 100644 --- a/frame/1/axpyv/bli_axpyv_unb_var1.c +++ b/frame/1/axpyv/bli_axpyv_ref.c @@ -33,7 +33,7 @@ */ #include "blis.h" - +/* #define FUNCPTR_T axpyv_fp typedef void (*FUNCPTR_T)( @@ -47,17 +47,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_ref); #endif #endif -void bli_axpyv_unb_var1( obj_t* alpha, +void bli_axpyv_ref( obj_t* alpha, obj_t* x, obj_t* y ) { @@ -94,18 +94,19 @@ void bli_axpyv_unb_var1( obj_t* alpha, buf_x, inc_x, buf_y, inc_y ); } - +*/ #undef GENTFUNC3 #define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, addvker ) \ \ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC3(cha,chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_a* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_a* alpha_cast = alpha; \ ctype_x* x_cast = x; \ @@ -156,13 +157,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3_BASIC( axpyv_unb_var1, ADDV_KERNEL ) +INSERT_GENTFUNC3_BASIC( axpyv_ref, ADDV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( axpyv_unb_var1, ADDV_KERNEL ) +INSERT_GENTFUNC3_MIX_D( axpyv_ref, ADDV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( axpyv_unb_var1, ADDV_KERNEL ) +INSERT_GENTFUNC3_MIX_P( axpyv_ref, ADDV_KERNEL ) #endif diff --git a/config/template/kernels/1/bli_axpyv_opt_var1.h b/frame/1/axpyv/bli_axpyv_ref.h similarity index 76% rename from config/template/kernels/1/bli_axpyv_opt_var1.h rename to frame/1/axpyv/bli_axpyv_ref.h index deb52fa75..2dfe64ada 100644 --- a/config/template/kernels/1/bli_axpyv_opt_var1.h +++ b/frame/1/axpyv/bli_axpyv_ref.h @@ -32,28 +32,32 @@ */ +/* +void bli_axpyv_ref( obj_t* alpha, + obj_t* x, + obj_t* y ); +*/ + -// -// Prototype axpyv kernel interfaces. -// #undef GENTPROT3 #define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \ \ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - ctype_a* restrict alpha, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy \ - ); +void PASTEMAC3(cha,chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_a* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); -INSERT_GENTPROT3_BASIC( axpyv_opt_var1 ) +INSERT_GENTPROT3_BASIC( axpyv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 ) +INSERT_GENTPROT3_MIX_D( axpyv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 ) +INSERT_GENTPROT3_MIX_P( axpyv_ref ) #endif diff --git a/frame/1/copyv/bli_copyv.c b/frame/1/copyv/bli_copyv.c index 870f766a8..1f29e1b30 100644 --- a/frame/1/copyv/bli_copyv.c +++ b/frame/1/copyv/bli_copyv.c @@ -34,16 +34,6 @@ #include "blis.h" -/* -void bli_copyv( obj_t* x, - obj_t* y ) -{ - if ( bli_error_checking_is_enabled() ) - bli_copyv_check( x, y ); - - 
bli_copyv_unb_var1( x, y ); -} -*/ // // Define object-based interface. @@ -63,7 +53,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( copyv, COPYV_KERNEL ) +GENFRONT( copyv, copyv_kernel ) // diff --git a/frame/1/copyv/bli_copyv.h b/frame/1/copyv/bli_copyv.h index 4f0f79f90..baef1b9c8 100644 --- a/frame/1/copyv/bli_copyv.h +++ b/frame/1/copyv/bli_copyv.h @@ -33,7 +33,8 @@ */ #include "bli_copyv_check.h" -#include "bli_copyv_unb_var1.h" +#include "bli_copyv_kernel.h" +#include "bli_copyv_ref.h" // diff --git a/frame/1/copyv/bli_copyv_kernel.c b/frame/1/copyv/bli_copyv_kernel.c new file mode 100644 index 000000000..40cf15f0c --- /dev/null +++ b/frame/1/copyv/bli_copyv_kernel.c @@ -0,0 +1,115 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T copyv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + dim_t n, + void* x, inc_t incx, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_kernel_void); +#else +static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_kernel_void); +#endif +#endif + + +void bli_copyv_kernel( obj_t* x, + obj_t* y ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjx = bli_obj_conj_status( *x ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + FUNCPTR_T f; + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y]; + + // Invoke the function. 
+ f( conjx, + n, + buf_x, inc_x, + buf_y, inc_y ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \ +\ +void PASTEMAC2(chx,chy,varname)( \ + conj_t conjx, \ + dim_t n, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC2(chx,chy,kername)( conjx, \ + n, \ + x, incx, \ + y, incy ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. +INSERT_GENTFUNC2_BASIC( copyv_kernel_void, COPYV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC2_MIX_D( copyv_kernel_void, COPYV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC2_MIX_P( copyv_kernel_void, COPYV_KERNEL ) +#endif + diff --git a/frame/1/addv/bli_addv_unb_var1.h b/frame/1/copyv/bli_copyv_kernel.h similarity index 89% rename from frame/1/addv/bli_addv_unb_var1.h rename to frame/1/copyv/bli_copyv_kernel.h index 65b75d858..e0420253a 100644 --- a/frame/1/addv/bli_addv_unb_var1.h +++ b/frame/1/copyv/bli_copyv_kernel.h @@ -32,10 +32,14 @@ */ -void bli_addv_unb_var1( obj_t* x, - obj_t* y ); +void bli_copyv_kernel( obj_t* x, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. 
+// + #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ @@ -46,12 +50,12 @@ void PASTEMAC2(chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT2_BASIC( addv_unb_var1 ) +INSERT_GENTPROT2_BASIC( copyv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT2_MIX_D( addv_unb_var1 ) +INSERT_GENTPROT2_MIX_D( copyv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT2_MIX_P( addv_unb_var1 ) +INSERT_GENTPROT2_MIX_P( copyv_kernel_void ) #endif diff --git a/frame/1/copyv/bli_copyv_unb_var1.c b/frame/1/copyv/bli_copyv_ref.c similarity index 85% rename from frame/1/copyv/bli_copyv_unb_var1.c rename to frame/1/copyv/bli_copyv_ref.c index ed2cbedb1..21def27a6 100644 --- a/frame/1/copyv/bli_copyv_unb_var1.c +++ b/frame/1/copyv/bli_copyv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T copyv_fp typedef void (*FUNCPTR_T)( @@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_unb_var1); +static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_unb_var1); +static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_ref); #else -static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_unb_var1); +static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_ref); #endif #endif -void bli_copyv_unb_var1( obj_t* x, +void bli_copyv_ref( obj_t* x, obj_t* y ) { num_t dt_x = bli_obj_datatype( *x ); @@ -83,17 +84,19 @@ void bli_copyv_unb_var1( obj_t* x, buf_x, inc_x, buf_y, inc_y ); } +*/ #undef GENTFUNC2 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \ \ -void PASTEMAC2(chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC2(chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_x* x_cast = x; \ ctype_y* y_cast = y; \ @@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC2_BASIC( copyv, copyv_unb_var1 ) +INSERT_GENTFUNC2_BASIC( copyv, copyv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC2_MIX_D( copyv, copyv_unb_var1 ) +INSERT_GENTFUNC2_MIX_D( copyv, copyv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC2_MIX_P( copyv, copyv_unb_var1 ) +INSERT_GENTFUNC2_MIX_P( copyv, copyv_ref ) #endif diff --git a/frame/1/copyv/bli_copyv_ref.h b/frame/1/copyv/bli_copyv_ref.h new file mode 100644 index 000000000..0ede9aeea --- /dev/null +++ b/frame/1/copyv/bli_copyv_ref.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_copyv_ref( obj_t* x, + obj_t* y ); +*/ + + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ +\ +void PASTEMAC2(chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( copyv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT2_MIX_D( copyv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT2_MIX_P( copyv_ref ) +#endif diff --git a/frame/1/dotv/bli_dotv.c b/frame/1/dotv/bli_dotv.c index e285de1e7..07772d80f 100644 --- a/frame/1/dotv/bli_dotv.c +++ b/frame/1/dotv/bli_dotv.c @@ -55,7 +55,7 @@ void PASTEMAC0(opname)( \ rho ); \ } -GENFRONT( dotv, DOTV_KERNEL ) +GENFRONT( dotv, dotv_kernel ) // diff --git a/frame/1/dotv/bli_dotv.h b/frame/1/dotv/bli_dotv.h index e5a4ce75b..7a0ea8265 100644 --- a/frame/1/dotv/bli_dotv.h +++ b/frame/1/dotv/bli_dotv.h @@ -33,7 +33,8 @@ */ #include "bli_dotv_check.h" -#include "bli_dotv_unb_var1.h" +#include "bli_dotv_kernel.h" +#include "bli_dotv_ref.h" // diff --git a/frame/1/dotv/bli_dotv_kernel.c b/frame/1/dotv/bli_dotv_kernel.c new file mode 100644 index 000000000..9d7de4bbf --- /dev/null +++ b/frame/1/dotv/bli_dotv_kernel.c @@ -0,0 +1,128 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T dotv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + conj_t conjy, + dim_t n, + void* x, inc_t incx, + void* y, inc_t incy, + void* rho + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_kernel_void); +#endif +#endif + + +void bli_dotv_kernel( obj_t* x, + obj_t* y, + obj_t* rho ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + num_t dt_rho = bli_obj_datatype( *rho ); + + conj_t conjx = bli_obj_conj_status( *x ); + conj_t conjy = bli_obj_conj_status( *y ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + void* buf_rho = bli_obj_buffer_at_off( *rho ); + + FUNCPTR_T f; + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y][dt_rho]; + + // Invoke the function. + f( conjx, + conjy, + n, + buf_x, inc_x, + buf_y, inc_y, + buf_rho ); +} + + +#undef GENTFUNC3 +#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname, kername ) \ +\ +void PASTEMAC3(chx,chy,chr,varname)( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + void* x, inc_t incx, \ + void* y, inc_t incy, \ + void* rho \ + ) \ +{ \ + PASTEMAC3(chx,chy,chr,kername)( conjx, \ + conjy, \ + n, \ + x, incx, \ + y, incy, \ + rho ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC3_BASIC( dotv_kernel_void, DOTV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3_MIX_D( dotv_kernel_void, DOTV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3_MIX_P( dotv_kernel_void, DOTV_KERNEL ) +#endif + diff --git a/frame/1/dotv/bli_dotv_unb_var1.h b/frame/1/dotv/bli_dotv_kernel.h similarity index 88% rename from frame/1/dotv/bli_dotv_unb_var1.h rename to frame/1/dotv/bli_dotv_kernel.h index 59def0502..78b77bc33 100644 --- a/frame/1/dotv/bli_dotv_unb_var1.h +++ b/frame/1/dotv/bli_dotv_kernel.h @@ -32,11 +32,15 @@ */ -void bli_dotv_unb_var1( obj_t* x, - obj_t* y, - obj_t* rho ); +void bli_dotv_kernel( obj_t* x, + obj_t* y, + obj_t* rho ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT3 #define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \ \ @@ -49,13 +53,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \ void* rho \ ); -INSERT_GENTPROT3_BASIC( dotv_unb_var1 ) +INSERT_GENTPROT3_BASIC( dotv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( dotv_unb_var1 ) +INSERT_GENTPROT3_MIX_D( dotv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( dotv_unb_var1 ) +INSERT_GENTPROT3_MIX_P( dotv_kernel_void ) #endif diff --git a/frame/1/dotv/bli_dotv_unb_var1.c b/frame/1/dotv/bli_dotv_ref.c similarity index 85% rename from frame/1/dotv/bli_dotv_unb_var1.c rename to frame/1/dotv/bli_dotv_ref.c index ef1c5a451..cd7baaba3 100644 --- a/frame/1/dotv/bli_dotv_unb_var1.c +++ b/frame/1/dotv/bli_dotv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T dotv_fp typedef void (*FUNCPTR_T)( @@ -48,17 +49,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_ref); #endif #endif -void bli_dotv_unb_var1( obj_t* x, +void bli_dotv_ref( obj_t* x, obj_t* y, obj_t* rho ) { @@ -92,19 +93,20 @@ void bli_dotv_unb_var1( obj_t* x, buf_y, inc_y, buf_rho ); } - +*/ #undef GENTFUNC3 #define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \ \ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho \ - ) \ +void PASTEMAC3(chx,chy,chr,varname) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_r* restrict rho \ + ) \ { \ ctype_x* x_cast = x; \ ctype_y* y_cast = y; \ @@ -163,13 +165,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC3_BASIC( dotv, dotv_unb_var1 ) +INSERT_GENTFUNC3_BASIC( dotv, dotv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( dotv, dotv_unb_var1 ) +INSERT_GENTFUNC3_MIX_D( dotv, dotv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( dotv, dotv_unb_var1 ) +INSERT_GENTFUNC3_MIX_P( dotv, dotv_ref ) #endif diff --git a/frame/1/dotv/bli_dotv_ref.h b/frame/1/dotv/bli_dotv_ref.h new file mode 100644 index 000000000..c5fcef764 --- /dev/null +++ b/frame/1/dotv/bli_dotv_ref.h @@ -0,0 +1,64 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_dotv_ref( obj_t* x, + obj_t* y, + obj_t* rho ); +*/ + + +#undef GENTPROT3 +#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \ +\ +void PASTEMAC3(chx,chy,chr,varname) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_r* restrict rho \ + ); + +INSERT_GENTPROT3_BASIC( dotv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT3_MIX_D( dotv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT3_MIX_P( dotv_ref ) +#endif + diff --git a/frame/1/dotxv/bli_dotxv.c b/frame/1/dotxv/bli_dotxv.c index a39be7628..5d30ff546 100644 --- a/frame/1/dotxv/bli_dotxv.c +++ b/frame/1/dotxv/bli_dotxv.c @@ -59,7 +59,7 @@ void PASTEMAC0(opname)( \ rho ); \ } -GENFRONT( dotxv, DOTXV_KERNEL ) +GENFRONT( dotxv, dotxv_kernel ) // diff --git a/frame/1/dotxv/bli_dotxv.h b/frame/1/dotxv/bli_dotxv.h index cafdf98b2..e2d8a9521 100644 --- a/frame/1/dotxv/bli_dotxv.h +++ b/frame/1/dotxv/bli_dotxv.h @@ -33,7 +33,8 @@ */ #include "bli_dotxv_check.h" -#include "bli_dotxv_unb_var1.h" +#include "bli_dotxv_kernel.h" +#include "bli_dotxv_ref.h" // diff --git a/frame/1/dotxv/bli_dotxv_kernel.c b/frame/1/dotxv/bli_dotxv_kernel.c new file mode 100644 index 000000000..0df32fce9 --- /dev/null +++ b/frame/1/dotxv/bli_dotxv_kernel.c @@ -0,0 +1,153 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T dotxv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + conj_t conjy, + dim_t n, + void* alpha, + void* x, inc_t incx, + void* y, inc_t incy, + void* beta, + void* rho + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_kernel_void); +#endif +#endif + + +void bli_dotxv_kernel( obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* beta, + obj_t* rho ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + num_t dt_rho = bli_obj_datatype( *rho ); + + conj_t conjx = bli_obj_conj_status( *x ); + conj_t conjy = bli_obj_conj_status( *y ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + void* buf_rho = bli_obj_buffer_at_off( *rho ); + + num_t dt_alpha; + void* buf_alpha; + + num_t dt_beta; + void* buf_beta; + + FUNCPTR_T f; + + // The datatype of alpha MUST be the type union of x and y. This is to + // prevent any unnecessary loss of information during computation. + dt_alpha = bli_datatype_union( dt_x, dt_y ); + buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha ); + + // The datatype of beta MUST be the same as the datatype of rho. + dt_beta = dt_rho; + buf_beta = bli_obj_buffer_for_1x1( dt_beta, *beta ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y][dt_rho]; + + // Invoke the function. 
+ f( conjx, + conjy, + n, + buf_alpha, + buf_x, inc_x, + buf_y, inc_y, + buf_beta, + buf_rho ); +} + + +#undef GENTFUNC3U12 +#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \ +\ +void PASTEMAC3(chx,chy,chr,varname)( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + void* alpha, \ + void* x, inc_t incx, \ + void* y, inc_t incy, \ + void* beta, \ + void* rho \ + ) \ +{ \ + PASTEMAC3(chx,chy,chr,kername)( conjx, \ + conjy, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + beta, \ + rho ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. +INSERT_GENTFUNC3U12_BASIC( dotxv_kernel_void, DOTXV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3U12_MIX_D( dotxv_kernel_void, DOTXV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3U12_MIX_P( dotxv_kernel_void, DOTXV_KERNEL ) +#endif + diff --git a/frame/1/dotxv/bli_dotxv_unb_var1.h b/frame/1/dotxv/bli_dotxv_kernel.h similarity index 86% rename from frame/1/dotxv/bli_dotxv_unb_var1.h rename to frame/1/dotxv/bli_dotxv_kernel.h index f91623f28..e2fd97e03 100644 --- a/frame/1/dotxv/bli_dotxv_unb_var1.h +++ b/frame/1/dotxv/bli_dotxv_kernel.h @@ -32,13 +32,17 @@ */ -void bli_dotxv_unb_var1( obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* beta, - obj_t* rho ); +void bli_dotxv_kernel( obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* beta, + obj_t* rho ); +// +// Prototype the void pointer kernel wrappers. 
+// + #undef GENTPROT3U12 #define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \ \ @@ -53,13 +57,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \ void* rho \ ); -INSERT_GENTPROT3U12_BASIC( dotxv_unb_var1 ) +INSERT_GENTPROT3U12_BASIC( dotxv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxv_unb_var1 ) +INSERT_GENTPROT3U12_MIX_D( dotxv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxv_unb_var1 ) +INSERT_GENTPROT3U12_MIX_P( dotxv_kernel_void ) #endif diff --git a/frame/1/dotxv/bli_dotxv_unb_var1.c b/frame/1/dotxv/bli_dotxv_ref.c similarity index 86% rename from frame/1/dotxv/bli_dotxv_unb_var1.c rename to frame/1/dotxv/bli_dotxv_ref.c index 30582111c..c5e0a78f2 100644 --- a/frame/1/dotxv/bli_dotxv_unb_var1.c +++ b/frame/1/dotxv/bli_dotxv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T dotxv_fp typedef void (*FUNCPTR_T)( @@ -50,17 +51,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_ref); #endif #endif -void bli_dotxv_unb_var1( obj_t* alpha, +void bli_dotxv_ref( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, @@ -113,21 +114,23 @@ void bli_dotxv_unb_var1( obj_t* alpha, buf_beta, buf_rho ); } +*/ #undef GENTFUNC3U12 #define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \ \ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* beta, \ - void* rho \ - ) \ +void PASTEMAC3(chx,chy,chr,varname) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_xy* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_r* restrict beta, \ + ctype_r* restrict rho \ + ) \ { \ ctype_xy* alpha_cast = alpha; \ ctype_x* x_cast = x; \ @@ -194,13 +197,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_unb_var1 ) +INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_unb_var1 ) +INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_unb_var1 ) +INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_ref ) #endif diff --git a/frame/1/dotxv/bli_dotxv_ref.h b/frame/1/dotxv/bli_dotxv_ref.h new file mode 100644 index 000000000..59e06f43d --- /dev/null +++ b/frame/1/dotxv/bli_dotxv_ref.h @@ -0,0 +1,68 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* +void bli_dotxv_ref( obj_t* alpha, + obj_t* x, + obj_t* y, + obj_t* beta, + obj_t* rho ); +*/ + + +#undef GENTPROT3U12 +#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \ +\ +void PASTEMAC3(chx,chy,chr,varname) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_xy* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_r* restrict beta, \ + ctype_r* restrict rho \ + ); + +INSERT_GENTPROT3U12_BASIC( dotxv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT3U12_MIX_D( dotxv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT3U12_MIX_P( dotxv_ref ) +#endif + diff --git a/frame/1/invertv/bli_invertv.c b/frame/1/invertv/bli_invertv.c index 5ea337654..d43a4fa93 100644 --- a/frame/1/invertv/bli_invertv.c +++ b/frame/1/invertv/bli_invertv.c @@ -34,15 +34,6 @@ #include "blis.h" -/* -void bli_invertv( obj_t* x ) -{ - if ( bli_error_checking_is_enabled() ) - bli_invertv_check( x ); - - bli_invertv_unb_var1( x ); -} -*/ // // Define object-based interface. 
@@ -60,7 +51,7 @@ void PASTEMAC0(opname)( \ PASTEMAC0(varname)( x ); \ } -GENFRONT( invertv, INVERTV_KERNEL ) +GENFRONT( invertv, invertv_kernel ) // diff --git a/frame/1/invertv/bli_invertv.h b/frame/1/invertv/bli_invertv.h index 5768d9707..c70a630d3 100644 --- a/frame/1/invertv/bli_invertv.h +++ b/frame/1/invertv/bli_invertv.h @@ -33,7 +33,8 @@ */ #include "bli_invertv_check.h" -#include "bli_invertv_unb_var1.h" +#include "bli_invertv_kernel.h" +#include "bli_invertv_ref.h" // diff --git a/frame/1/invertv/bli_invertv_kernel.c b/frame/1/invertv/bli_invertv_kernel.c new file mode 100644 index 000000000..55f8eb186 --- /dev/null +++ b/frame/1/invertv/bli_invertv_kernel.c @@ -0,0 +1,81 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T invertv_fp + +typedef void (*FUNCPTR_T)( + dim_t n, + void* x, inc_t incx + ); + +static FUNCPTR_T GENARRAY(ftypes,invertv_kernel_void); + + +void bli_invertv_kernel( obj_t* x ) +{ + num_t dt_x = bli_obj_datatype( *x ); + + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + FUNCPTR_T f; + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x]; + + // Invoke the function. + f( n, + buf_x, inc_x ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, kername ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t n, \ + void* x, inc_t incx \ + ) \ +{ \ + PASTEMAC(ch,kername)( n, \ + x, incx ); \ +} + +INSERT_GENTFUNC_BASIC( invertv_kernel_void, INVERTV_KERNEL ) + diff --git a/frame/1/invertv/bli_invertv_unb_var1.h b/frame/1/invertv/bli_invertv_kernel.h similarity index 93% rename from frame/1/invertv/bli_invertv_unb_var1.h rename to frame/1/invertv/bli_invertv_kernel.h index 664f06122..aeb594350 100644 --- a/frame/1/invertv/bli_invertv_unb_var1.h +++ b/frame/1/invertv/bli_invertv_kernel.h @@ -32,9 +32,13 @@ */ -void bli_invertv_unb_var1( obj_t* x ); +void bli_invertv_kernel( obj_t* x ); +// +// Prototype the void pointer kernel wrappers. 
+// + #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ @@ -43,5 +47,5 @@ void PASTEMAC(ch,varname)( \ void* x, inc_t incx \ ); -INSERT_GENTPROT_BASIC( invertv_unb_var1 ) +INSERT_GENTPROT_BASIC( invertv_kernel_void ) diff --git a/frame/1/invertv/bli_invertv_unb_var1.c b/frame/1/invertv/bli_invertv_ref.c similarity index 89% rename from frame/1/invertv/bli_invertv_unb_var1.c rename to frame/1/invertv/bli_invertv_ref.c index 2e17fdfd8..024edfdc6 100644 --- a/frame/1/invertv/bli_invertv_unb_var1.c +++ b/frame/1/invertv/bli_invertv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T invertv_fp typedef void (*FUNCPTR_T)( @@ -41,10 +42,10 @@ typedef void (*FUNCPTR_T)( void* x, inc_t incx ); -static FUNCPTR_T GENARRAY(ftypes,invertv_unb_var1); +static FUNCPTR_T GENARRAY(ftypes,invertv_ref); -void bli_invertv_unb_var1( obj_t* x ) +void bli_invertv_ref( obj_t* x ) { num_t dt_x = bli_obj_datatype( *x ); @@ -63,15 +64,17 @@ void bli_invertv_unb_var1( obj_t* x ) f( n, buf_x, inc_x ); } +*/ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, varname ) \ \ -void PASTEMAC(ch,varname)( \ - dim_t n, \ - void* x, inc_t incx \ - ) \ +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx \ + ) \ { \ ctype* x_cast = x; \ ctype* chi1; \ @@ -89,5 +92,5 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( invertv, invertv_unb_var1 ) +INSERT_GENTFUNC_BASIC( invertv, invertv_ref ) diff --git a/kernels/c99/3/bli_trsm_l_ref_4x4.h b/frame/1/invertv/bli_invertv_ref.h similarity index 84% rename from kernels/c99/3/bli_trsm_l_ref_4x4.h rename to frame/1/invertv/bli_invertv_ref.h index 0cb5702b5..df87f92bf 100644 --- a/kernels/c99/3/bli_trsm_l_ref_4x4.h +++ b/frame/1/invertv/bli_invertv_ref.h @@ -32,16 +32,19 @@ */ +/* +void bli_invertv_ref( obj_t* x ); +*/ + #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ -void PASTEMAC(ch,varname)( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - 
auxinfo_t* data \ - ); +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx \ + ); -INSERT_GENTPROT_BASIC( trsm_l_ref_4x4 ) +INSERT_GENTPROT_BASIC( invertv_ref ) diff --git a/frame/1/scal2v/bli_scal2v.c b/frame/1/scal2v/bli_scal2v.c index bc7652f74..f9acbb4bd 100644 --- a/frame/1/scal2v/bli_scal2v.c +++ b/frame/1/scal2v/bli_scal2v.c @@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( scal2v, SCAL2V_KERNEL ) +GENFRONT( scal2v, scal2v_kernel ) // diff --git a/frame/1/scal2v/bli_scal2v.h b/frame/1/scal2v/bli_scal2v.h index c3b3f3399..7091f45b2 100644 --- a/frame/1/scal2v/bli_scal2v.h +++ b/frame/1/scal2v/bli_scal2v.h @@ -33,7 +33,8 @@ */ #include "bli_scal2v_check.h" -#include "bli_scal2v_unb_var1.h" +#include "bli_scal2v_kernel.h" +#include "bli_scal2v_ref.h" // diff --git a/frame/1/scal2v/bli_scal2v_kernel.c b/frame/1/scal2v/bli_scal2v_kernel.c new file mode 100644 index 000000000..8fb0690d6 --- /dev/null +++ b/frame/1/scal2v/bli_scal2v_kernel.c @@ -0,0 +1,129 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T scal2v_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + dim_t n, + void* beta, + void* x, inc_t incx, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_kernel_void); +#endif +#endif + + +void bli_scal2v_kernel( obj_t* beta, + obj_t* x, + obj_t* y ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjx = bli_obj_conj_status( *x ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + num_t dt_beta; + void* buf_beta; + + FUNCPTR_T f; + + // If beta is a scalar constant, use dt_x to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the beta object and extract the buffer at the beta offset. + bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_beta][dt_x][dt_y]; + + // Invoke the function. + f( conjx, + n, + buf_beta, + buf_x, inc_x, + buf_y, inc_y ); +} + + +#undef GENTFUNC3 +#define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, kername ) \ +\ +void PASTEMAC3(chb,chx,chy,varname)( \ + conj_t conjx, \ + dim_t n, \ + void* beta, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC3(chb,chx,chy,kername)( conjx, \ + n, \ + beta, \ + x, incx, \ + y, incy ); \ +} + + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC3_BASIC( scal2v_kernel_void, SCAL2V_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3_MIX_D( scal2v_kernel_void, SCAL2V_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3_MIX_P( scal2v_kernel_void, SCAL2V_KERNEL ) +#endif + diff --git a/frame/1/scal2v/bli_scal2v_unb_var1.h b/frame/1/scal2v/bli_scal2v_kernel.h similarity index 88% rename from frame/1/scal2v/bli_scal2v_unb_var1.h rename to frame/1/scal2v/bli_scal2v_kernel.h index d9b84bfa1..8ac9c48e1 100644 --- a/frame/1/scal2v/bli_scal2v_unb_var1.h +++ b/frame/1/scal2v/bli_scal2v_kernel.h @@ -32,11 +32,15 @@ */ -void bli_scal2v_unb_var1( obj_t* beta, - obj_t* x, - obj_t* y ); +void bli_scal2v_kernel( obj_t* beta, + obj_t* x, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT3 #define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \ \ @@ -48,13 +52,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT3_BASIC( scal2v_unb_var1 ) +INSERT_GENTPROT3_BASIC( scal2v_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( scal2v_unb_var1 ) +INSERT_GENTPROT3_MIX_D( scal2v_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( scal2v_unb_var1 ) +INSERT_GENTPROT3_MIX_P( scal2v_kernel_void ) #endif diff --git a/frame/1/scal2v/bli_scal2v_unb_var1.c b/frame/1/scal2v/bli_scal2v_ref.c similarity index 85% rename from frame/1/scal2v/bli_scal2v_unb_var1.c rename to frame/1/scal2v/bli_scal2v_ref.c index 56df12a99..5cd78414e 100644 --- a/frame/1/scal2v/bli_scal2v_unb_var1.c +++ b/frame/1/scal2v/bli_scal2v_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T scal2v_fp typedef void (*FUNCPTR_T)( @@ -47,17 +48,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_ref); #endif #endif -void bli_scal2v_unb_var1( obj_t* beta, +void bli_scal2v_ref( obj_t* beta, obj_t* x, obj_t* y ) { @@ -94,18 +95,20 @@ void bli_scal2v_unb_var1( obj_t* beta, buf_x, inc_x, buf_y, inc_y ); } +*/ #undef GENTFUNC3 #define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, setvker ) \ \ -void PASTEMAC3(chb,chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* beta, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC3(chb,chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_b* beta_cast = beta; \ ctype_x* x_cast = x; \ @@ -155,13 +158,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC3_BASIC( scal2v_unb_var1, SETV_KERNEL ) +INSERT_GENTFUNC3_BASIC( scal2v_ref, SETV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( scal2v_unb_var1, SETV_KERNEL ) +INSERT_GENTFUNC3_MIX_D( scal2v_ref, SETV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( scal2v_unb_var1, SETV_KERNEL ) +INSERT_GENTFUNC3_MIX_P( scal2v_ref, SETV_KERNEL ) #endif diff --git a/frame/1/scal2v/bli_scal2v_ref.h b/frame/1/scal2v/bli_scal2v_ref.h new file mode 100644 index 000000000..5de497b1f --- /dev/null +++ b/frame/1/scal2v/bli_scal2v_ref.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_scal2v_ref( obj_t* beta, + obj_t* x, + obj_t* y ); +*/ + + +#undef GENTPROT3 +#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \ +\ +void PASTEMAC3(chb,chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT3_BASIC( scal2v_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT3_MIX_D( scal2v_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT3_MIX_P( scal2v_ref ) +#endif + diff --git a/frame/1/scalv/bli_scalv.c b/frame/1/scalv/bli_scalv.c index f017ae388..a1c29dffd 100644 --- a/frame/1/scalv/bli_scalv.c +++ b/frame/1/scalv/bli_scalv.c @@ -66,7 +66,7 @@ void PASTEMAC0(opname)( \ x ); \ } -GENFRONT( scalv, SCALV_KERNEL ) +GENFRONT( scalv, scalv_kernel ) // diff --git a/frame/1/scalv/bli_scalv.h b/frame/1/scalv/bli_scalv.h index 0759b0830..fa58f8b2d 100644 --- a/frame/1/scalv/bli_scalv.h +++ b/frame/1/scalv/bli_scalv.h @@ -36,7 +36,8 @@ #include "bli_scalv_check.h" #include "bli_scalv_int.h" -#include "bli_scalv_unb_var1.h" +#include "bli_scalv_kernel.h" +#include "bli_scalv_ref.h" // diff --git a/frame/1/scalv/bli_scalv_int.c b/frame/1/scalv/bli_scalv_int.c index d59c3d415..cd946c3b1 100644 --- a/frame/1/scalv/bli_scalv_int.c +++ b/frame/1/scalv/bli_scalv_int.c @@ -42,7 +42,7 @@ typedef void (*FUNCPTR_T)( obj_t* beta, static FUNCPTR_T vars[1][3] = { // unblocked optimized unblocked blocked - { bli_scalv_unb_var1, NULL, NULL } + { bli_scalv_kernel, bli_scalv_kernel, NULL } }; void bli_scalv_int( obj_t* beta, diff --git a/frame/1/scalv/bli_scalv_kernel.c b/frame/1/scalv/bli_scalv_kernel.c new file mode 100644 index 000000000..27e91251c --- /dev/null +++ b/frame/1/scalv/bli_scalv_kernel.c @@ -0,0 +1,120 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T scalv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjbeta, + dim_t n, + void* beta, + void* x, inc_t incx + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_kernel_void); +#else +static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_kernel_void); +#endif +#endif + + +void bli_scalv_kernel( obj_t* beta, + obj_t* x ) +{ + num_t dt_x = bli_obj_datatype( *x ); + + conj_t conjbeta = bli_obj_conj_status( *beta ); + + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + num_t dt_beta; + void* buf_beta; + + FUNCPTR_T f; + + // If beta is a scalar constant, use dt_x to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the beta object and extract the buffer at the beta offset. + bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_beta][dt_x]; + + // Invoke the function. + f( conjbeta, + n, + buf_beta, + buf_x, inc_x ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \ +\ +void PASTEMAC2(chb,chx,varname)( \ + conj_t conjbeta, \ + dim_t n, \ + void* beta, \ + void* x, inc_t incx \ + ) \ +{ \ + PASTEMAC2(chb,chx,kername)( conjbeta, \ + n, \ + beta, \ + x, incx ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC2_BASIC( scalv_kernel_void, SCALV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC2_MIX_D( scalv_kernel_void, SCALV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC2_MIX_P( scalv_kernel_void, SCALV_KERNEL ) +#endif + diff --git a/frame/1/scalv/bli_scalv_unb_var1.h b/frame/1/scalv/bli_scalv_kernel.h similarity index 89% rename from frame/1/scalv/bli_scalv_unb_var1.h rename to frame/1/scalv/bli_scalv_kernel.h index 94891b828..140630280 100644 --- a/frame/1/scalv/bli_scalv_unb_var1.h +++ b/frame/1/scalv/bli_scalv_kernel.h @@ -32,10 +32,14 @@ */ -void bli_scalv_unb_var1( obj_t* beta, - obj_t* x ); +void bli_scalv_kernel( obj_t* beta, + obj_t* x ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT2 #define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \ \ @@ -46,13 +50,13 @@ void PASTEMAC2(chb,chx,varname)( \ void* x, inc_t incx \ ); -INSERT_GENTPROT2_BASIC( scalv_unb_var1 ) +INSERT_GENTPROT2_BASIC( scalv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT2_MIX_D( scalv_unb_var1 ) +INSERT_GENTPROT2_MIX_D( scalv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT2_MIX_P( scalv_unb_var1 ) +INSERT_GENTPROT2_MIX_P( scalv_kernel_void ) #endif diff --git a/frame/1/scalv/bli_scalv_unb_var1.c b/frame/1/scalv/bli_scalv_ref.c similarity index 86% rename from frame/1/scalv/bli_scalv_unb_var1.c rename to frame/1/scalv/bli_scalv_ref.c index bf57cdf97..0bae6eff0 100644 --- a/frame/1/scalv/bli_scalv_unb_var1.c +++ b/frame/1/scalv/bli_scalv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T scalv_fp typedef void (*FUNCPTR_T)( @@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_unb_var1); +static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_unb_var1); +static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_ref); #else -static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_unb_var1); +static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_ref); #endif #endif -void bli_scalv_unb_var1( obj_t* beta, +void bli_scalv_ref( obj_t* beta, obj_t* x ) { num_t dt_x = bli_obj_datatype( *x ); @@ -88,17 +89,19 @@ void bli_scalv_unb_var1( obj_t* beta, buf_beta, buf_x, inc_x ); } +*/ #undef GENTFUNC2 #define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, setvker ) \ \ -void PASTEMAC2(chb,chx,varname)( \ - conj_t conjbeta, \ - dim_t n, \ - void* beta, \ - void* x, inc_t incx \ - ) \ +void PASTEMAC2(chb,chx,varname) \ + ( \ + conj_t conjbeta, \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx \ + ) \ { \ ctype_b* beta_cast = beta; \ ctype_x* x_cast = x; \ @@ -136,13 +139,13 @@ void PASTEMAC2(chb,chx,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC2_BASIC( scalv_unb_var1, SETV_KERNEL ) +INSERT_GENTFUNC2_BASIC( scalv_ref, SETV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC2_MIX_D( scalv_unb_var1, SETV_KERNEL ) +INSERT_GENTFUNC2_MIX_D( scalv_ref, SETV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC2_MIX_P( scalv_unb_var1, SETV_KERNEL ) +INSERT_GENTFUNC2_MIX_P( scalv_ref, SETV_KERNEL ) #endif diff --git a/frame/1/scalv/bli_scalv_ref.h b/frame/1/scalv/bli_scalv_ref.h new file mode 100644 index 000000000..0bca900cf --- /dev/null +++ b/frame/1/scalv/bli_scalv_ref.h @@ -0,0 +1,61 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_scalv_ref( obj_t* beta, + obj_t* x ); +*/ + + +#undef GENTPROT2 +#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \ +\ +void PASTEMAC2(chb,chx,varname) \ + ( \ + conj_t conjbeta, \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx \ + ); + +INSERT_GENTPROT2_BASIC( scalv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT2_MIX_D( scalv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT2_MIX_P( scalv_ref ) +#endif + diff --git a/frame/1/setv/bli_setv.c b/frame/1/setv/bli_setv.c index 88dcbe940..33da76bf0 100644 --- a/frame/1/setv/bli_setv.c +++ b/frame/1/setv/bli_setv.c @@ -67,7 +67,7 @@ void PASTEMAC0(opname)( \ x ); \ } -GENFRONT( setv, SETV_KERNEL ) +GENFRONT( setv, setv_kernel ) // diff --git a/frame/1/setv/bli_setv.h b/frame/1/setv/bli_setv.h index c5e35e265..edd6c2897 100644 --- a/frame/1/setv/bli_setv.h +++ b/frame/1/setv/bli_setv.h @@ -34,8 +34,8 @@ #include "bli_setv_check.h" -#include "bli_setv_unb_var1.h" -#include "bli_setv_unb_var2.h" +#include "bli_setv_kernel.h" +#include "bli_setv_ref.h" // diff --git a/frame/1/setv/bli_setv_kernel.c b/frame/1/setv/bli_setv_kernel.c new file mode 100644 index 000000000..6f4c1a827 --- /dev/null +++ b/frame/1/setv/bli_setv_kernel.c @@ -0,0 +1,113 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T setv_fp + +typedef void (*FUNCPTR_T)( + dim_t n, + void* beta, + void* x, inc_t incx + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_kernel_void); +#else +static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_kernel_void); +#endif +#endif + + +void bli_setv_kernel( obj_t* beta, + obj_t* x ) +{ + num_t dt_x = bli_obj_datatype( *x ); + + dim_t n = bli_obj_vector_dim( *x ); + + void* buf_x = bli_obj_buffer_at_off( *x ); + inc_t inc_x = bli_obj_vector_inc( *x ); + + void* buf_beta; + num_t dt_beta; + + FUNCPTR_T f; + + // If beta is a scalar constant, use dt_x to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the beta object and extract the buffer at the beta offset. + bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_beta][dt_x]; + + // Invoke the function. + f( n, + buf_beta, + buf_x, inc_x ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \ +\ +void PASTEMAC2(chb,chx,varname)( \ + dim_t n, \ + void* beta, \ + void* x, inc_t incx \ + ) \ +{ \ + PASTEMAC2(chb,chx,kername)( n, \ + beta, \ + x, incx ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC2_BASIC( setv_kernel_void, SETV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC2_MIX_D( setv_kernel_void, SETV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC2_MIX_P( setv_kernel_void, SETV_KERNEL ) +#endif diff --git a/frame/1/setv/bli_setv_unb_var1.h b/frame/1/setv/bli_setv_kernel.h similarity index 89% rename from frame/1/setv/bli_setv_unb_var1.h rename to frame/1/setv/bli_setv_kernel.h index 4f0cebc22..fe6009b5b 100644 --- a/frame/1/setv/bli_setv_unb_var1.h +++ b/frame/1/setv/bli_setv_kernel.h @@ -32,10 +32,14 @@ */ -void bli_setv_unb_var1( obj_t* beta, - obj_t* x ); +void bli_setv_kernel( obj_t* beta, + obj_t* x ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT2 #define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \ \ @@ -45,13 +49,13 @@ void PASTEMAC2(chb,chx,varname)( \ void* x, inc_t incx \ ); -INSERT_GENTPROT2_BASIC( setv_unb_var1 ) +INSERT_GENTPROT2_BASIC( setv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT2_MIX_D( setv_unb_var1 ) +INSERT_GENTPROT2_MIX_D( setv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT2_MIX_P( setv_unb_var1 ) +INSERT_GENTPROT2_MIX_P( setv_kernel_void ) #endif diff --git a/frame/1/setv/bli_setv_unb_var1.c b/frame/1/setv/bli_setv_ref.c similarity index 86% rename from frame/1/setv/bli_setv_unb_var1.c rename to frame/1/setv/bli_setv_ref.c index 09d0c8388..6fc59cc1c 100644 --- a/frame/1/setv/bli_setv_unb_var1.c +++ b/frame/1/setv/bli_setv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T setv_fp typedef void (*FUNCPTR_T)( @@ -45,17 +46,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_unb_var1); +static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_unb_var1); +static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_ref); #else -static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_unb_var1); +static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_ref); #endif #endif -void bli_setv_unb_var1( obj_t* beta, +void bli_setv_ref( obj_t* beta, obj_t* x ) { num_t dt_x = bli_obj_datatype( *x ); @@ -84,16 +85,17 @@ void bli_setv_unb_var1( obj_t* beta, buf_beta, buf_x, inc_x ); } - +*/ #undef GENTFUNC2 #define GENTFUNC2( ctype_b, ctype_x, chb, chx, opname, varname ) \ \ -void PASTEMAC2(chb,chx,varname)( \ - dim_t n, \ - void* beta, \ - void* x, inc_t incx \ - ) \ +void PASTEMAC2(chb,chx,varname) \ + ( \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx \ + ) \ { \ ctype_b* beta_cast = beta; \ ctype_x* chi1 = x; \ @@ -123,12 +125,12 @@ void PASTEMAC2(chb,chx,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC2_BASIC( setv, setv_unb_var1 ) +INSERT_GENTFUNC2_BASIC( setv, setv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC2_MIX_D( setv, setv_unb_var1 ) +INSERT_GENTFUNC2_MIX_D( setv, setv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC2_MIX_P( setv, setv_unb_var1 ) +INSERT_GENTFUNC2_MIX_P( setv, setv_ref ) #endif diff --git a/frame/1/setv/bli_setv_ref.h b/frame/1/setv/bli_setv_ref.h new file mode 100644 index 000000000..2962004f9 --- /dev/null +++ b/frame/1/setv/bli_setv_ref.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_setv_ref( obj_t* beta, + obj_t* x ); +*/ + + +#undef GENTPROT2 +#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \ +\ +void PASTEMAC2(chb,chx,varname) \ + ( \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx \ + ); + +INSERT_GENTPROT2_BASIC( setv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT2_MIX_D( setv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT2_MIX_P( setv_ref ) +#endif + diff --git a/frame/1/setv/bli_setv_unb_var2.c b/frame/1/setv/old/bli_setv_unb_var2.c similarity index 100% rename from frame/1/setv/bli_setv_unb_var2.c rename to frame/1/setv/old/bli_setv_unb_var2.c diff --git a/frame/1/setv/bli_setv_unb_var2.h b/frame/1/setv/old/bli_setv_unb_var2.h similarity index 100% rename from frame/1/setv/bli_setv_unb_var2.h rename to frame/1/setv/old/bli_setv_unb_var2.h diff --git a/frame/1/subv/bli_subv.c b/frame/1/subv/bli_subv.c index f936ceb75..8a9e97ed2 100644 --- a/frame/1/subv/bli_subv.c +++ b/frame/1/subv/bli_subv.c @@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( subv, SUBV_KERNEL ) +GENFRONT( subv, subv_kernel ) // diff --git a/frame/1/subv/bli_subv.h b/frame/1/subv/bli_subv.h index 3706d0aa5..49dd33b89 100644 --- a/frame/1/subv/bli_subv.h +++ b/frame/1/subv/bli_subv.h @@ -33,7 +33,8 @@ */ #include "bli_subv_check.h" -#include "bli_subv_unb_var1.h" +#include "bli_subv_kernel.h" +#include "bli_subv_ref.h" // diff --git a/frame/1/subv/bli_subv_kernel.c b/frame/1/subv/bli_subv_kernel.c new file mode 100644 index 000000000..094f8699c --- /dev/null +++ b/frame/1/subv/bli_subv_kernel.c @@ -0,0 +1,115 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T subv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + dim_t n, + void* x, inc_t incx, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_kernel_void); +#else +static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_kernel_void); +#endif +#endif + + +void bli_subv_kernel( obj_t* x, + obj_t* y ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjx = bli_obj_conj_status( *x ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + FUNCPTR_T f; + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y]; + + // Invoke the function. + f( conjx, + n, + buf_x, inc_x, + buf_y, inc_y ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \ +\ +void PASTEMAC2(chx,chy,varname)( \ + conj_t conjx, \ + dim_t n, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC2(chx,chy,kername)( conjx, \ + n, \ + x, incx, \ + y, incy ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC2_BASIC( subv_kernel_void, SUBV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC2_MIX_D( subv_kernel_void, SUBV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC2_MIX_P( subv_kernel_void, SUBV_KERNEL ) +#endif + diff --git a/frame/1/subv/bli_subv_unb_var1.h b/frame/1/subv/bli_subv_kernel.h similarity index 89% rename from frame/1/subv/bli_subv_unb_var1.h rename to frame/1/subv/bli_subv_kernel.h index 20b93e16c..a8252316a 100644 --- a/frame/1/subv/bli_subv_unb_var1.h +++ b/frame/1/subv/bli_subv_kernel.h @@ -32,10 +32,14 @@ */ -void bli_subv_unb_var1( obj_t* x, - obj_t* y ); +void bli_subv_kernel( obj_t* x, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ @@ -46,12 +50,12 @@ void PASTEMAC2(chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT2_BASIC( subv_unb_var1 ) +INSERT_GENTPROT2_BASIC( subv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT2_MIX_D( subv_unb_var1 ) +INSERT_GENTPROT2_MIX_D( subv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT2_MIX_P( subv_unb_var1 ) +INSERT_GENTPROT2_MIX_P( subv_kernel_void ) #endif diff --git a/frame/1/subv/bli_subv_unb_var1.c b/frame/1/subv/bli_subv_ref.c similarity index 85% rename from frame/1/subv/bli_subv_unb_var1.c rename to frame/1/subv/bli_subv_ref.c index 542fb11ef..5e4655044 100644 --- a/frame/1/subv/bli_subv_unb_var1.c +++ b/frame/1/subv/bli_subv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T subv_fp typedef void (*FUNCPTR_T)( @@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_unb_var1); +static FUNCPTR_T GENARRAY2_ALL(ftypes,subv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_unb_var1); +static FUNCPTR_T GENARRAY2_EXT(ftypes,subv_ref); #else -static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_unb_var1); +static FUNCPTR_T GENARRAY2_MIN(ftypes,subv_ref); #endif #endif -void bli_subv_unb_var1( obj_t* x, +void bli_subv_ref( obj_t* x, obj_t* y ) { num_t dt_x = bli_obj_datatype( *x ); @@ -83,17 +84,19 @@ void bli_subv_unb_var1( obj_t* x, buf_x, inc_x, buf_y, inc_y ); } +*/ #undef GENTFUNC2 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \ \ -void PASTEMAC2(chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC2(chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_x* x_cast = x; \ ctype_y* y_cast = y; \ @@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC2_BASIC( subv, subv_unb_var1 ) +INSERT_GENTFUNC2_BASIC( subv, subv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC2_MIX_D( subv, subv_unb_var1 ) +INSERT_GENTFUNC2_MIX_D( subv, subv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC2_MIX_P( subv, subv_unb_var1 ) +INSERT_GENTFUNC2_MIX_P( subv, subv_ref ) #endif diff --git a/frame/1/subv/bli_subv_ref.h b/frame/1/subv/bli_subv_ref.h new file mode 100644 index 000000000..96abd5f4c --- /dev/null +++ b/frame/1/subv/bli_subv_ref.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_subv_ref( obj_t* x, + obj_t* y ); +*/ + + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ +\ +void PASTEMAC2(chx,chy,varname) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( subv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT2_MIX_D( subv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT2_MIX_P( subv_ref ) +#endif diff --git a/frame/1/swapv/bli_swapv.c b/frame/1/swapv/bli_swapv.c index 08ba04189..e8fb0625f 100644 --- a/frame/1/swapv/bli_swapv.c +++ b/frame/1/swapv/bli_swapv.c @@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( swapv, SWAPV_KERNEL ) +GENFRONT( swapv, swapv_kernel ) // diff --git a/frame/1/swapv/bli_swapv.h b/frame/1/swapv/bli_swapv.h index bfd2c7a46..8fa54c707 100644 --- a/frame/1/swapv/bli_swapv.h +++ b/frame/1/swapv/bli_swapv.h @@ -33,7 +33,8 @@ */ #include "bli_swapv_check.h" -#include "bli_swapv_unb_var1.h" +#include "bli_swapv_kernel.h" +#include "bli_swapv_ref.h" // diff --git a/frame/1/swapv/bli_swapv_kernel.c b/frame/1/swapv/bli_swapv_kernel.c new file mode 100644 index 000000000..7feb9ee9b --- /dev/null +++ b/frame/1/swapv/bli_swapv_kernel.c @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T swapv_fp + +typedef void (*FUNCPTR_T)( + dim_t n, + void* x, inc_t incx, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY2_ALL(ftypes,swapv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY2_EXT(ftypes,swapv_kernel_void); +#else +static FUNCPTR_T GENARRAY2_MIN(ftypes,swapv_kernel_void); +#endif +#endif + + +void bli_swapv_kernel( obj_t* x, + obj_t* y ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + FUNCPTR_T f; + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y]; + + // Invoke the function. + f( n, + buf_x, inc_x, + buf_y, inc_y ); +} + + +#undef GENTFUNC2 +#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \ +\ +void PASTEMAC2(chx,chy,varname)( \ + dim_t n, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC2(chx,chy,kername)( n, \ + x, incx, \ + y, incy ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC2_BASIC( swapv_kernel_void, SWAPV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC2_MIX_D( swapv_kernel_void, SWAPV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC2_MIX_P( swapv_kernel_void, SWAPV_KERNEL ) +#endif + diff --git a/frame/1/swapv/bli_swapv_unb_var1.h b/frame/1/swapv/bli_swapv_kernel.h similarity index 89% rename from frame/1/swapv/bli_swapv_unb_var1.h rename to frame/1/swapv/bli_swapv_kernel.h index 3d2816d29..4f4536111 100644 --- a/frame/1/swapv/bli_swapv_unb_var1.h +++ b/frame/1/swapv/bli_swapv_kernel.h @@ -32,10 +32,14 @@ */ -void bli_swapv_unb_var1( obj_t* x, - obj_t* y ); +void bli_swapv_kernel( obj_t* x, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ @@ -45,12 +49,12 @@ void PASTEMAC2(chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT2_BASIC( swapv_unb_var1 ) +INSERT_GENTPROT2_BASIC( swapv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT2_MIX_D( swapv_unb_var1 ) +INSERT_GENTPROT2_MIX_D( swapv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT2_MIX_P( swapv_unb_var1 ) +INSERT_GENTPROT2_MIX_P( swapv_kernel_void ) #endif diff --git a/frame/1/swapv/bli_swapv_unb_var1.c b/frame/1/swapv/bli_swapv_ref.c similarity index 85% rename from frame/1/swapv/bli_swapv_unb_var1.c rename to frame/1/swapv/bli_swapv_ref.c index 05937dd71..b1d77d7b8 100644 --- a/frame/1/swapv/bli_swapv_unb_var1.c +++ b/frame/1/swapv/bli_swapv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T swapv_fp typedef void (*FUNCPTR_T)( @@ -45,17 +46,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY2_ALL(ftypes,swapv_unb_var1); +static FUNCPTR_T GENARRAY2_ALL(ftypes,swapv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY2_EXT(ftypes,swapv_unb_var1); +static FUNCPTR_T GENARRAY2_EXT(ftypes,swapv_ref); #else -static FUNCPTR_T GENARRAY2_MIN(ftypes,swapv_unb_var1); +static FUNCPTR_T GENARRAY2_MIN(ftypes,swapv_ref); #endif #endif -void bli_swapv_unb_var1( obj_t* x, +void bli_swapv_ref( obj_t* x, obj_t* y ) { num_t dt_x = bli_obj_datatype( *x ); @@ -80,16 +81,18 @@ void bli_swapv_unb_var1( obj_t* x, buf_x, inc_x, buf_y, inc_y ); } +*/ #undef GENTFUNC2 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \ \ -void PASTEMAC2(chx,chy,varname)( \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC2(chx,chy,varname) \ + ( \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_x* x_cast = x; \ ctype_y* y_cast = y; \ @@ -113,13 +116,13 @@ void PASTEMAC2(chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -INSERT_GENTFUNC2_BASIC( swapv, swapv_unb_var1 ) +INSERT_GENTFUNC2_BASIC( swapv, swapv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC2_MIX_D( swapv, swapv_unb_var1 ) +INSERT_GENTFUNC2_MIX_D( swapv, swapv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC2_MIX_P( swapv, swapv_unb_var1 ) +INSERT_GENTFUNC2_MIX_P( swapv, swapv_ref ) #endif diff --git a/frame/1/swapv/bli_swapv_ref.h b/frame/1/swapv/bli_swapv_ref.h new file mode 100644 index 000000000..2ec73336c --- /dev/null +++ b/frame/1/swapv/bli_swapv_ref.h @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* +void bli_swapv_ref( obj_t* x, + obj_t* y ); +*/ + + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ +\ +void PASTEMAC2(chx,chy,varname) \ + ( \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( swapv_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT2_MIX_D( swapv_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT2_MIX_P( swapv_ref ) +#endif diff --git a/frame/1f/axpy2v/bli_axpy2v.c b/frame/1f/axpy2v/bli_axpy2v.c index 8ae7d902c..f7c4bf2af 100644 --- a/frame/1f/axpy2v/bli_axpy2v.c +++ b/frame/1f/axpy2v/bli_axpy2v.c @@ -35,7 +35,6 @@ #include "blis.h" -/* // // Define object-based interface. // @@ -60,8 +59,7 @@ void PASTEMAC0(opname)( \ z ); \ } -GENFRONT( axpy2v, AXPY2V_KERNEL ) -*/ +GENFRONT( axpy2v, axpy2v_kernel ) // @@ -123,7 +121,6 @@ void PASTEMAC3(chx,chy,chz,opname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. -//INSERT_GENTFUNC3U12_BASIC( axpy2v, axpy2v_unb_var1 ) INSERT_GENTFUNC3U12_BASIC( axpy2v, AXPY2V_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT diff --git a/frame/1f/axpy2v/bli_axpy2v.h b/frame/1f/axpy2v/bli_axpy2v.h index 678005dcf..9a758974a 100644 --- a/frame/1f/axpy2v/bli_axpy2v.h +++ b/frame/1f/axpy2v/bli_axpy2v.h @@ -33,7 +33,8 @@ */ #include "bli_axpy2v_check.h" -#include "bli_axpy2v_unb_var1.h" +#include "bli_axpy2v_kernel.h" +#include "bli_axpy2v_ref.h" // diff --git a/frame/1f/axpy2v/bli_axpy2v_kernel.c b/frame/1f/axpy2v/bli_axpy2v_kernel.c new file mode 100644 index 000000000..6420eca65 --- /dev/null +++ b/frame/1f/axpy2v/bli_axpy2v_kernel.c @@ -0,0 +1,150 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T axpy2v_fp + +typedef void (*FUNCPTR_T)( + conj_t conjx, + conj_t conjy, + dim_t n, + void* alpha1, + void* alpha2, + void* x, inc_t incx, + void* y, inc_t incy, + void* z, inc_t incz + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,axpy2v_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,axpy2v_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,axpy2v_kernel_void); +#endif +#endif + + +void bli_axpy2v_kernel( obj_t* alpha1, + obj_t* alpha2, + obj_t* x, + obj_t* y, + obj_t* z ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjx = bli_obj_conj_status( *x ); + conj_t conjy = bli_obj_conj_status( *y ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + inc_t inc_z = bli_obj_vector_inc( *z ); + void* buf_z = bli_obj_buffer_at_off( *z ); + + num_t dt_alpha1; + void* buf_alpha1; + + num_t dt_alpha2; + void* buf_alpha2; + + FUNCPTR_T f; + + // If alpha is a scalar constant, use dt_x to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the alpha object and extract the buffer at the alpha offset. + bli_set_scalar_dt_buffer( alpha1, dt_x, dt_alpha1, buf_alpha1 ); + bli_set_scalar_dt_buffer( alpha2, dt_x, dt_alpha2, buf_alpha2 ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_alpha1][dt_x][dt_y]; + + // Invoke the function. 
+ f( conjx, + conjy, + n, + buf_alpha1, + buf_alpha2, + buf_x, inc_x, + buf_y, inc_y, + buf_z, inc_z ); +} + + +#undef GENTFUNC3U12 +#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \ +\ +void PASTEMAC3(chx,chy,chz,varname)( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + void* alpha1, \ + void* alpha2, \ + void* x, inc_t incx, \ + void* y, inc_t incy, \ + void* z, inc_t incz \ + ) \ +{ \ + PASTEMAC3(chx,chy,chz,kername)( conjx, \ + conjy, \ + n, \ + alpha1, \ + alpha2, \ + x, incx, \ + y, incy, \ + z, incz ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. +INSERT_GENTFUNC3U12_BASIC( axpy2v_kernel_void, AXPY2V_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3U12_MIX_D( axpy2v_kernel_void, AXPY2V_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3U12_MIX_P( axpy2v_kernel_void, AXPY2V_KERNEL ) +#endif + diff --git a/frame/1f/axpy2v/bli_axpy2v_unb_var1.h b/frame/1f/axpy2v/bli_axpy2v_kernel.h similarity index 86% rename from frame/1f/axpy2v/bli_axpy2v_unb_var1.h rename to frame/1f/axpy2v/bli_axpy2v_kernel.h index 1613033cf..0bd10485d 100644 --- a/frame/1f/axpy2v/bli_axpy2v_unb_var1.h +++ b/frame/1f/axpy2v/bli_axpy2v_kernel.h @@ -32,13 +32,17 @@ */ -void bli_axpy2v_unb_var1( obj_t* alpha1, - obj_t* alpha2, - obj_t* x, - obj_t* y, - obj_t* z ); +void bli_axpy2v_kernel( obj_t* alpha1, + obj_t* alpha2, + obj_t* x, + obj_t* y, + obj_t* z ); +// +// Prototype the void pointer kernel wrappers. 
+// + #undef GENTPROT3 #define GENTPROT3( ctype_x, ctype_y, ctype_z, chx, chy, chz, varname ) \ \ @@ -53,13 +57,13 @@ void PASTEMAC3(chx,chy,chz,varname)( \ void* z, inc_t incz \ ); -INSERT_GENTPROT3_BASIC( axpy2v_unb_var1 ) +INSERT_GENTPROT3_BASIC( axpy2v_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( axpy2v_unb_var1 ) +INSERT_GENTPROT3_MIX_D( axpy2v_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( axpy2v_unb_var1 ) +INSERT_GENTPROT3_MIX_P( axpy2v_kernel_void ) #endif diff --git a/frame/1f/axpy2v/bli_axpy2v_unb_var1.c b/frame/1f/axpy2v/bli_axpy2v_ref.c similarity index 83% rename from frame/1f/axpy2v/bli_axpy2v_unb_var1.c rename to frame/1f/axpy2v/bli_axpy2v_ref.c index 755a6d860..f999bf86f 100644 --- a/frame/1f/axpy2v/bli_axpy2v_unb_var1.c +++ b/frame/1f/axpy2v/bli_axpy2v_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T axpy2v_fp typedef void (*FUNCPTR_T)( @@ -50,17 +51,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,axpy2v_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,axpy2v_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,axpy2v_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,axpy2v_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,axpy2v_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,axpy2v_ref); #endif #endif -void bli_axpy2v_unb_var1( obj_t* alpha1, +void bli_axpy2v_ref( obj_t* alpha1, obj_t* alpha2, obj_t* x, obj_t* y, @@ -110,21 +111,23 @@ void bli_axpy2v_unb_var1( obj_t* alpha1, buf_y, inc_y, buf_z, inc_z ); } +*/ #undef GENTFUNC3U12 #define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \ \ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha1, \ - void* alpha2, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ) \ +void PASTEMAC3(chx,chy,chz,varname) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_xy* restrict alpha1, \ + ctype_xy* restrict alpha2, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_z* restrict z, inc_t incz \ + ) \ { \ ctype_xy* alpha1_cast = alpha1; \ ctype_xy* alpha2_cast = alpha2; \ @@ -146,13 +149,13 @@ void PASTEMAC3(chx,chy,chz,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3U12_BASIC( axpy2v_unb_var1, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_BASIC( axpy2v_ref, AXPYV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( axpy2v_unb_var1, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_D( axpy2v_ref, AXPYV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( axpy2v_unb_var1, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_P( axpy2v_ref, AXPYV_KERNEL ) #endif diff --git a/config/template/kernels/1f/bli_axpy2v_opt_var1.h b/frame/1f/axpy2v/bli_axpy2v_ref.h similarity index 70% rename from config/template/kernels/1f/bli_axpy2v_opt_var1.h rename to frame/1f/axpy2v/bli_axpy2v_ref.h index 3cfd0a169..34c0cb06d 100644 --- a/config/template/kernels/1f/bli_axpy2v_opt_var1.h +++ b/frame/1f/axpy2v/bli_axpy2v_ref.h @@ -32,31 +32,37 @@ */ +/* +void bli_axpy2v_ref( obj_t* alpha1, + obj_t* alpha2, + obj_t* x, + obj_t* y, + obj_t* z ); +*/ + -// -// Prototype kernel interfaces. -// #undef GENTPROT3U12 #define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \ \ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype_xy* restrict alpha1, \ - ctype_xy* restrict alpha2, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy, \ - ctype_z* restrict z, inc_t incz \ - ); +void PASTEMAC3(chx,chy,chz,varname) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_xy* restrict alpha1, \ + ctype_xy* restrict alpha2, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_z* restrict z, inc_t incz \ + ); -INSERT_GENTPROT3U12_BASIC( axpy2v_opt_var1 ) +INSERT_GENTPROT3U12_BASIC( axpy2v_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( axpy2v_opt_var1 ) +INSERT_GENTPROT3U12_MIX_D( axpy2v_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( axpy2v_opt_var1 ) +INSERT_GENTPROT3U12_MIX_P( axpy2v_ref ) #endif diff --git 
a/frame/1f/axpyf/bli_axpyf.c b/frame/1f/axpyf/bli_axpyf.c index 83fb6a1c3..24e8ca79e 100644 --- a/frame/1f/axpyf/bli_axpyf.c +++ b/frame/1f/axpyf/bli_axpyf.c @@ -35,7 +35,6 @@ #include "blis.h" -/* // // Define object-based interface. // @@ -68,8 +67,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( axpyf, AXPYF_KERNEL ) -*/ +GENFRONT( axpyf, axpyf_kernel ) // diff --git a/frame/1f/axpyf/bli_axpyf.h b/frame/1f/axpyf/bli_axpyf.h index f24d1cf01..c5590c1e5 100644 --- a/frame/1f/axpyf/bli_axpyf.h +++ b/frame/1f/axpyf/bli_axpyf.h @@ -34,7 +34,8 @@ #include "bli_axpyf_check.h" #include "bli_axpyf_fusefac.h" -#include "bli_axpyf_unb_var1.h" +#include "bli_axpyf_kernel.h" +#include "bli_axpyf_ref.h" // diff --git a/frame/1f/axpyf/bli_axpyf_kernel.c b/frame/1f/axpyf/bli_axpyf_kernel.c new file mode 100644 index 000000000..43ed77ae5 --- /dev/null +++ b/frame/1f/axpyf/bli_axpyf_kernel.c @@ -0,0 +1,149 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T axpyf_fp + +typedef void (*FUNCPTR_T)( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + void* alpha, + void* a, inc_t inca, inc_t lda, + void* x, inc_t incx, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyf_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyf_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyf_kernel_void); +#endif +#endif + + +void bli_axpyf_kernel( obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* y ) +{ + num_t dt_a = bli_obj_datatype( *a ); + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conja = bli_obj_conj_status( *a ); + conj_t conjx = bli_obj_conj_status( *x ); + + dim_t m = bli_obj_vector_dim( *y ); + dim_t b_n = bli_obj_vector_dim( *x ); + + void* buf_a = bli_obj_buffer_at_off( *a ); + inc_t rs_a = bli_obj_row_stride( *a ); + inc_t cs_a = bli_obj_col_stride( *a ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + num_t dt_alpha; + void* buf_alpha; + + FUNCPTR_T f; + + // The datatype of alpha MUST be the type union of a and x. 
This is to + // prevent any unnecessary loss of information during computation. + dt_alpha = bli_datatype_union( dt_a, dt_x ); + buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_a][dt_x][dt_y]; + + // Invoke the function. + f( conja, + conjx, + m, + b_n, + buf_alpha, + buf_a, rs_a, cs_a, + buf_x, inc_x, + buf_y, inc_y ); +} + + +#undef GENTFUNC3U12 +#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \ +\ +void PASTEMAC3(cha,chx,chy,varname)( \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + void* alpha, \ + void* a, inc_t inca, inc_t lda, \ + void* x, inc_t incx, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC3(cha,chx,chy,kername)( conja, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + y, incy ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. +INSERT_GENTFUNC3U12_BASIC( axpyf_kernel_void, AXPYF_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3U12_MIX_D( axpyf_kernel_void, AXPYF_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3U12_MIX_P( axpyf_kernel_void, AXPYF_KERNEL ) +#endif + diff --git a/frame/1f/axpyf/bli_axpyf_unb_var1.h b/frame/1f/axpyf/bli_axpyf_kernel.h similarity index 87% rename from frame/1f/axpyf/bli_axpyf_unb_var1.h rename to frame/1f/axpyf/bli_axpyf_kernel.h index b1057d513..3dca6b5ba 100644 --- a/frame/1f/axpyf/bli_axpyf_unb_var1.h +++ b/frame/1f/axpyf/bli_axpyf_kernel.h @@ -32,12 +32,16 @@ */ -void bli_axpyf_unb_var1( obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* y ); +void bli_axpyf_kernel( obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. 
+// + #undef GENTPROT3U12 #define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ \ @@ -52,13 +56,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT3U12_BASIC( axpyf_unb_var1 ) +INSERT_GENTPROT3U12_BASIC( axpyf_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( axpyf_unb_var1 ) +INSERT_GENTPROT3U12_MIX_D( axpyf_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( axpyf_unb_var1 ) +INSERT_GENTPROT3U12_MIX_P( axpyf_kernel_void ) #endif diff --git a/frame/1f/axpyf/bli_axpyf_unb_var1.c b/frame/1f/axpyf/bli_axpyf_ref.c similarity index 84% rename from frame/1f/axpyf/bli_axpyf_unb_var1.c rename to frame/1f/axpyf/bli_axpyf_ref.c index 7abe8eb78..1264717c3 100644 --- a/frame/1f/axpyf/bli_axpyf_unb_var1.c +++ b/frame/1f/axpyf/bli_axpyf_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T axpyf_fp typedef void (*FUNCPTR_T)( @@ -50,17 +51,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyf_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyf_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyf_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyf_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyf_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyf_ref); #endif #endif -void bli_axpyf_unb_var1( obj_t* alpha, +void bli_axpyf_ref( obj_t* alpha, obj_t* a, obj_t* x, obj_t* y ) @@ -109,21 +110,23 @@ void bli_axpyf_unb_var1( obj_t* alpha, buf_x, inc_x, buf_y, inc_y ); } +*/ #undef GENTFUNC3U12 #define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \ \ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC3(cha,chx,chy,varname) \ + ( \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ax* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_ax* alpha_cast = alpha; \ ctype_a* a_cast = a; \ @@ -154,13 +157,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3U12_BASIC( axpyf_unb_var1, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_BASIC( axpyf_ref, AXPYV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( axpyf_unb_var1, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_D( axpyf_ref, AXPYV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( axpyf_unb_var1, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_P( axpyf_ref, AXPYV_KERNEL ) #endif diff --git a/frame/1f/axpyf/bli_axpyf_ref.h b/frame/1f/axpyf/bli_axpyf_ref.h new file mode 100644 index 000000000..8f56a230d --- /dev/null +++ b/frame/1f/axpyf/bli_axpyf_ref.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* +void bli_axpyf_ref( obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* y ); +*/ + + +#undef GENTPROT3U12 +#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ +\ +void PASTEMAC3(cha,chx,chy,varname) \ + ( \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ax* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT3U12_BASIC( axpyf_ref ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTPROT3U12_MIX_D( axpyf_ref ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTPROT3U12_MIX_P( axpyf_ref ) +#endif + diff --git a/frame/1f/dotaxpyv/bli_dotaxpyv.c b/frame/1f/dotaxpyv/bli_dotaxpyv.c index 965b76bf4..68500110b 100644 --- a/frame/1f/dotaxpyv/bli_dotaxpyv.c +++ b/frame/1f/dotaxpyv/bli_dotaxpyv.c @@ -35,7 +35,6 @@ #include "blis.h" -/* // // Define object-based interface. 
// @@ -62,8 +61,7 @@ void PASTEMAC0(opname)( \ z ); \ } -GENFRONT( dotaxpyv, DOTAXPYV_KERNEL ) -*/ +GENFRONT( dotaxpyv, dotaxpyv_kernel ) // diff --git a/frame/1f/dotaxpyv/bli_dotaxpyv.h b/frame/1f/dotaxpyv/bli_dotaxpyv.h index 81412be05..e0831bd7c 100644 --- a/frame/1f/dotaxpyv/bli_dotaxpyv.h +++ b/frame/1f/dotaxpyv/bli_dotaxpyv.h @@ -33,7 +33,8 @@ */ #include "bli_dotaxpyv_check.h" -#include "bli_dotaxpyv_unb_var1.h" +#include "bli_dotaxpyv_kernel.h" +#include "bli_dotaxpyv_ref.h" // diff --git a/frame/1f/dotaxpyv/bli_dotaxpyv_kernel.c b/frame/1f/dotaxpyv/bli_dotaxpyv_kernel.c new file mode 100644 index 000000000..42814336a --- /dev/null +++ b/frame/1f/dotaxpyv/bli_dotaxpyv_kernel.c @@ -0,0 +1,155 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T dotaxpyv_fp + +typedef void (*FUNCPTR_T)( + conj_t conjxt, + conj_t conjx, + conj_t conjy, + dim_t n, + void* alpha, + void* x, inc_t incx, + void* y, inc_t incy, + void* rho, + void* z, inc_t incz + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotaxpyv_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotaxpyv_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotaxpyv_kernel_void); +#endif +#endif + + +void bli_dotaxpyv_kernel( obj_t* alpha, + obj_t* xt, + obj_t* x, + obj_t* y, + obj_t* rho, + obj_t* z ) +{ + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + num_t dt_z = bli_obj_datatype( *z ); + + conj_t conjxt = bli_obj_conj_status( *xt ); + conj_t conjx = bli_obj_conj_status( *x ); + conj_t conjy = bli_obj_conj_status( *y ); + dim_t n = bli_obj_vector_dim( *x ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + inc_t inc_z = bli_obj_vector_inc( *z ); + void* buf_z = bli_obj_buffer_at_off( *z ); + + void* buf_rho = bli_obj_buffer_at_off( *rho ); + + num_t dt_alpha; + void* buf_alpha; + + FUNCPTR_T f; + + 
// If alpha is a scalar constant, use dt_x to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the alpha object and extract the buffer at the alpha offset. + bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_x][dt_y][dt_z]; + + // Invoke the function. + f( conjxt, + conjx, + conjy, + n, + buf_alpha, + buf_x, inc_x, + buf_y, inc_y, + buf_rho, + buf_z, inc_z ); +} + + +#undef GENTFUNC3U12 +#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \ +\ +void PASTEMAC3(chx,chy,chz,varname)( \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + void* alpha, \ + void* x, inc_t incx, \ + void* y, inc_t incy, \ + void* rho, \ + void* z, inc_t incz \ + ) \ +{ \ + PASTEMAC3(chx,chy,chz,kername)( conjxt, \ + conjx, \ + conjy, \ + m, \ + alpha, \ + x, incx, \ + y, incy, \ + rho, \ + z, incz ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. 
+INSERT_GENTFUNC3U12_BASIC( dotaxpyv_kernel_void, DOTAXPYV_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_kernel_void, DOTAXPYV_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_kernel_void, DOTAXPYV_KERNEL ) +#endif + diff --git a/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.h b/frame/1f/dotaxpyv/bli_dotaxpyv_kernel.h similarity index 84% rename from frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.h rename to frame/1f/dotaxpyv/bli_dotaxpyv_kernel.h index db27b5e9e..bf1d5f097 100644 --- a/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.h +++ b/frame/1f/dotaxpyv/bli_dotaxpyv_kernel.h @@ -32,14 +32,18 @@ */ -void bli_dotaxpyv_unb_var1( obj_t* alpha, - obj_t* xt, - obj_t* x, - obj_t* y, - obj_t* rho, - obj_t* z ); +void bli_dotaxpyv_kernel( obj_t* alpha, + obj_t* xt, + obj_t* x, + obj_t* y, + obj_t* rho, + obj_t* z ); +// +// Prototype the void pointer kernel wrappers. +// + #undef GENTPROT3U12 #define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \ \ @@ -55,13 +59,13 @@ void PASTEMAC3(chx,chy,chz,varname)( \ void* z, inc_t incz \ ); -INSERT_GENTPROT3U12_BASIC( dotaxpyv_unb_var1 ) +INSERT_GENTPROT3U12_BASIC( dotaxpyv_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotaxpyv_unb_var1 ) +INSERT_GENTPROT3U12_MIX_D( dotaxpyv_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotaxpyv_unb_var1 ) +INSERT_GENTPROT3U12_MIX_P( dotaxpyv_kernel_void ) #endif diff --git a/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.c b/frame/1f/dotaxpyv/bli_dotaxpyv_ref.c similarity index 83% rename from frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.c rename to frame/1f/dotaxpyv/bli_dotaxpyv_ref.c index 5daed735a..1bd3e0735 100644 --- a/frame/1f/dotaxpyv/bli_dotaxpyv_unb_var1.c +++ b/frame/1f/dotaxpyv/bli_dotaxpyv_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T dotaxpyv_fp typedef void (*FUNCPTR_T)( @@ -51,17 
+52,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotaxpyv_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotaxpyv_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotaxpyv_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotaxpyv_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotaxpyv_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotaxpyv_ref); #endif #endif -void bli_dotaxpyv_unb_var1( obj_t* alpha, +void bli_dotaxpyv_ref( obj_t* alpha, obj_t* xt, obj_t* x, obj_t* y, @@ -113,22 +114,24 @@ void bli_dotaxpyv_unb_var1( obj_t* alpha, buf_rho, buf_z, inc_z ); } +*/ #undef GENTFUNC3U12 #define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, dotxvker, axpyvker ) \ \ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho, \ - void* z, inc_t incz \ - ) \ +void PASTEMAC3(chx,chy,chz,varname) \ + ( \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + ctype_x* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_xy* restrict rho, \ + ctype_z* restrict z, inc_t incz \ + ) \ { \ ctype_xy* one = PASTEMAC(chxy,1); \ ctype_xy* zero = PASTEMAC(chxy,0); \ @@ -155,13 +158,13 @@ void PASTEMAC3(chx,chy,chz,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3U12_BASIC2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_BASIC2( dotaxpyv_ref, DOTXV_KERNEL, AXPYV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_D2( dotaxpyv_ref, DOTXV_KERNEL, AXPYV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P2( dotaxpyv_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_P2( dotaxpyv_ref, DOTXV_KERNEL, AXPYV_KERNEL ) #endif diff --git a/config/template/kernels/1f/bli_dotaxpyv_opt_var1.h b/frame/1f/dotaxpyv/bli_dotaxpyv_ref.h similarity index 68% rename from config/template/kernels/1f/bli_dotaxpyv_opt_var1.h rename to frame/1f/dotaxpyv/bli_dotaxpyv_ref.h index 557ee965f..ffdaa7d0c 100644 --- a/config/template/kernels/1f/bli_dotaxpyv_opt_var1.h +++ b/frame/1f/dotaxpyv/bli_dotaxpyv_ref.h @@ -32,32 +32,39 @@ */ +/* +void bli_dotaxpyv_ref( obj_t* alpha, + obj_t* xt, + obj_t* x, + obj_t* y, + obj_t* rho, + obj_t* z ); +*/ + -// -// Prototype kernel interfaces. 
-// #undef GENTPROT3U12 #define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \ \ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_x* restrict alpha, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy, \ - ctype_xy* restrict rho, \ - ctype_z* restrict z, inc_t incz \ - ); +void PASTEMAC3(chx,chy,chz,varname) \ + ( \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + ctype_x* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_xy* restrict rho, \ + ctype_z* restrict z, inc_t incz \ + ); -INSERT_GENTPROT3U12_BASIC( dotaxpyv_opt_var1 ) +INSERT_GENTPROT3U12_BASIC( dotaxpyv_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotaxpyv_opt_var1 ) +INSERT_GENTPROT3U12_MIX_D( dotaxpyv_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotaxpyv_opt_var1 ) +INSERT_GENTPROT3U12_MIX_P( dotaxpyv_ref ) #endif diff --git a/frame/1f/dotxaxpyf/bli_dotxaxpyf.c b/frame/1f/dotxaxpyf/bli_dotxaxpyf.c index 5a75668d5..a65936203 100644 --- a/frame/1f/dotxaxpyf/bli_dotxaxpyf.c +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf.c @@ -35,7 +35,6 @@ #include "blis.h" -/* // // Define object-based interface. 
// @@ -66,8 +65,7 @@ void PASTEMAC0(opname)( \ z ); \ } -GENFRONT( dotxaxpyf, DOTXAXPYF_KERNEL ) -*/ +GENFRONT( dotxaxpyf, dotxaxpyf_kernel ) // diff --git a/frame/1f/dotxaxpyf/bli_dotxaxpyf.h b/frame/1f/dotxaxpyf/bli_dotxaxpyf.h index dee152c1c..5a2fd7f8b 100644 --- a/frame/1f/dotxaxpyf/bli_dotxaxpyf.h +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf.h @@ -34,8 +34,9 @@ #include "bli_dotxaxpyf_check.h" #include "bli_dotxaxpyf_fusefac.h" -#include "bli_dotxaxpyf_unb_var1.h" -#include "bli_dotxaxpyf_unb_var2.h" +#include "bli_dotxaxpyf_kernel.h" +#include "bli_dotxaxpyf_ref_var1.h" +#include "bli_dotxaxpyf_ref_var2.h" // diff --git a/frame/1f/dotxaxpyf/bli_dotxaxpyf_kernel.c b/frame/1f/dotxaxpyf/bli_dotxaxpyf_kernel.c new file mode 100644 index 000000000..babf08b99 --- /dev/null +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_kernel.c @@ -0,0 +1,188 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T dotxaxpyf_fp + +typedef void (*FUNCPTR_T)( + conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b_n, + void* alpha, + void* a, inc_t inca, inc_t lda, + void* w, inc_t incw, + void* x, inc_t incx, + void* beta, + void* y, inc_t incy, + void* z, inc_t incz + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxaxpyf_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxaxpyf_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxaxpyf_kernel_void); +#endif +#endif + + +void bli_dotxaxpyf_kernel( obj_t* alpha, + obj_t* at, + obj_t* a, + obj_t* w, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* z ) +{ + num_t dt_a = bli_obj_datatype( *a ); + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjat = bli_obj_conj_status( *at ); + conj_t conja = bli_obj_conj_status( *a ); + conj_t conjw = bli_obj_conj_status( *w ); + conj_t conjx = bli_obj_conj_status( *x ); + + dim_t m = bli_obj_vector_dim( *z ); + dim_t b_n = bli_obj_vector_dim( *y ); + + void* buf_a = bli_obj_buffer_at_off( *a ); + inc_t rs_a = bli_obj_row_stride( *a ); + inc_t cs_a = bli_obj_col_stride( *a ); + + inc_t inc_w = bli_obj_vector_inc( *w ); + void* buf_w = bli_obj_buffer_at_off( *w ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + inc_t inc_z = bli_obj_vector_inc( *z ); + void* buf_z = bli_obj_buffer_at_off( *z ); + + num_t dt_alpha; + void* buf_alpha; + + num_t dt_beta; + void* buf_beta; + + FUNCPTR_T f; + + // The datatype of alpha MUST be the type union of a and x. This is to + // prevent any unnecessary loss of information during computation. + dt_alpha = bli_datatype_union( dt_a, dt_x ); + buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha ); + + // The datatype of beta MUST be the same as the datatype of y. + dt_beta = dt_y; + buf_beta = bli_obj_buffer_for_1x1( dt_beta, *beta ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_a][dt_x][dt_y]; + + // Invoke the function. 
+ f( conjat, + conja, + conjw, + conjx, + m, + b_n, + buf_alpha, + buf_a, rs_a, cs_a, + buf_w, inc_w, + buf_x, inc_x, + buf_beta, + buf_y, inc_y, + buf_z, inc_z ); +} + + +#undef GENTFUNC3U12 +#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \ +\ +void PASTEMAC3(cha,chb,chc,varname)( \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + void* alpha, \ + void* a, inc_t inca, inc_t lda, \ + void* w, inc_t incw, \ + void* x, inc_t incx, \ + void* beta, \ + void* y, inc_t incy, \ + void* z, inc_t incz \ + ) \ +{ \ + PASTEMAC3(cha,chb,chc,kername)( conjat, \ + conja, \ + conjw, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + w, incw, \ + x, incx, \ + beta, \ + y, incy, \ + z, incz ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. +INSERT_GENTFUNC3U12_BASIC( dotxaxpyf_kernel_void, DOTXAXPYF_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_kernel_void, DOTXAXPYF_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_kernel_void, DOTXAXPYF_KERNEL ) +#endif + diff --git a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.h b/frame/1f/dotxaxpyf/bli_dotxaxpyf_kernel.h similarity index 83% rename from frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.h rename to frame/1f/dotxaxpyf/bli_dotxaxpyf_kernel.h index 8fc183d3f..eeb4a10eb 100644 --- a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.h +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_kernel.h @@ -32,16 +32,20 @@ */ -void bli_dotxaxpyf_unb_var1( obj_t* alpha, - obj_t* at, - obj_t* a, - obj_t* w, - obj_t* x, - obj_t* beta, - obj_t* y, - obj_t* z ); +void bli_dotxaxpyf_kernel( obj_t* alpha, + obj_t* at, + obj_t* a, + obj_t* w, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* z ); +// +// Prototype the void pointer kernel wrappers. 
+// + #undef GENTPROT3U12 #define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \ \ @@ -61,13 +65,13 @@ void PASTEMAC3(cha,chb,chc,varname)( \ void* z, inc_t incz \ ); -INSERT_GENTPROT3U12_BASIC( dotxaxpyf_unb_var1 ) +INSERT_GENTPROT3U12_BASIC( dotxaxpyf_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_unb_var1 ) +INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_unb_var1 ) +INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_kernel_void ) #endif diff --git a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.c b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var1.c similarity index 84% rename from frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.c rename to frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var1.c index ce2a5d24e..b43828076 100644 --- a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var1.c +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var1.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T dotxaxpyf_fp typedef void (*FUNCPTR_T)( @@ -55,17 +56,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxaxpyf_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxaxpyf_ref_var1); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxaxpyf_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxaxpyf_ref_var1); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxaxpyf_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxaxpyf_ref_var1); #endif #endif -void bli_dotxaxpyf_unb_var1( obj_t* alpha, +void bli_dotxaxpyf_ref_var1( obj_t* alpha, obj_t* at, obj_t* a, obj_t* w, @@ -138,26 +139,28 @@ void bli_dotxaxpyf_unb_var1( obj_t* alpha, buf_y, inc_y, buf_z, inc_z ); } +*/ #undef GENTFUNC3U12 #define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, dotxvker, axpyvker ) \ \ -void PASTEMAC3(cha,chb,chc,varname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* w, inc_t incw, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ) \ +void PASTEMAC3(cha,chb,chc,varname) \ + ( \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ab* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_b* restrict w, inc_t incw, \ + ctype_b* restrict x, inc_t incx, \ + ctype_c* restrict beta, \ + ctype_c* restrict y, inc_t incy, \ + ctype_c* restrict z, inc_t incz \ + ) \ { \ ctype_ab* alpha_cast = alpha; \ ctype_a* a_cast = a; \ @@ -212,13 +215,13 @@ void PASTEMAC3(cha,chb,chc,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3U12_BASIC2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_BASIC2( dotxaxpyf_ref_var1, DOTXV_KERNEL, AXPYV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_D2( dotxaxpyf_ref_var1, DOTXV_KERNEL, AXPYV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P2( dotxaxpyf_unb_var1, DOTXV_KERNEL, AXPYV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_P2( dotxaxpyf_ref_var1, DOTXV_KERNEL, AXPYV_KERNEL ) #endif diff --git a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.h b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var1.h similarity index 68% rename from frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.h rename to frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var1.h index 3d97d1386..5315a8a14 100644 --- a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.h +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var1.h @@ -32,7 +32,8 @@ */ -void bli_dotxaxpyf_unb_var2( obj_t* alpha, +/* +void bli_dotxaxpyf_ref_var1( obj_t* alpha, obj_t* at, obj_t* a, obj_t* w, @@ -40,34 +41,36 @@ void bli_dotxaxpyf_unb_var2( obj_t* alpha, obj_t* beta, obj_t* y, obj_t* z ); +*/ #undef GENTPROT3U12 #define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \ \ -void PASTEMAC3(cha,chb,chc,varname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* w, inc_t incw, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ); +void PASTEMAC3(cha,chb,chc,varname) \ + ( \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ab* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_b* restrict w, inc_t incw, \ + ctype_b* restrict x, inc_t incx, \ + ctype_c* restrict beta, \ + ctype_c* restrict y, inc_t incy, \ + ctype_c* restrict z, inc_t 
incz \ + ); -INSERT_GENTPROT3U12_BASIC( dotxaxpyf_unb_var2 ) +INSERT_GENTPROT3U12_BASIC( dotxaxpyf_ref_var1 ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_unb_var2 ) +INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_ref_var1 ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_unb_var2 ) +INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_ref_var1 ) #endif diff --git a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.c b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var2.c similarity index 83% rename from frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.c rename to frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var2.c index 67155dbaf..7aa70ae0c 100644 --- a/frame/1f/dotxaxpyf/bli_dotxaxpyf_unb_var2.c +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var2.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T dotxaxpyf_fp typedef void (*FUNCPTR_T)( @@ -55,17 +56,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxaxpyf_unb_var2); +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxaxpyf_ref_var2); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxaxpyf_unb_var2); +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxaxpyf_ref_var2); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxaxpyf_unb_var2); +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxaxpyf_ref_var2); #endif #endif -void bli_dotxaxpyf_unb_var2( obj_t* alpha, +void bli_dotxaxpyf_ref_var2( obj_t* alpha, obj_t* at, obj_t* a, obj_t* w, @@ -138,26 +139,28 @@ void bli_dotxaxpyf_unb_var2( obj_t* alpha, buf_y, inc_y, buf_z, inc_z ); } +*/ #undef GENTFUNC3U12 #define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, dotxfker, axpyfker ) \ \ -void PASTEMAC3(cha,chb,chc,varname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* w, inc_t incw, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ) \ +void PASTEMAC3(cha,chb,chc,varname) \ + ( \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ab* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_b* restrict w, inc_t incw, \ + ctype_b* restrict x, inc_t incx, \ + ctype_c* restrict beta, \ + ctype_c* restrict y, inc_t incy, \ + ctype_c* restrict z, inc_t incz \ + ) \ { \ ctype_ab* alpha_cast = alpha; \ ctype_a* a_cast = a; \ @@ -193,13 +196,13 @@ void PASTEMAC3(cha,chb,chc,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3U12_BASIC2( dotxaxpyf_unb_var2, DOTXF_KERNEL, AXPYF_KERNEL ) +INSERT_GENTFUNC3U12_BASIC2( dotxaxpyf_ref_var2, DOTXF_KERNEL, AXPYF_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D2( dotxaxpyf_unb_var2, DOTXF_KERNEL, AXPYF_KERNEL ) +INSERT_GENTFUNC3U12_MIX_D2( dotxaxpyf_ref_var2, DOTXF_KERNEL, AXPYF_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P2( dotxaxpyf_unb_var2, DOTXF_KERNEL, AXPYF_KERNEL ) +INSERT_GENTFUNC3U12_MIX_P2( dotxaxpyf_ref_var2, DOTXF_KERNEL, AXPYF_KERNEL ) #endif diff --git a/kernels/x86_64/core2-sse3/1f/bli_dotxaxpyf_opt_var1.h b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var2.h similarity index 62% rename from kernels/x86_64/core2-sse3/1f/bli_dotxaxpyf_opt_var1.h rename to frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var2.h index 6562710d6..742f56981 100644 --- a/kernels/x86_64/core2-sse3/1f/bli_dotxaxpyf_opt_var1.h +++ b/frame/1f/dotxaxpyf/bli_dotxaxpyf_ref_var2.h @@ -32,33 +32,43 @@ */ +void bli_dotxaxpyf_ref_var2( obj_t* alpha, + obj_t* at, + obj_t* a, + obj_t* w, + obj_t* x, + obj_t* beta, + obj_t* y, + obj_t* z ); + #undef GENTPROT3U12 #define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \ \ -void PASTEMAC3(cha,chb,chc,varname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ab* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_b* restrict w, inc_t incw, \ - ctype_b* restrict x, inc_t incx, \ - ctype_c* restrict beta, \ - ctype_c* restrict y, inc_t incy, \ - ctype_c* restrict z, inc_t incz \ - ); +void PASTEMAC3(cha,chb,chc,varname) \ + ( \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ab* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_b* restrict w, inc_t incw, \ + ctype_b* restrict x, inc_t incx, \ + ctype_c* restrict beta, \ + ctype_c* restrict y, inc_t incy, \ + 
ctype_c* restrict z, inc_t incz \ + ); -INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 ) +INSERT_GENTPROT3U12_BASIC( dotxaxpyf_ref_var2 ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 ) +INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_ref_var2 ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 ) +INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_ref_var2 ) #endif diff --git a/frame/1f/dotxf/bli_dotxf.c b/frame/1f/dotxf/bli_dotxf.c index 4f358f3b6..4f449a0b6 100644 --- a/frame/1f/dotxf/bli_dotxf.c +++ b/frame/1f/dotxf/bli_dotxf.c @@ -35,7 +35,6 @@ #include "blis.h" -/* // // Define object-based interface. // @@ -70,8 +69,7 @@ void PASTEMAC0(opname)( \ y ); \ } -GENFRONT( dotxf, DOTXF_KERNEL ) -*/ +GENFRONT( dotxf, dotxf_kernel ) // diff --git a/frame/1f/dotxf/bli_dotxf.h b/frame/1f/dotxf/bli_dotxf.h index 412f75deb..06c9d151b 100644 --- a/frame/1f/dotxf/bli_dotxf.h +++ b/frame/1f/dotxf/bli_dotxf.h @@ -34,7 +34,8 @@ #include "bli_dotxf_check.h" #include "bli_dotxf_fusefac.h" -#include "bli_dotxf_unb_var1.h" +#include "bli_dotxf_kernel.h" +#include "bli_dotxf_ref.h" // diff --git a/frame/1f/dotxf/bli_dotxf_kernel.c b/frame/1f/dotxf/bli_dotxf_kernel.c new file mode 100644 index 000000000..841ee7c6a --- /dev/null +++ b/frame/1f/dotxf/bli_dotxf_kernel.c @@ -0,0 +1,161 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T dotxf_fp + +typedef void (*FUNCPTR_T)( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b_n, + void* alpha, + void* a, inc_t inca, inc_t lda, + void* x, inc_t incx, + void* beta, + void* y, inc_t incy + ); + +// If some mixed datatype functions will not be compiled, we initialize +// the corresponding elements of the function array to NULL. 
+#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_kernel_void); +#else +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_kernel_void); +#else +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_kernel_void); +#endif +#endif + + +void bli_dotxf_kernel( obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y ) +{ + num_t dt_a = bli_obj_datatype( *a ); + num_t dt_x = bli_obj_datatype( *x ); + num_t dt_y = bli_obj_datatype( *y ); + + conj_t conjat = bli_obj_conj_status( *a ); + conj_t conjx = bli_obj_conj_status( *x ); + + dim_t m = bli_obj_vector_dim( *x ); + dim_t b_n = bli_obj_vector_dim( *y ); + + void* buf_a = bli_obj_buffer_at_off( *a ); + inc_t rs_a = bli_obj_row_stride( *a ); + inc_t cs_a = bli_obj_col_stride( *a ); + + inc_t inc_x = bli_obj_vector_inc( *x ); + void* buf_x = bli_obj_buffer_at_off( *x ); + + inc_t inc_y = bli_obj_vector_inc( *y ); + void* buf_y = bli_obj_buffer_at_off( *y ); + + num_t dt_alpha; + void* buf_alpha; + + num_t dt_beta; + void* buf_beta; + + FUNCPTR_T f; + + // The datatype of alpha MUST be the type union of a and x. This is to + // prevent any unnecessary loss of information during computation. + dt_alpha = bli_datatype_union( dt_a, dt_x ); + buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha ); + + // The datatype of beta MUST be the same as the datatype of y. + dt_beta = dt_y; + buf_beta = bli_obj_buffer_for_1x1( dt_beta, *beta ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_a][dt_x][dt_y]; + + // Invoke the function. 
+ f( conjat, + conjx, + m, + b_n, + buf_alpha, + buf_a, rs_a, cs_a, + buf_x, inc_x, + buf_beta, + buf_y, inc_y ); +} + + +#undef GENTFUNC3U12 +#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \ +\ +void PASTEMAC3(cha,chx,chy,varname)( \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + void* alpha, \ + void* a, inc_t inca, inc_t lda, \ + void* x, inc_t incx, \ + void* beta, \ + void* y, inc_t incy \ + ) \ +{ \ + PASTEMAC3(cha,chx,chy,kername)( conjat, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + beta, \ + y, incy ); \ +} + +// Define the basic set of functions unconditionally, and then also some +// mixed datatype functions if requested. +INSERT_GENTFUNC3U12_BASIC( dotxf_kernel_void, DOTXF_KERNEL ) + +#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT +INSERT_GENTFUNC3U12_MIX_D( dotxf_kernel_void, DOTXF_KERNEL ) +#endif + +#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT +INSERT_GENTFUNC3U12_MIX_P( dotxf_kernel_void, DOTXF_KERNEL ) +#endif + diff --git a/frame/1f/dotxf/bli_dotxf_unb_var1.h b/frame/1f/dotxf/bli_dotxf_kernel.h similarity index 86% rename from frame/1f/dotxf/bli_dotxf_unb_var1.h rename to frame/1f/dotxf/bli_dotxf_kernel.h index d32ba77b5..1821a22a7 100644 --- a/frame/1f/dotxf/bli_dotxf_unb_var1.h +++ b/frame/1f/dotxf/bli_dotxf_kernel.h @@ -32,13 +32,17 @@ */ -void bli_dotxf_unb_var1( obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y ); +void bli_dotxf_kernel( obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y ); +// +// Prototype the void pointer kernel wrappers. 
+// + #undef GENTPROT3U12 #define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ \ @@ -54,13 +58,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \ void* y, inc_t incy \ ); -INSERT_GENTPROT3U12_BASIC( dotxf_unb_var1 ) +INSERT_GENTPROT3U12_BASIC( dotxf_kernel_void ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxf_unb_var1 ) +INSERT_GENTPROT3U12_MIX_D( dotxf_kernel_void ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxf_unb_var1 ) +INSERT_GENTPROT3U12_MIX_P( dotxf_kernel_void ) #endif diff --git a/frame/1f/dotxf/bli_dotxf_unb_var1.c b/frame/1f/dotxf/bli_dotxf_ref.c similarity index 84% rename from frame/1f/dotxf/bli_dotxf_unb_var1.c rename to frame/1f/dotxf/bli_dotxf_ref.c index a4bb10a51..27b46756e 100644 --- a/frame/1f/dotxf/bli_dotxf_unb_var1.c +++ b/frame/1f/dotxf/bli_dotxf_ref.c @@ -34,6 +34,7 @@ #include "blis.h" +/* #define FUNCPTR_T dotxf_fp typedef void (*FUNCPTR_T)( @@ -51,17 +52,17 @@ typedef void (*FUNCPTR_T)( // If some mixed datatype functions will not be compiled, we initialize // the corresponding elements of the function array to NULL. 
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_unb_var1); +static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxf_ref); #else #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_unb_var1); +static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxf_ref); #else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_unb_var1); +static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxf_ref); #endif #endif -void bli_dotxf_unb_var1( obj_t* alpha, +void bli_dotxf_ref( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, @@ -119,22 +120,24 @@ void bli_dotxf_unb_var1( obj_t* alpha, buf_beta, buf_y, inc_y ); } +*/ #undef GENTFUNC3U12 #define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \ \ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy \ - ) \ +void PASTEMAC3(cha,chx,chy,varname) \ + ( \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ax* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict beta, \ + ctype_y* restrict y, inc_t incy \ + ) \ { \ ctype_ax* alpha_cast = alpha; \ ctype_a* a_cast = a; \ @@ -165,13 +168,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \ // Define the basic set of functions unconditionally, and then also some // mixed datatype functions if requested. 
-INSERT_GENTFUNC3U12_BASIC( dotxf_unb_var1, DOTXV_KERNEL ) +INSERT_GENTFUNC3U12_BASIC( dotxf_ref, DOTXV_KERNEL ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( dotxf_unb_var1, DOTXV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_D( dotxf_ref, DOTXV_KERNEL ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( dotxf_unb_var1, DOTXV_KERNEL ) +INSERT_GENTFUNC3U12_MIX_P( dotxf_ref, DOTXV_KERNEL ) #endif diff --git a/config/template/kernels/1f/bli_axpyf_opt_var1.h b/frame/1f/dotxf/bli_dotxf_ref.h similarity index 70% rename from config/template/kernels/1f/bli_axpyf_opt_var1.h rename to frame/1f/dotxf/bli_dotxf_ref.h index 927531805..6f03252d6 100644 --- a/config/template/kernels/1f/bli_axpyf_opt_var1.h +++ b/frame/1f/dotxf/bli_dotxf_ref.h @@ -32,31 +32,38 @@ */ +/* +void bli_dotxf_ref( obj_t* alpha, + obj_t* a, + obj_t* x, + obj_t* beta, + obj_t* y ); +*/ + -// -// Prototype kernel interfaces. -// #undef GENTPROT3U12 #define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ \ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy \ - ); +void PASTEMAC3(cha,chx,chy,varname) \ + ( \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ax* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict beta, \ + ctype_y* restrict y, inc_t incy \ + ); -INSERT_GENTPROT3U12_BASIC( axpyf_opt_var1 ) +INSERT_GENTPROT3U12_BASIC( dotxf_ref ) #ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( axpyf_opt_var1 ) +INSERT_GENTPROT3U12_MIX_D( dotxf_ref ) #endif #ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( axpyf_opt_var1 ) +INSERT_GENTPROT3U12_MIX_P( dotxf_ref ) #endif diff --git a/frame/1m/packm/bli_packm.h 
b/frame/1m/packm/bli_packm.h index 841072a28..a89cd75dc 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -42,9 +42,13 @@ #include "bli_packm_unb_var1.h" #include "bli_packm_blk_var1.h" +#include "bli_packm_blk_var3.h" +#include "bli_packm_blk_var4.h" -#include "bli_packm_cxk.h" #include "bli_packm_gen_cxk.h" #include "bli_packm_herm_cxk.h" #include "bli_packm_tri_cxk.h" +#include "bli_packm_cxk.h" +#include "bli_packm_cxk_ri.h" +#include "bli_packm_cxk_ri3.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index aa0c0e56b..a5cc5aee4 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -361,8 +361,7 @@ void PASTEMAC(ch,varname )( \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ \ - /* NOTE: p_inc should be set to ps_p to properly support - BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \ + /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ } \ else \ @@ -386,8 +385,7 @@ void PASTEMAC(ch,varname )( \ c_begin, rs_c, cs_c, \ p_begin, rs_p, cs_p ); \ \ - /* NOTE: p_inc should be set to ps_p to properly support - BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \ + /* NOTE: This value is equivalent to ps_p. */ \ p_inc = ldp * panel_len_max_i; \ } \ } \ diff --git a/frame/1m/packm/bli_packm_blk_var3.c b/frame/1m/packm/bli_packm_blk_var3.c new file mode 100644 index 000000000..302f97bc8 --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var3.c @@ -0,0 +1,446 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_fp + +typedef void (*FUNCPTR_T)( + struc_t strucc, + doff_t diagoffc, + diag_t diagc, + uplo_t uploc, + trans_t transc, + bool_t invdiag, + bool_t revifup, + bool_t reviflo, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p + ); + +//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3); + + +void bli_packm_blk_var3( obj_t* c, + obj_t* p ) +{ + num_t dt_cp = bli_obj_datatype( *c ); + + struc_t strucc = bli_obj_struc( *c ); + doff_t diagoffc = bli_obj_diag_offset( *c ); + diag_t diagc = bli_obj_diag( *c ); + uplo_t uploc = bli_obj_uplo( *c ); + trans_t transc = bli_obj_conjtrans_status( *c ); + bool_t invdiag = bli_obj_has_inverted_diag( *p ); + bool_t revifup = bli_obj_is_pack_rev_if_upper( *p ); + bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p ); + + dim_t m_p = bli_obj_length( *p ); + dim_t n_p = bli_obj_width( *p ); + dim_t m_max_p = bli_obj_padded_length( *p ); + dim_t n_max_p = bli_obj_padded_width( *p ); + + void* buf_c = bli_obj_buffer_at_off( *c ); + inc_t rs_c = bli_obj_row_stride( *c ); + inc_t cs_c = bli_obj_col_stride( *c ); + + void* buf_p = bli_obj_buffer_at_off( *p ); + inc_t rs_p = bli_obj_row_stride( *p ); + inc_t cs_p = bli_obj_col_stride( *p ); + dim_t pd_p = bli_obj_panel_dim( *p ); + inc_t ps_p = bli_obj_panel_stride( *p ); + + obj_t kappa; + obj_t* kappa_p; + void* buf_kappa; + + FUNCPTR_T f; + + + // We want this variant to behave identically to that of variant 1 + // in the real domain. + if ( bli_is_real( dt_cp ) ) + { + bli_packm_blk_var1( c, p, &BLIS_SINGLE_THREADED ); + return; + } + + // The value for kappa we use will depend on whether the scalar + // attached to A has a nonzero imaginary component. If it does, + // then we will apply the scalar during packing to facilitate + // implementing complex domain micro-kernels in terms of their + // real domain counterparts. 
(In the aforementioned situation, + // applying a real scalar is easy, but applying a complex one is + // harder, so we avoid the need altogether with the code below.) + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); + + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + + + // Acquire the buffer to the kappa chosen above. + buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); + + + // Index into the type combination array to extract the correct + // function pointer. + //f = ftypes[dt_cp]; + if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var3; + else f = bli_zpackm_blk_var3; + + // Invoke the function. + f( strucc, + diagoffc, + diagc, + uploc, + transc, + invdiag, + revifup, + reviflo, + m_p, + n_p, + m_max_p, + n_max_p, + buf_kappa, + buf_c, rs_c, cs_c, + buf_p, rs_p, cs_p, + pd_p, ps_p ); +} + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + bool_t invdiag, \ + bool_t revifup, \ + bool_t reviflo, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ + ctype* restrict c_begin; \ + ctype* restrict p_begin; \ +\ + dim_t iter_dim; \ + dim_t num_iter; \ + dim_t it, ic, ip; \ + dim_t ic0, ip0; \ + doff_t ic_inc, ip_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ + dim_t panel_len_full; \ + dim_t panel_len_i; \ + dim_t panel_len_max; \ + 
dim_t panel_len_max_i; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + dim_t panel_off_i; \ + inc_t vs_c; \ + inc_t ldc; \ + inc_t ldp, p_inc; \ + dim_t* m_panel_full; \ + dim_t* n_panel_full; \ + dim_t* m_panel_use; \ + dim_t* n_panel_use; \ + dim_t* m_panel_max; \ + dim_t* n_panel_max; \ + conj_t conjc; \ +\ + ctype* restrict c_use; \ + ctype* restrict p_use; \ + doff_t diagoffp_i; \ +\ +\ + /* If C is zeros and part of a triangular matrix, then we don't need + to pack it. */ \ + if ( bli_is_zeros( uploc ) && \ + bli_is_triangular( strucc ) ) return; \ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* If c needs a transposition, induce it so that we can more simply + express the remaining parameters and code. */ \ + if ( bli_does_trans( transc ) ) \ + { \ + bli_swap_incs( rs_c, cs_c ); \ + bli_negate_diag_offset( diagoffc ); \ + bli_toggle_uplo( uploc ); \ + bli_toggle_trans( transc ); \ + } \ +\ + /* If the strides of P indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len_full = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + ldc = rs_c; \ + vs_c = cs_c; \ + diagoffc_inc = -( doff_t)panel_dim_max; \ + ldp = rs_p; \ + m_panel_full = &m; \ + n_panel_full = &panel_dim_i; \ + m_panel_use = &panel_len_i; \ + n_panel_use = &panel_dim_i; \ + m_panel_max = &panel_len_max_i; \ + n_panel_max = &panel_dim_max; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panels. 
*/ \ + iter_dim = m; \ + panel_len_full = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + ldc = cs_c; \ + vs_c = rs_c; \ + diagoffc_inc = ( doff_t )panel_dim_max; \ + ldp = cs_p; \ + m_panel_full = &panel_dim_i; \ + n_panel_full = &n; \ + m_panel_use = &panel_dim_i; \ + n_panel_use = &panel_len_i; \ + m_panel_max = &panel_dim_max; \ + n_panel_max = &panel_len_max_i; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. */ \ + if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ + ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ + { \ + ic0 = (num_iter - 1) * panel_dim_max; \ + ic_inc = -panel_dim_max; \ + ip0 = num_iter - 1; \ + ip_inc = -1; \ + } \ + else \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + ip0 = 0; \ + ip_inc = 1; \ + } \ +\ + p_begin = p_cast; \ +\ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ + { \ + panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ + c_begin = c_cast + (ic )*vs_c; \ +\ + if ( bli_is_triangular( strucc ) && \ + bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ + { \ + /* This case executes if the panel belongs to a triangular + matrix AND is completely unstored (ie: zero). If the panel + is unstored, we do nothing. (Notice that we don't even + increment p_begin.) */ \ +\ + continue; \ + } \ + else if ( bli_is_triangular( strucc ) && \ + bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \ + { \ + /* This case executes if the panel belongs to a triangular + matrix AND is diagonal-intersecting. 
Notice that we + cannot bury the following conditional logic into + packm_tri_cxk() because we need to know the value of + panel_len_max_i so we can properly increment p_inc. */ \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then the constraint that the + cache blocksizes be whole multiples of the register + blocksizes was somehow violated. */ \ + if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \ + ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + { \ + panel_off_i = 0; \ + panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ + panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \ + diagoffp_i = diagoffc_i; \ + } \ + else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + { \ + panel_off_i = bli_abs( diagoffc_i ); \ + panel_len_i = panel_len_full - panel_off_i; \ + panel_len_max_i = panel_len_max - panel_off_i; \ + diagoffp_i = 0; \ + } \ +\ + c_use = c_begin + (panel_off_i )*ldc; \ + p_use = p_begin; \ +\ + PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \ + diagoffp_i, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p ); \ +\ +\ + p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ +\ +/* + if ( cs_p == 1 ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +\ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + 
/* This case executes if the panel belongs to a Hermitian or + symmetric matrix, which includes stored, unstored, and + diagonal-intersecting panels. */ \ +\ + panel_len_i = panel_len_full; \ + panel_len_max_i = panel_len_max; \ +\ + PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \ + diagoffc_i, \ + uploc, \ + conjc, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ +\ + /* NOTE: This value is equivalent to ps_p. */ \ + p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ + } \ + else \ + { \ + /* This case executes if the panel is general, or, if the + panel is part of a triangular matrix and is neither unstored + (ie: zero) nor diagonal-intersecting. */ \ +\ + panel_len_i = panel_len_full; \ + panel_len_max_i = panel_len_max; \ +\ + PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \ + 0, \ + BLIS_DENSE, \ + conjc, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ +\ + /* NOTE: This value is equivalent to ps_p. */ \ + p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \ +\ +/* + if ( cs_p == 1 ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +\ + } \ +\ + p_begin += p_inc; \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_blk_var3 ) + diff --git a/frame/1m/packm/bli_packm_blk_var3.h b/frame/1m/packm/bli_packm_blk_var3.h new file mode 100644 index 000000000..6189d2415 --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var3.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void bli_packm_blk_var3( obj_t* c, + obj_t* p ); + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + bool_t invdiag, \ + bool_t revifup, \ + bool_t reviflo, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ + ); + +INSERT_GENTPROTCO_BASIC( packm_blk_var3 ) + diff --git a/frame/1m/packm/bli_packm_blk_var4.c b/frame/1m/packm/bli_packm_blk_var4.c new file mode 100644 index 000000000..2e83d9b1e --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var4.c @@ -0,0 +1,461 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_fp + +typedef void (*FUNCPTR_T)( + struc_t strucc, + doff_t diagoffc, + diag_t diagc, + uplo_t uploc, + trans_t transc, + bool_t invdiag, + bool_t revifup, + bool_t reviflo, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p + ); + +//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4); + + +void bli_packm_blk_var4( obj_t* c, + obj_t* p ) +{ + num_t dt_cp = bli_obj_datatype( *c ); + + struc_t strucc = bli_obj_struc( *c ); + doff_t diagoffc = bli_obj_diag_offset( *c ); + diag_t diagc = bli_obj_diag( *c ); + uplo_t uploc = bli_obj_uplo( *c ); + trans_t transc = bli_obj_conjtrans_status( *c ); + bool_t invdiag = bli_obj_has_inverted_diag( *p ); + bool_t revifup = bli_obj_is_pack_rev_if_upper( *p ); + bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p ); + + dim_t m_p = bli_obj_length( *p ); + dim_t n_p = bli_obj_width( *p ); + dim_t m_max_p = bli_obj_padded_length( *p ); + dim_t n_max_p = bli_obj_padded_width( *p ); + + void* buf_c = bli_obj_buffer_at_off( *c ); + inc_t rs_c = bli_obj_row_stride( *c ); + inc_t cs_c = bli_obj_col_stride( *c ); + + void* buf_p = bli_obj_buffer_at_off( *p ); + inc_t rs_p = bli_obj_row_stride( *p ); + inc_t cs_p = bli_obj_col_stride( *p ); + dim_t pd_p = bli_obj_panel_dim( *p ); + inc_t ps_p = bli_obj_panel_stride( *p ); + + obj_t kappa; + obj_t* kappa_p; + 
void* buf_kappa; + + FUNCPTR_T f; + + + // We want this variant to behave identically to that of variant 1 + // in the real domain. + if ( bli_is_real( dt_cp ) ) + { + bli_packm_blk_var1( c, p, &BLIS_SINGLE_THREADED ); + return; + } + + // The value for kappa we use will depend on whether the scalar + // attached to A has a nonzero imaginary component. If it does, + // then we will apply the scalar during packing to facilitate + // implementing complex domain micro-kernels in terms of their + // real domain counterparts. (In the aforementioned situation, + // applying a real scalar is easy, but applying a complex one is + // harder, so we avoid the need altogether with the code below.) + if ( bli_obj_scalar_has_nonzero_imag( p ) ) + { + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); + + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; + } + + + // Acquire the buffer to the kappa chosen above. + buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); + + + // Index into the type combination array to extract the correct + // function pointer. + //f = ftypes[dt_cp]; + if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var4; + else f = bli_zpackm_blk_var4; + + // Invoke the function. 
+ f( strucc, + diagoffc, + diagc, + uploc, + transc, + invdiag, + revifup, + reviflo, + m_p, + n_p, + m_max_p, + n_max_p, + buf_kappa, + buf_c, rs_c, cs_c, + buf_p, rs_p, cs_p, + pd_p, ps_p ); +} + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + bool_t invdiag, \ + bool_t revifup, \ + bool_t reviflo, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ + ) \ +{ \ + ctype* restrict kappa_cast = kappa; \ + ctype* restrict c_cast = c; \ + ctype* restrict p_cast = p; \ + ctype* restrict c_begin; \ + ctype* restrict p_begin; \ +\ + dim_t iter_dim; \ + dim_t num_iter; \ + dim_t it, ic, ip; \ + dim_t ic0, ip0; \ + doff_t ic_inc, ip_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ + dim_t panel_len_full; \ + dim_t panel_len_i; \ + dim_t panel_len_max; \ + dim_t panel_len_max_i; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + dim_t panel_off_i; \ + inc_t vs_c; \ + inc_t ldc; \ + inc_t ldp, p_inc; \ + dim_t* m_panel_full; \ + dim_t* n_panel_full; \ + dim_t* m_panel_use; \ + dim_t* n_panel_use; \ + dim_t* m_panel_max; \ + dim_t* n_panel_max; \ + conj_t conjc; \ +\ + ctype* restrict c_use; \ + ctype* restrict p_use; \ + doff_t diagoffp_i; \ +\ +\ + /* If C is zeros and part of a triangular matrix, then we don't need + to pack it. */ \ + if ( bli_is_zeros( uploc ) && \ + bli_is_triangular( strucc ) ) return; \ +\ + /* Extract the conjugation bit from the transposition argument. */ \ + conjc = bli_extract_conj( transc ); \ +\ + /* If c needs a transposition, induce it so that we can more simply + express the remaining parameters and code. 
*/ \ + if ( bli_does_trans( transc ) ) \ + { \ + bli_swap_incs( rs_c, cs_c ); \ + bli_negate_diag_offset( diagoffc ); \ + bli_toggle_uplo( uploc ); \ + bli_toggle_trans( transc ); \ + } \ +\ + /* If the strides of P indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panels. */ \ + iter_dim = n; \ + panel_len_full = m; \ + panel_len_max = m_max; \ + panel_dim_max = pd_p; \ + ldc = rs_c; \ + vs_c = cs_c; \ + diagoffc_inc = -( doff_t)panel_dim_max; \ + ldp = rs_p; \ + m_panel_full = &m; \ + n_panel_full = &panel_dim_i; \ + m_panel_use = &panel_len_i; \ + n_panel_use = &panel_dim_i; \ + m_panel_max = &panel_len_max_i; \ + n_panel_max = &panel_dim_max; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panels. */ \ + iter_dim = m; \ + panel_len_full = n; \ + panel_len_max = n_max; \ + panel_dim_max = pd_p; \ + ldc = cs_c; \ + vs_c = rs_c; \ + diagoffc_inc = ( doff_t )panel_dim_max; \ + ldp = cs_p; \ + m_panel_full = &panel_dim_i; \ + n_panel_full = &n; \ + m_panel_use = &panel_dim_i; \ + n_panel_use = &panel_len_i; \ + m_panel_max = &panel_dim_max; \ + n_panel_max = &panel_len_max_i; \ + } \ +\ + /* Compute the total number of iterations we'll need. */ \ + num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ +\ + /* Set the initial values and increments for indices related to C and P + based on whether reverse iteration was requested. 
*/ \ + if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \ + ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \ + { \ + ic0 = (num_iter - 1) * panel_dim_max; \ + ic_inc = -panel_dim_max; \ + ip0 = num_iter - 1; \ + ip_inc = -1; \ + } \ + else \ + { \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + ip0 = 0; \ + ip_inc = 1; \ + } \ +\ + p_begin = p_cast; \ +\ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ + { \ + panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ + c_begin = c_cast + (ic )*vs_c; \ +\ + if ( bli_is_triangular( strucc ) && \ + bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \ + { \ + /* This case executes if the panel belongs to a triangular + matrix AND is completely unstored (ie: zero). If the panel + is unstored, we do nothing. (Notice that we don't even + increment p_begin.) */ \ +\ + continue; \ + } \ + else if ( bli_is_triangular( strucc ) && \ + bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \ + { \ + /* This case executes if the panel belongs to a triangular + matrix AND is diagonal-intersecting. Notice that we + cannot bury the following conditional logic into + packm_tri_cxk() because we need to know the value of + panel_len_max_i so we can properly increment p_inc. */ \ +\ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \ + ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + { \ + panel_off_i = 0; \ + panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \ + panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \ + diagoffp_i = diagoffc_i; \ + } \ + else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + { \ + panel_off_i = bli_abs( diagoffc_i ); \ + panel_len_i = panel_len_full - panel_off_i; \ + panel_len_max_i = panel_len_max - panel_off_i; \ + diagoffp_i = 0; \ + } \ +\ + c_use = c_begin + (panel_off_i )*ldc; \ + p_use = p_begin; \ +\ + PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \ + diagoffp_i, \ + diagc, \ + uploc, \ + conjc, \ + invdiag, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_use, rs_c, cs_c, \ + p_use, rs_p, cs_p ); \ +\ + p_inc = ldp * panel_len_max_i; \ +\ +/* + if ( rs_p == 1 ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +/* + if ( cs_p == 1 ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +\ + } \ + else if ( bli_is_herm_or_symm( strucc ) ) \ + { \ + /* This case executes if the panel belongs to a Hermitian or + symmetric matrix, which includes stored, 
unstored, and + diagonal-intersecting panels. */ \ +\ + panel_len_i = panel_len_full; \ + panel_len_max_i = panel_len_max; \ +\ + PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \ + diagoffc_i, \ + uploc, \ + conjc, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ +\ + /* NOTE: This value is equivalent to ps_p. */ \ + p_inc = ldp * panel_len_max_i; \ + } \ + else \ + { \ + /* This case executes if the panel is general, or, if the + panel is part of a triangular matrix and is neither unstored + (ie: zero) nor diagonal-intersecting. */ \ +\ + panel_len_i = panel_len_full; \ + panel_len_max_i = panel_len_max; \ +\ + PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \ + 0, \ + BLIS_DENSE, \ + conjc, \ + *m_panel_use, \ + *n_panel_use, \ + *m_panel_max, \ + *n_panel_max, \ + kappa_cast, \ + c_begin, rs_c, cs_c, \ + p_begin, rs_p, cs_p ); \ +\ + /* NOTE: This value is equivalent to ps_p. */ \ + p_inc = ldp * panel_len_max_i; \ +\ +/* + if ( cs_p == 1 ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: bp_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +/* + if ( rs_p == 1 ) { \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_r", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_var4: ap_i", *m_panel_max, *n_panel_max, \ + ( ctype_r* )p_begin + p_inc, rs_p, cs_p, "%4.1f", "" ); \ + } \ +*/ \ +\ + } \ +\ + p_begin += p_inc; \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_blk_var4 ) + diff --git a/frame/1m/packm/bli_packm_blk_var4.h b/frame/1m/packm/bli_packm_blk_var4.h new file mode 100644 index 000000000..e13e5fe33 --- /dev/null +++ b/frame/1m/packm/bli_packm_blk_var4.h @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for 
developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void bli_packm_blk_var4( obj_t* c, + obj_t* p ); + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + bool_t invdiag, \ + bool_t revifup, \ + bool_t reviflo, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p \ + ); + +INSERT_GENTPROTCO_BASIC( packm_blk_var4 ) + diff --git a/frame/1m/packm/bli_packm_cxk.c b/frame/1m/packm/bli_packm_cxk.c index 4e2df3cbe..53074008f 100644 --- a/frame/1m/packm/bli_packm_cxk.c +++ b/frame/1m/packm/bli_packm_cxk.c @@ -39,7 +39,7 @@ typedef void (*FUNCPTR_T)( conj_t conja, dim_t n, - void* beta, + void* kappa, void* a, inc_t inca, inc_t lda, void* p, inc_t ldp ); @@ -47,119 +47,106 @@ typedef void (*FUNCPTR_T)( #undef FUNCPTR_ARRAY_LENGTH #define FUNCPTR_ARRAY_LENGTH 18 -#undef GENARRAY -#define GENARRAY( kername2, kername4, kername6, kername8, \ - kername10, kername12, kername14, kername16 ) \ -\ -static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = \ -{ \ - /* panel width = 0 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 1 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 2 */ \ - { \ - PASTEMAC(s,kername2), \ - PASTEMAC(c,kername2), \ - PASTEMAC(d,kername2), \ - PASTEMAC(z,kername2), \ - }, \ - /* panel width = 3 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 4 */ \ - { \ - PASTEMAC(s,kername4), \ - PASTEMAC(c,kername4), \ - PASTEMAC(d,kername4), \ - PASTEMAC(z,kername4), \ - }, \ - /* panel width = 5 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 6 */ \ - { \ - PASTEMAC(s,kername6), \ - PASTEMAC(c,kername6), \ - PASTEMAC(d,kername6), \ - PASTEMAC(z,kername6), \ - }, \ - /* panel width = 7 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 8 */ \ 
- { \ - PASTEMAC(s,kername8), \ - PASTEMAC(c,kername8), \ - PASTEMAC(d,kername8), \ - PASTEMAC(z,kername8), \ - }, \ - /* panel width = 9 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 10 */ \ - { \ - PASTEMAC(s,kername10), \ - PASTEMAC(c,kername10), \ - PASTEMAC(d,kername10), \ - PASTEMAC(z,kername10), \ - }, \ - /* panel width = 11 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 12 */ \ - { \ - PASTEMAC(s,kername12), \ - PASTEMAC(c,kername12), \ - PASTEMAC(d,kername12), \ - PASTEMAC(z,kername12), \ - }, \ - /* panel width = 13 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 14 */ \ - { \ - PASTEMAC(s,kername14), \ - PASTEMAC(c,kername14), \ - PASTEMAC(d,kername14), \ - PASTEMAC(z,kername14), \ - }, \ - /* panel width = 15 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 16 */ \ - { \ - PASTEMAC(s,kername16), \ - PASTEMAC(c,kername16), \ - PASTEMAC(d,kername16), \ - PASTEMAC(z,kername16), \ - }, \ - /* panel width = 17 */ \ - { \ - NULL, NULL, NULL, NULL, \ - } \ +static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 2 */ + { + BLIS_SPACKM_2XK_KERNEL, + BLIS_CPACKM_2XK_KERNEL, + BLIS_DPACKM_2XK_KERNEL, + BLIS_ZPACKM_2XK_KERNEL, + }, + /* panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 4 */ + { + BLIS_SPACKM_4XK_KERNEL, + BLIS_CPACKM_4XK_KERNEL, + BLIS_DPACKM_4XK_KERNEL, + BLIS_ZPACKM_4XK_KERNEL, + }, + /* panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 6 */ + { + BLIS_SPACKM_6XK_KERNEL, + BLIS_CPACKM_6XK_KERNEL, + BLIS_DPACKM_6XK_KERNEL, + BLIS_ZPACKM_6XK_KERNEL, + }, + /* panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 8 */ + { + BLIS_SPACKM_8XK_KERNEL, + BLIS_CPACKM_8XK_KERNEL, + BLIS_DPACKM_8XK_KERNEL, + BLIS_ZPACKM_8XK_KERNEL, + }, + /* panel width = 9 */ + { + NULL, 
NULL, NULL, NULL, + }, + /* panel width = 10 */ + { + BLIS_SPACKM_10XK_KERNEL, + BLIS_CPACKM_10XK_KERNEL, + BLIS_DPACKM_10XK_KERNEL, + BLIS_ZPACKM_10XK_KERNEL, + }, + /* panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 12 */ + { + BLIS_SPACKM_12XK_KERNEL, + BLIS_CPACKM_12XK_KERNEL, + BLIS_DPACKM_12XK_KERNEL, + BLIS_ZPACKM_12XK_KERNEL, + }, + /* panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 14 */ + { + BLIS_SPACKM_14XK_KERNEL, + BLIS_CPACKM_14XK_KERNEL, + BLIS_DPACKM_14XK_KERNEL, + BLIS_ZPACKM_14XK_KERNEL, + }, + /* panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 16 */ + { + BLIS_SPACKM_16XK_KERNEL, + BLIS_CPACKM_16XK_KERNEL, + BLIS_DPACKM_16XK_KERNEL, + BLIS_ZPACKM_16XK_KERNEL, + }, + /* panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, }; -GENARRAY( PACKM_2XK_KERNEL, - PACKM_4XK_KERNEL, - PACKM_6XK_KERNEL, - PACKM_8XK_KERNEL, - PACKM_10XK_KERNEL, - PACKM_12XK_KERNEL, - PACKM_14XK_KERNEL, - PACKM_16XK_KERNEL ) - #undef GENTFUNC @@ -169,7 +156,7 @@ void PASTEMAC(ch,opname)( \ conj_t conja, \ dim_t m, \ dim_t n, \ - void* beta, \ + void* kappa, \ void* a, inc_t inca, inc_t lda, \ void* p, inc_t ldp \ ) \ @@ -184,7 +171,7 @@ void PASTEMAC(ch,opname)( \ { \ PASTEMAC3(ch,ch,ch,scal2vker)( conja, \ n, \ - beta, \ + kappa, \ a, lda, \ p, ldp ); \ return; \ @@ -209,7 +196,7 @@ void PASTEMAC(ch,opname)( \ { \ f( conja, \ n, \ - beta, \ + kappa, \ a, inca, lda, \ p, ldp ); \ } \ @@ -222,7 +209,7 @@ void PASTEMAC(ch,opname)( \ conja, \ m, \ n, \ - beta, \ + kappa, \ a, inca, lda, \ p, 1, ldp ); \ } \ diff --git a/frame/1m/packm/bli_packm_cxk_ri.c b/frame/1m/packm/bli_packm_cxk_ri.c new file mode 100644 index 000000000..fecac4fc6 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_ri.c @@ -0,0 +1,253 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_cxk_fp + +typedef void (*FUNCPTR_T)( + conj_t conja, + dim_t n, + void* kappa, + void* a, inc_t inca, inc_t lda, + void* p, inc_t psp, inc_t ldp + ); + +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 18 + +static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 2 */ + { + NULL, + BLIS_CPACKM_2XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_2XK_RI_KERNEL, + }, + /* panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 4 */ + { + NULL, + BLIS_CPACKM_4XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_4XK_RI_KERNEL, + }, + /* panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 6 */ + { + NULL, + BLIS_CPACKM_6XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_6XK_RI_KERNEL, + }, + /* panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 8 */ + { + NULL, + BLIS_CPACKM_8XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_8XK_RI_KERNEL, + }, + /* panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 10 */ + { + NULL, + BLIS_CPACKM_10XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_10XK_RI_KERNEL, + }, + /* panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 12 */ + { + NULL, + BLIS_CPACKM_12XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_12XK_RI_KERNEL, + }, + /* panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 14 */ + { + NULL, + BLIS_CPACKM_14XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_14XK_RI_KERNEL, + }, + /* panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 16 */ + { + NULL, + BLIS_CPACKM_16XK_RI_KERNEL, + NULL, + BLIS_ZPACKM_16XK_RI_KERNEL, + }, + /* panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, +}; + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t m, \ + dim_t n, \ + void* kappa, \ + void* a, 
inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t i, j; \ + num_t dt; \ + FUNCPTR_T f; \ +\ + /* The panel dimension is always equal to the m dimension of p. */ \ + panel_dim = m; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes[panel_dim][dt]; \ + else f = NULL; \ +\ + /* If there exists a kernel implementation for the panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. */ \ + if ( f != NULL ) \ + { \ + f( conja, \ + n, \ + kappa, \ + a, inca, lda, \ + p, psp, ldp ); \ + } \ + else \ + { \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + 1; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + ctype_r* restrict p_i = ( ctype_r* )p + psp; \ + dim_t inca2 = 2*inca; \ + dim_t lda2 = 2*lda; \ +\ + /* Treat the panel as m x n and column-stored (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage. 
*/ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + { \ + for ( i = 0; i < m; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal2jris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + { \ + for ( i = 0; i < m; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal2ris)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_cxk_ri ) + diff --git a/kernels/x86_64/piledriver/3/bli_gemm_4x6.h b/frame/1m/packm/bli_packm_cxk_ri.h similarity index 70% rename from kernels/x86_64/piledriver/3/bli_gemm_4x6.h rename to frame/1m/packm/bli_packm_cxk_ri.h index bf7a9cae9..57d9afd15 100644 --- a/kernels/x86_64/piledriver/3/bli_gemm_4x6.h +++ b/frame/1m/packm/bli_packm_cxk_ri.h @@ -32,21 +32,28 @@ */ -#include "blis.h" +// Include headers for various packm kernels. 
+#include "bli_packm_ref_2xk.h" +#include "bli_packm_ref_4xk.h" +#include "bli_packm_ref_6xk.h" +#include "bli_packm_ref_8xk.h" +#include "bli_packm_ref_10xk.h" +#include "bli_packm_ref_12xk.h" +#include "bli_packm_ref_14xk.h" +#include "bli_packm_ref_16xk.h" -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ + conj_t conja, \ + dim_t m, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROT_BASIC( gemm_4x6 ) +INSERT_GENTPROTCO_BASIC( packm_cxk_ri ) diff --git a/frame/1m/packm/bli_packm_cxk_ri3.c b/frame/1m/packm/bli_packm_cxk_ri3.c new file mode 100644 index 000000000..022b26454 --- /dev/null +++ b/frame/1m/packm/bli_packm_cxk_ri3.c @@ -0,0 +1,257 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T packm_cxk_fp + +typedef void (*FUNCPTR_T)( + conj_t conja, + dim_t n, + void* kappa, + void* a, inc_t inca, inc_t lda, + void* p, inc_t psp, inc_t ldp + ); + +#undef FUNCPTR_ARRAY_LENGTH +#define FUNCPTR_ARRAY_LENGTH 18 + +static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 2 */ + { + NULL, + BLIS_CPACKM_2XK_RI3_KERNEL, + NULL, + BLIS_ZPACKM_2XK_RI3_KERNEL, + }, + /* panel width = 3 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 4 */ + { + NULL, + BLIS_CPACKM_4XK_RI3_KERNEL, + NULL, + BLIS_ZPACKM_4XK_RI3_KERNEL, + }, + /* panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 6 */ + { + NULL, + BLIS_CPACKM_6XK_RI3_KERNEL, + NULL, + BLIS_ZPACKM_6XK_RI3_KERNEL, + }, + /* panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 8 */ + { + NULL, + BLIS_CPACKM_8XK_RI3_KERNEL, + NULL, + BLIS_ZPACKM_8XK_RI3_KERNEL, + }, + /* panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 10 */ + { + NULL, + BLIS_CPACKM_10XK_RI3_KERNEL, + NULL, + 
BLIS_ZPACKM_10XK_RI3_KERNEL, + }, + /* panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 12 */ + { + NULL, + BLIS_CPACKM_12XK_RI3_KERNEL, + NULL, + BLIS_ZPACKM_12XK_RI3_KERNEL, + }, + /* panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 14 */ + { + NULL, + BLIS_CPACKM_14XK_RI3_KERNEL, + NULL, + BLIS_ZPACKM_14XK_RI3_KERNEL, + }, + /* panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 16 */ + { + NULL, + BLIS_CPACKM_16XK_RI3_KERNEL, + NULL, + BLIS_ZPACKM_16XK_RI3_KERNEL, + }, + /* panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, +}; + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, scal2vker ) \ +\ +void PASTEMAC(ch,opname)( \ + conj_t conja, \ + dim_t m, \ + dim_t n, \ + void* kappa, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + dim_t panel_dim; \ + dim_t i, j; \ + num_t dt; \ + FUNCPTR_T f; \ +\ + /* The panel dimension is always equal to the m dimension of p. */ \ + panel_dim = m; \ +\ + /* Acquire the datatype for the current function. */ \ + dt = PASTEMAC(ch,type); \ +\ + /* Index into the array to extract the correct function pointer. + If the panel dimension is too big to be within the array of + explicitly handled kernels, then we treat that kernel the same + as if it were in range but unimplemented. */ \ + if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes[panel_dim][dt]; \ + else f = NULL; \ +\ + /* If there exists a kernel implementation for the panel dimension + provided, we invoke the implementation. Otherwise, we use scal2m. 
*/ \ + if ( f != NULL ) \ + { \ + f( conja, \ + n, \ + kappa, \ + a, inca, lda, \ + p, psp, ldp ); \ + } \ + else \ + { \ + ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ + ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + 1; \ + ctype_r* restrict p_r = ( ctype_r* )p; \ + ctype_r* restrict p_i = ( ctype_r* )p + psp; \ + ctype_r* restrict p_ri = ( ctype_r* )p + 2*psp; \ + dim_t inca2 = 2*inca; \ + dim_t lda2 = 2*lda; \ +\ + /* Treat the panel as m x n and column-stored (unit row stride). */ \ +\ + /* NOTE: The loops below are inlined versions of scal2m, but + for separated real/imaginary storage. */ \ +\ + if ( bli_is_conj( conja ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + { \ + for ( i = 0; i < m; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal2jri3s)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i, \ + *pi11_ri ); \ + } \ + } \ + } \ + else /* if ( bli_is_noconj( conja ) ) */ \ + { \ + for ( j = 0; j < n; ++j ) \ + { \ + for ( i = 0; i < m; ++i ) \ + { \ + ctype_r* restrict alpha11_r = a_r + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict alpha11_i = a_i + (i )*inca2 + (j )*lda2; \ + ctype_r* restrict pi11_r = p_r + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_i = p_i + (i )*1 + (j )*ldp; \ + ctype_r* restrict pi11_ri = p_ri + (i )*1 + (j )*ldp; \ +\ + PASTEMAC(ch,scal2ri3s)( *kappa_r, \ + *kappa_i, \ + *alpha11_r, \ + *alpha11_i, \ + *pi11_r, \ + *pi11_i, \ + *pi11_ri ); \ + } \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( packm_cxk_ri3, SCAL2V_KERNEL ) + diff --git a/config/template/kernels/3/bli_gemm_opt_mxn.h b/frame/1m/packm/bli_packm_cxk_ri3.h 
similarity index 70% rename from config/template/kernels/3/bli_gemm_opt_mxn.h rename to frame/1m/packm/bli_packm_cxk_ri3.h index 8b0094ad6..0c782252f 100644 --- a/config/template/kernels/3/bli_gemm_opt_mxn.h +++ b/frame/1m/packm/bli_packm_cxk_ri3.h @@ -32,22 +32,28 @@ */ +// Include headers for various packm kernels. +#include "bli_packm_ref_2xk.h" +#include "bli_packm_ref_4xk.h" +#include "bli_packm_ref_6xk.h" +#include "bli_packm_ref_8xk.h" +#include "bli_packm_ref_10xk.h" +#include "bli_packm_ref_12xk.h" +#include "bli_packm_ref_14xk.h" +#include "bli_packm_ref_16xk.h" -// -// Prototype micro-kernel interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1, \ - ctype* restrict b1, \ - ctype* restrict beta, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ + conj_t conja, \ + dim_t m, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ ); -INSERT_GENTPROT_BASIC( gemm_opt_mxn ) +INSERT_GENTPROTCO_BASIC( packm_cxk_ri3 ) diff --git a/frame/1m/packm/bli_packm_gen_cxk.c b/frame/1m/packm/bli_packm_gen_cxk.c index fc170b967..27a7da2a2 100644 --- a/frame/1m/packm/bli_packm_gen_cxk.c +++ b/frame/1m/packm/bli_packm_gen_cxk.c @@ -105,13 +105,13 @@ void PASTEMAC(ch,varname)( \ dim_t n_edge = n_panel_max; \ ctype* p_edge = p + (i )*rs_p; \ \ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ } \ \ if ( n_panel != n_panel_max ) \ @@ -121,16 +121,281 @@ void PASTEMAC(ch,varname)( \ dim_t n_edge = n_panel_max - j; \ ctype* p_edge = p + (j )*cs_p; \ \ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - 
BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ } \ -\ } INSERT_GENTFUNC_BASIC0( packm_gen_cxk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t psp, ldp; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the panel stride (ie: the element offset to the imaginary + panel). */ \ + psp = ldp * panel_len_max; \ +\ +\ + /* Pack the panel. */ \ + PASTEMAC(ch,packm_cxk_ri)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). 
When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. */ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_ri ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t psp, ldp; \ +\ +\ + /* If the strides of p indicate row storage, then we 
are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + } \ +\ + /* Compute the panel stride (ie: the element offset to the imaginary + panel). */ \ + psp = ldp * panel_len_max; \ +\ +\ + /* Pack the panel. */ \ + PASTEMAC(ch,packm_cxk_ri3)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ + ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_ri, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ + ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_ri, rs_p, cs_p ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_ri3 ) + diff --git a/frame/1m/packm/bli_packm_gen_cxk.h b/frame/1m/packm/bli_packm_gen_cxk.h index 8e610c37e..1c0ba8398 100644 --- a/frame/1m/packm/bli_packm_gen_cxk.h +++ b/frame/1m/packm/bli_packm_gen_cxk.h @@ -51,3 +51,25 @@ void PASTEMAC(ch,varname)( \ INSERT_GENTPROT_BASIC( packm_gen_cxk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ 
+ dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROTCO_BASIC( packm_gen_cxk_ri ) + +INSERT_GENTPROTCO_BASIC( packm_gen_cxk_ri3 ) diff --git a/frame/1m/packm/bli_packm_herm_cxk.c b/frame/1m/packm/bli_packm_herm_cxk.c index a6ed0506f..3a82eeedb 100644 --- a/frame/1m/packm/bli_packm_herm_cxk.c +++ b/frame/1m/packm/bli_packm_herm_cxk.c @@ -87,8 +87,8 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ { \ /* Prepare to pack to row-stored column panel. */ \ - panel_len = m_panel; \ panel_dim = n_panel; \ + panel_len = m_panel; \ incc = cs_c; \ ldc = rs_c; \ ldp = rs_p; \ @@ -98,8 +98,8 @@ void PASTEMAC(ch,varname)( \ else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ { \ /* Prepare to pack to column-stored row panel. */ \ - panel_len = n_panel; \ panel_dim = m_panel; \ + panel_len = n_panel; \ incc = rs_c; \ ldc = cs_c; \ ldp = cs_p; \ @@ -167,12 +167,6 @@ void PASTEMAC(ch,varname)( \ incc12 = ldc; \ ldc12 = incc; \ conjc12 = conjc; \ -\ - p11_m = panel_dim; \ - p11_n = panel_dim; \ - j = diagoffc_abs; \ - p11 = p + (j )*ldp; \ - c11 = c + (j )*ldc; \ \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( conjc12 ); \ @@ -199,19 +193,13 @@ void PASTEMAC(ch,varname)( \ incc12 = incc; \ ldc12 = ldc; \ conjc12 = conjc; \ -\ - p11_m = panel_dim; \ - p11_n = panel_dim; \ - j = diagoffc_abs; \ - p11 = p + (j )*ldp; \ - c11 = c + (j )*ldc; \ \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( conjc10 ); \ } \ \ - /* Pack to P10. For upper storage, this includes the unstored - triangle of C11. */ \ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ PASTEMAC(ch,packm_cxk)( conjc10, \ p10_dim, \ p10_len, \ @@ -219,8 +207,8 @@ void PASTEMAC(ch,varname)( \ c10, incc10, ldc10, \ p10, ldp ); \ \ - /* Pack to P12. For lower storage, this includes the unstored - triangle of C11. */ \ + /* Pack to p12. 
For lower storage, this includes the unstored + triangle of c11. */ \ PASTEMAC(ch,packm_cxk)( conjc12, \ p12_dim, \ p12_len, \ @@ -228,29 +216,37 @@ void PASTEMAC(ch,varname)( \ c12, incc12, ldc12, \ p12, ldp ); \ \ - /* Pack the stored triangule of C11 to P11. */ \ - PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - conjc, \ - p11_m, \ - p11_n, \ - kappa, \ - c11, rs_c, cs_c, \ - p11, rs_p11, cs_p11 ); \ -\ - /* If source matrix C is Hermitian, we have to zero out the - imaginary components of the diagonal of P11 in case the - corresponding elements in C11 were not already zero. */ \ - if ( bli_is_hermitian( strucc ) ) \ + /* Pack the stored triangle of c11 to p11. */ \ { \ - /* NOTE: We can directly increment p11 since we are done - using p11 for the remainder of the function. */ \ - for ( i = 0; i < p11_m; ++i ) \ - { \ - PASTEMAC(ch,seti0s)( *p11 ); \ + p11_m = panel_dim; \ + p11_n = panel_dim; \ + j = diagoffc_abs; \ + p11 = p + (j )*ldp; \ + c11 = c + (j )*ldc; \ \ - p11 += rs_p11 + cs_p11; \ + PASTEMAC(ch,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + conjc, \ + p11_m, \ + p11_n, \ + kappa, \ + c11, rs_c, cs_c, \ + p11, rs_p11, cs_p11 ); \ +\ + /* If source matrix c is Hermitian, we have to zero out the + imaginary components of the diagonal of p11 in case the + corresponding elements in c11 were not already zero. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + /* NOTE: We can directly increment p11 since we are done + using p11 for the remainder of the function. 
*/ \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + PASTEMAC(ch,seti0s)( *p11 ); \ +\ + p11 += rs_p11 + cs_p11; \ + } \ } \ } \ } \ @@ -269,13 +265,13 @@ void PASTEMAC(ch,varname)( \ dim_t n_edge = n_panel_max; \ ctype* p_edge = p + (i )*rs_p; \ \ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ } \ \ if ( n_panel != n_panel_max ) \ @@ -285,16 +281,721 @@ void PASTEMAC(ch,varname)( \ dim_t n_edge = n_panel_max - j; \ ctype* p_edge = p + (j )*cs_p; \ \ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ } \ -\ } INSERT_GENTFUNC_BASIC0( packm_herm_cxk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype_r* restrict p_r = ( ctype_r* )p; \ +\ + dim_t i, j; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + doff_t diagoffc_abs; \ + dim_t panel_dim; \ + inc_t incc, ldc; \ + inc_t psp, ldp; \ +\ + ctype* restrict c10; \ + ctype_r* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* restrict c12; \ + ctype_r* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + 
doff_t diagoffc12; \ + conj_t conjc12; \ +\ + inc_t rs_p11, cs_p11; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + rs_p11 = rs_p; \ + cs_p11 = 1; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + rs_p11 = 1; \ + cs_p11 = cs_p; \ + } \ +\ + /* Compute the panel stride (ie: the element offset to the imaginary + panel). */ \ + psp = ldp * panel_len_max; \ +\ +\ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,packm_cxk_ri)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. 
If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. */ \ + if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \ + ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p_r; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p_r; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,packm_cxk_ri)( conjc10, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, psp, ldp ); \ +\ + /* Pack to p12. 
For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,packm_cxk_ri)( conjc12, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, psp, ldp ); \ +\ + /* Pack the stored triangle of c11 to p11. */ \ + { \ + dim_t p11_m = panel_dim; \ + dim_t p11_n = panel_dim; \ + inc_t rs_c11 = 2*rs_c; \ + inc_t cs_c11 = 2*cs_c; \ + dim_t j = diagoffc_abs; \ + ctype* c11 = ( ctype* )c + (j )*ldc; \ + ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \ + ctype_r* c11_r = ( ctype_r* )c11; \ + ctype_r* c11_i = ( ctype_r* )c11 + 1; \ + ctype_r* p11_r = ( ctype_r* )p11; \ + ctype_r* p11_i = ( ctype_r* )p11 + psp; \ + ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \ + ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \ + ctype_r* alpha_r = one_r; \ + ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \ +\ + /* Copy the real part of the stored triangle of c11 to p11_r. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_r, \ + c11_r, rs_c11, cs_c11, \ + p11_r, rs_p11, cs_p11 ); \ +\ + /* Copy the imaginary part of the stored triangle of c11 to p11_i, + scaling by -1 if conjugation on c was requested. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_i, \ + c11_i, rs_c11, cs_c11, \ + p11_i, rs_p11, cs_p11 ); \ +\ + /* If source matrix c is Hermitian, we have to zero out the + imaginary components of the diagonal of p11 in case the + corresponding elements in c11 were not already zero. */ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \ +\ + PASTEMAC(chr,set0s)( *pi11_i ); \ + } \ + } \ +\ + /* Apply kappa to the part of p11 that corresponds to the stored + part of c11 that was copied above. 
*/ \ + if ( bli_is_upper( uploc ) ) \ + { \ + PASTEMAC(ch,scalris_mxn_u)( 0, \ + p11_m, \ + p11_n, \ + kappa_r, \ + kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ + else \ + { \ + PASTEMAC(ch,scalris_mxn_l)( 0, \ + p11_m, \ + p11_n, \ + kappa_r, \ + kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ +/* + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ + p_r + 0*psp, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ + p_r + 1*psp, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ + } \ + } \ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_ri ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype_r* restrict p_r = ( ctype_r* )p; \ +\ + dim_t i, j; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + doff_t diagoffc_abs; \ + dim_t panel_dim; \ + inc_t incc, ldc; \ + inc_t psp, ldp; \ +\ + ctype* restrict c10; \ + ctype_r* restrict p10; \ + dim_t p10_dim, p10_len; \ + inc_t incc10, ldc10; \ + doff_t diagoffc10; \ + conj_t conjc10; \ +\ + ctype* 
restrict c12; \ + ctype_r* restrict p12; \ + dim_t p12_dim, p12_len; \ + inc_t incc12, ldc12; \ + doff_t diagoffc12; \ + conj_t conjc12; \ +\ + inc_t rs_p11, cs_p11; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + rs_p11 = rs_p; \ + cs_p11 = 1; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + rs_p11 = 1; \ + cs_p11 = cs_p; \ + } \ +\ + /* Compute the panel stride (ie: the element offset to the imaginary + panel). */ \ + psp = ldp * panel_len_max; \ +\ +\ + if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \ + { \ + /* If the current panel is unstored, we need to make a few + adjustments so we refer to the data where it is actually + stored, also taking conjugation into account. (Note this + implicitly assumes we are operating on a dense panel + within a larger symmetric or Hermitian matrix, since a + general matrix would not contain any unstored region.) */ \ + if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \ + { \ + c = c + diagoffc * ( doff_t )cs_c + \ + -diagoffc * ( doff_t )rs_c; \ + bli_swap_incs( incc, ldc ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc ); \ + } \ +\ + /* Pack the full panel. */ \ + PASTEMAC(ch,packm_cxk_ri3)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ + } \ + else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \ + { \ + /* Sanity check. 
Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. */ \ + if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \ + ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ + diagoffc_abs = bli_abs( diagoffc ); \ +\ + if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs; \ + p10 = p_r; \ + c10 = c; \ + incc10 = incc; \ + ldc10 = ldc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + diagoffc12 = diagoffc_abs - j; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + c12 = c12 + diagoffc12 * ( doff_t )cs_c + \ + -diagoffc12 * ( doff_t )rs_c; \ + incc12 = ldc; \ + ldc12 = incc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc12 ); \ + } \ + else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \ + ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \ + { \ + p10_dim = panel_dim; \ + p10_len = diagoffc_abs + panel_dim; \ + diagoffc10 = diagoffc; \ + p10 = p_r; \ + c10 = c; \ + c10 = c10 + diagoffc10 * ( doff_t )cs_c + \ + -diagoffc10 * ( doff_t )rs_c; \ + incc10 = ldc; \ + ldc10 = incc; \ + conjc10 = conjc; \ +\ + p12_dim = panel_dim; \ + p12_len = panel_len - p10_len; \ + j = p10_len; \ + p12 = p_r + (j )*ldp; \ + c12 = c + (j )*ldc; \ + incc12 = incc; \ + ldc12 = ldc; \ + conjc12 = conjc; \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( conjc10 ); \ + } \ +\ + /* Pack to p10. For upper storage, this includes the unstored + triangle of c11. 
*/ \ + PASTEMAC(ch,packm_cxk_ri3)( conjc10, \ + p10_dim, \ + p10_len, \ + kappa, \ + c10, incc10, ldc10, \ + p10, psp, ldp ); \ +\ + /* Pack to p12. For lower storage, this includes the unstored + triangle of c11. */ \ + PASTEMAC(ch,packm_cxk_ri3)( conjc12, \ + p12_dim, \ + p12_len, \ + kappa, \ + c12, incc12, ldc12, \ + p12, psp, ldp ); \ +\ + /* Pack the stored triangle of c11 to p11. */ \ + { \ + dim_t p11_m = panel_dim; \ + dim_t p11_n = panel_dim; \ + inc_t rs_c11 = 2*rs_c; \ + inc_t cs_c11 = 2*cs_c; \ + dim_t j = diagoffc_abs; \ + ctype* c11 = ( ctype* )c + (j )*ldc; \ + ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \ + ctype_r* c11_r = ( ctype_r* )c11; \ + ctype_r* c11_i = ( ctype_r* )c11 + 1; \ + ctype_r* p11_r = ( ctype_r* )p11; \ + ctype_r* p11_i = ( ctype_r* )p11 + psp; \ + ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \ + ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \ + ctype_r* alpha_r = one_r; \ + ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \ +\ + /* Copy the real part of the stored triangle of c11 to p11_r. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_r, \ + c11_r, rs_c11, cs_c11, \ + p11_r, rs_p11, cs_p11 ); \ +\ + /* Copy the imaginary part of the stored triangle of c11 to p11_i, + scaling by -1 if conjugation on c was requested. */ \ + PASTEMAC(chr,scal2m)( 0, \ + BLIS_NONUNIT_DIAG, \ + uploc, \ + BLIS_NO_TRANSPOSE, \ + p11_m, \ + p11_n, \ + alpha_i, \ + c11_i, rs_c11, cs_c11, \ + p11_i, rs_p11, cs_p11 ); \ +\ + /* If source matrix c is Hermitian, we have to zero out the + imaginary components of the diagonal of p11 in case the + corresponding elements in c11 were not already zero. 
*/ \ + if ( bli_is_hermitian( strucc ) ) \ + { \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \ +\ + PASTEMAC(chr,set0s)( *pi11_i ); \ + } \ + } \ +\ + /* Apply kappa to the part of p11 that corresponds to the stored + part of c11 that was copied above. */ \ + if ( bli_is_upper( uploc ) ) \ + { \ + PASTEMAC(ch,scalris_mxn_u)( 0, \ + p11_m, \ + p11_n, \ + kappa_r, \ + kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ + else \ + { \ + PASTEMAC(ch,scalris_mxn_l)( 0, \ + p11_m, \ + p11_n, \ + kappa_r, \ + kappa_i, \ + p11_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ +\ + /* Update the p11 section of the ri panel. It simply needs + to contain the sum of p11_r + p11_i. */ \ + { \ + ctype_r* p11_ri = p11_i + psp; \ +\ + for ( j = 0; j < p11_n; ++j ) \ + for ( i = 0; i < p11_m; ++i ) \ + { \ + ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \ + ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \ + ctype_r* pi11_ri = p11_ri + (i )*rs_p11 + (j )*cs_p11; \ +\ + PASTEMAC(chr,add3s)( *pi11_r, \ + *pi11_i, \ + *pi11_ri ); \ + } \ + } \ +/* + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \ + p_r + 0*psp, rs_p, cs_p, "%4.1f", "" ); \ + PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \ + p_r + 1*psp, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ + } \ + } \ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ + ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_ri, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ + ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_ri, rs_p, cs_p ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_ri3 ) + diff --git a/frame/1m/packm/bli_packm_herm_cxk.h b/frame/1m/packm/bli_packm_herm_cxk.h index 1d1a43d2a..049b57c1b 100644 --- a/frame/1m/packm/bli_packm_herm_cxk.h +++ b/frame/1m/packm/bli_packm_herm_cxk.h @@ -51,3 +51,25 @@ void PASTEMAC(ch,varname)( \ INSERT_GENTPROT_BASIC( packm_herm_cxk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffc, \ + uplo_t uploc, \ + conj_t conjc, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t 
m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_ri ) + +INSERT_GENTPROTCO_BASIC( packm_herm_cxk_ri3 ) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 8441fe6fd..eac2a7a8e 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -303,7 +303,9 @@ void bli_packm_init_pack( bool_t densify, // Compute the size of the packed buffer. size_p = cs_p * n_p_pad * elem_size_p; } - else if ( pack_schema == BLIS_PACKED_ROW_PANELS ) + else if ( pack_schema == BLIS_PACKED_ROW_PANELS || + pack_schema == BLIS_PACKED_ROW_PANELS_4M || + pack_schema == BLIS_PACKED_ROW_PANELS_3M ) { dim_t m_panel; dim_t ps_p; @@ -331,11 +333,8 @@ void bli_packm_init_pack( bool_t densify, // dimension of the matrix is not a whole multiple of MR. ps_p = cs_p * n_p_pad; - // Align the panel dimension according to the contiguous memory - // stride alignment size so that the second, third, etc panels begin - // at aligned addresses. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, - BLIS_CONTIG_STRIDE_ALIGN_SIZE ); + if ( pack_schema == BLIS_PACKED_ROW_PANELS_3M ) + ps_p = ( ps_p * 3 ) / 2; // Store the strides and panel dimension in p. bli_obj_set_incs( rs_p, cs_p, *p ); @@ -345,7 +344,9 @@ void bli_packm_init_pack( bool_t densify, // Compute the size of the packed buffer. size_p = ps_p * (m_p_pad / m_panel) * elem_size_p; } - else if ( pack_schema == BLIS_PACKED_COL_PANELS ) + else if ( pack_schema == BLIS_PACKED_COL_PANELS || + pack_schema == BLIS_PACKED_COL_PANELS_4M || + pack_schema == BLIS_PACKED_COL_PANELS_3M ) { dim_t n_panel; dim_t ps_p; @@ -373,11 +374,8 @@ void bli_packm_init_pack( bool_t densify, // dimension of the matrix is not a whole multiple of NR. 
ps_p = m_p_pad * rs_p; - // Align the panel dimension according to the contiguous memory - // stride alignment size so that the second, third, etc panels begin - // at aligned addresses. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, - BLIS_CONTIG_STRIDE_ALIGN_SIZE ); + if ( pack_schema == BLIS_PACKED_COL_PANELS_3M ) + ps_p = ( ps_p * 3 ) / 2; // Store the strides and panel dimension in p. bli_obj_set_incs( rs_p, cs_p, *p ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 3d9adc203..45d12be1a 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -45,8 +45,8 @@ static FUNCPTR_T vars[6][3] = // unblocked optimized unblocked blocked { bli_packm_unb_var1, NULL, bli_packm_blk_var1 }, { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, + { NULL, NULL, bli_packm_blk_var3 }, + { NULL, NULL, bli_packm_blk_var4 }, { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, }; diff --git a/frame/1m/packm/bli_packm_tri_cxk.c b/frame/1m/packm/bli_packm_tri_cxk.c index 128b24461..7de7f2717 100644 --- a/frame/1m/packm/bli_packm_tri_cxk.c +++ b/frame/1m/packm/bli_packm_tri_cxk.c @@ -82,6 +82,7 @@ void PASTEMAC(ch,varname)( \ ldc = cs_c; \ ldp = cs_p; \ } \ +\ \ /* Pack the panel. */ \ PASTEMAC(ch,packm_cxk)( conjc, \ @@ -91,44 +92,48 @@ void PASTEMAC(ch,varname)( \ c, incc, ldc, \ p, ldp ); \ \ - /* If the diagonal of C is implicitly unit, set the diagonal of - the packed panel to unit. */ \ +\ + /* If the diagonal of c is implicitly unit, explicitly set the + the diagonal of the packed panel to kappa. */ \ if ( bli_is_unit_diag( diagc ) ) \ { \ - PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \ - m_panel, \ - n_panel, \ - kappa, \ - p, rs_p, cs_p ); \ + PASTEMAC(ch,setd)( diagoffp, \ + m_panel, \ + n_panel, \ + kappa, \ + p, rs_p, cs_p ); \ } \ \ /* If requested, invert the diagonal of the packed panel. 
*/ \ if ( invdiag == TRUE ) \ { \ - PASTEMAC(ch,invertd_unb_var1)( diagoffp, \ - m_panel, \ - n_panel, \ - p, rs_p, cs_p ); \ + PASTEMAC(ch,invertd)( diagoffp, \ + m_panel, \ + n_panel, \ + p, rs_p, cs_p ); \ } \ \ - /* Set the region opposite the diagonal of P to zero. To do this, + /* Set the region opposite the diagonal of p to zero. To do this, we need to reference the "unstored" region on the other side of the diagonal. This amounts to toggling uploc and then shifting the diagonal offset to shrink the newly referenced region (by - one diagonal). */ \ + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel.*/ \ { \ uplo_t uplop = uploc; \ \ bli_toggle_uplo( uplop ); \ bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \ \ - PASTEMAC2(ch,ch,setm_unb_var1)( diagoffp, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - m_panel, \ - n_panel, \ - zero, \ - p, rs_p, cs_p ); \ + PASTEMAC(ch,setm)( diagoffp, \ + BLIS_NONUNIT_DIAG, \ + uplop, \ + m_panel, \ + n_panel, \ + zero, \ + p, rs_p, cs_p ); \ } \ \ /* The packed memory region was acquired/allocated with "aligned" @@ -145,13 +150,13 @@ void PASTEMAC(ch,varname)( \ dim_t n_edge = n_panel_max; \ ctype* p_edge = p + (i )*rs_p; \ \ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ } \ \ if ( n_panel != n_panel_max ) \ @@ -161,18 +166,23 @@ void PASTEMAC(ch,varname)( \ dim_t n_edge = n_panel_max - j; \ ctype* p_edge = p + (j )*cs_p; \ \ - PASTEMAC2(ch,ch,setm_unb_var1)( 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m_edge, \ - n_edge, \ - zero, \ - p_edge, rs_p, cs_p ); \ + PASTEMAC(ch,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + 
n_edge, \ + zero, \ + p_edge, rs_p, cs_p ); \ } \ \ /* If this panel is an edge case in both panel dimension and length, then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. */ \ + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. */ \ if ( m_panel != m_panel_max && \ n_panel != n_panel_max ) \ { \ @@ -181,27 +191,530 @@ void PASTEMAC(ch,varname)( \ dim_t m_br = m_panel_max - i; \ dim_t n_br = n_panel_max - j; \ ctype* one = PASTEMAC(ch,1); \ - ctype* p_edge = p + (i )*rs_p + (j )*cs_p; \ + ctype* p_br = p + (i )*rs_p + (j )*cs_p; \ \ - PASTEMAC2(ch,ch,setd_unb_var1)( 0, \ - m_br, \ - n_br, \ - one, \ - p_edge, rs_p, cs_p ); \ -/* - PASTEMAC(ch,fprintm)( stdout, "packm_var3: setting br unit diag", m_br, n_br, \ - p_edge, rs_p, cs_p, "%4.1f", "" ); \ -*/ \ + PASTEMAC(ch,setd)( 0, \ + m_br, \ + n_br, \ + one, \ + p_br, rs_p, cs_p ); \ } \ +/* + PASTEMAC(ch,fprintm)( stdout, "packm_var1: setting br unit diag", m_br, n_br, \ + p_edge, rs_p, cs_p, "%4.1f", "" ); \ +*/ \ /* if ( rs_p == 1 ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", m_panel_max, n_panel_max, \ + PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", m_panel_max, n_panel_max, \ p, rs_p, cs_p, "%4.1f", "" ); \ if ( cs_p == 1 ) \ - PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", m_panel_max, n_panel_max, \ + PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", m_panel_max, n_panel_max, \ p, rs_p, cs_p, "%4.1f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( packm_tri_cxk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( 
\ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ +\ + dim_t i; \ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t psp, ldp; \ +\ + inc_t rs_p11, cs_p11; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + rs_p11 = rs_p; \ + cs_p11 = 1; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. */ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + rs_p11 = 1; \ + cs_p11 = cs_p; \ + } \ +\ + /* Compute the panel stride (ie: the element offset to the imaginary + panel). */ \ + psp = ldp * panel_len_max; \ +\ +\ + /* Pack the panel. */ \ + PASTEMAC(ch,packm_cxk_ri)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ +\ +\ + /* Tweak the panel according to its triangular structure */ \ + { \ + dim_t j = bli_abs( diagoffp ); \ + ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ + ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \ +\ + /* If the diagonal of c is implicitly unit, explicitly set the + the diagonal of the packed panel to kappa. 
*/ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \ + ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \ +\ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + kappa_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + kappa_i, \ + p11_i, rs_p11, cs_p11 ); \ + } \ +\ + /* If requested, invert the diagonal of the packed panel. */ \ + if ( invdiag == TRUE ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ +\ + PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ + } \ + } \ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel.*/ \ + { \ + uplo_t uplop11 = uploc; \ + doff_t diagoffp11 = 0; \ +\ + bli_toggle_uplo( uplop11 ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \ +\ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_i, rs_p11, cs_p11 ); \ + } \ + } \ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. 
This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. */ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ +\ + } \ +\ +\ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. 
*/ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t j = n_panel; \ + dim_t m_br = m_panel_max - i; \ + dim_t n_br = n_panel_max - j; \ + ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ + ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \ +\ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + one_r, \ + p_br_r, rs_p, cs_p ); \ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + zero_r, \ + p_br_i, rs_p, cs_p ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_ri ) + + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ) \ +{ \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ +\ + dim_t i; \ + dim_t panel_dim; \ + dim_t panel_len; \ + dim_t panel_len_max; \ + inc_t incc, ldc; \ + inc_t psp, ldp; \ +\ + inc_t rs_p11, cs_p11; \ +\ +\ + /* If the strides of p indicate row storage, then we are packing to + column panels; otherwise, if the strides indicate column storage, + we are packing to row panels. */ \ + if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ + { \ + /* Prepare to pack to row-stored column panel. */ \ + panel_dim = n_panel; \ + panel_len = m_panel; \ + panel_len_max = m_panel_max; \ + incc = cs_c; \ + ldc = rs_c; \ + ldp = rs_p; \ + rs_p11 = rs_p; \ + cs_p11 = 1; \ + } \ + else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ + { \ + /* Prepare to pack to column-stored row panel. 
*/ \ + panel_dim = m_panel; \ + panel_len = n_panel; \ + panel_len_max = n_panel_max; \ + incc = rs_c; \ + ldc = cs_c; \ + ldp = cs_p; \ + rs_p11 = 1; \ + cs_p11 = cs_p; \ + } \ +\ + /* Compute the panel stride (ie: the element offset to the imaginary + panel). */ \ + psp = ldp * panel_len_max; \ +\ +\ + /* Pack the panel. */ \ + PASTEMAC(ch,packm_cxk_ri3)( conjc, \ + panel_dim, \ + panel_len, \ + kappa, \ + c, incc, ldc, \ + p, psp, ldp ); \ +\ +\ + /* Tweak the panel according to its triangular structure */ \ + { \ + dim_t j = bli_abs( diagoffp ); \ + ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \ + ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \ + ctype_r* p11_ri = ( ctype_r* )p + 2*psp + (j )*ldp; \ +\ + /* If the diagonal of c is implicitly unit, explicitly set the + the diagonal of the packed panel to kappa. */ \ + if ( bli_is_unit_diag( diagc ) ) \ + { \ + ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \ + ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \ +\ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + kappa_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + kappa_i, \ + p11_i, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setd)( 0, \ + m_panel, \ + n_panel, \ + kappa_i, \ + p11_ri, rs_p11, cs_p11 ); \ + } \ +\ + /* If requested, invert the diagonal of the packed panel. Note + that we do not need to update the ri panel since inverted + diagonals are only needed by trsm, which does not use the + p11 section of the ri panel. */ \ + if ( invdiag == TRUE ) \ + { \ + for ( i = 0; i < panel_dim; ++i ) \ + { \ + ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \ + ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \ +\ + PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \ + } \ + } \ +\ + /* Set the region opposite the diagonal of p to zero. To do this, + we need to reference the "unstored" region on the other side of + the diagonal. 
This amounts to toggling uploc and then shifting + the diagonal offset to shrink the newly referenced region (by + one diagonal). Note that this zero-filling is not needed for + trsm, since the unstored region is not referenced by the trsm + micro-kernel; however, zero-filling is needed for trmm, which + uses the gemm micro-kernel.*/ \ + { \ + uplo_t uplop11 = uploc; \ + doff_t diagoffp11 = 0; \ +\ + bli_toggle_uplo( uplop11 ); \ + bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \ +\ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_r, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_i, rs_p11, cs_p11 ); \ + PASTEMAC(chr,setm)( diagoffp11, \ + BLIS_NONUNIT_DIAG, \ + uplop11, \ + panel_dim, \ + panel_dim, \ + zero_r, \ + p11_ri, rs_p11, cs_p11 ); \ + } \ + } \ +\ +\ + /* The packed memory region was acquired/allocated with "aligned" + dimensions (ie: dimensions that were possibly inflated up to a + multiple). When these dimension are inflated, it creates empty + regions along the bottom and/or right edges of the matrix. If + either region exists, we set them to zero. This allows the + micro-kernel to remain simple since it does not need to support + different register blockings for the edge cases. 
*/ \ + if ( m_panel != m_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t m_edge = m_panel_max - i; \ + dim_t n_edge = n_panel_max; \ + ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \ + ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_ri, rs_p, cs_p ); \ + } \ +\ + if ( n_panel != n_panel_max ) \ + { \ + dim_t j = n_panel; \ + dim_t m_edge = m_panel_max; \ + dim_t n_edge = n_panel_max - j; \ + ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \ + ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \ + ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \ +\ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_r, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_i, rs_p, cs_p ); \ + PASTEMAC(chr,setm)( 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + m_edge, \ + n_edge, \ + zero_r, \ + p_edge_ri, rs_p, cs_p ); \ + } \ +\ + /* If this panel is an edge case in both panel dimension and length, + then it must be a bottom-right corner case. Set the part of the + diagonal that extends into the zero-padded region to identity. + NOTE: This is actually only necessary when packing for trsm, as + it helps prevent NaNs and Infs from creeping into the computation. + However, we set the region to identity for trmm as well. Those + 1.0's end up getting muliplied by the 0.0's in the zero-padded + region of the other matrix, so there is no harm in this. 
*/ \ + if ( m_panel != m_panel_max && \ + n_panel != n_panel_max ) \ + { \ + dim_t i = m_panel; \ + dim_t j = n_panel; \ + dim_t m_br = m_panel_max - i; \ + dim_t n_br = n_panel_max - j; \ + ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \ + ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \ +\ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + one_r, \ + p_br_r, rs_p, cs_p ); \ + PASTEMAC(chr,setd)( 0, \ + m_br, \ + n_br, \ + zero_r, \ + p_br_i, rs_p, cs_p ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_ri3 ) + diff --git a/frame/1m/packm/bli_packm_tri_cxk.h b/frame/1m/packm/bli_packm_tri_cxk.h index 67fa87ed6..771d75af0 100644 --- a/frame/1m/packm/bli_packm_tri_cxk.h +++ b/frame/1m/packm/bli_packm_tri_cxk.h @@ -53,3 +53,27 @@ void PASTEMAC(ch,varname)( \ INSERT_GENTPROT_BASIC( packm_tri_cxk ) + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + struc_t strucc, \ + doff_t diagoffp, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + bool_t invdiag, \ + dim_t m_panel, \ + dim_t n_panel, \ + dim_t m_panel_max, \ + dim_t n_panel_max, \ + ctype* restrict kappa, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + ctype* restrict p, inc_t rs_p, inc_t cs_p \ + ); + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_ri ) + +INSERT_GENTPROTCO_BASIC( packm_tri_cxk_ri3 ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_10xk.c b/frame/1m/packm/ukernels/bli_packm_ref_10xk.c index 7ae8262e0..4ac18bf83 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_10xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_10xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -55,16 +55,16 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 
+ 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -74,16 +74,16 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), 
*(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -96,16 +96,16 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( 
*beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -115,16 +115,16 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -133,5 +133,246 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_10xk, packm_ref_10xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_10xk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* 
beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + 
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + 
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const 
inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), 
*(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + 
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r 
+ 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_10xk.h b/frame/1m/packm/ukernels/bli_packm_ref_10xk.h index 926b0c908..5d39a278e 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_10xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_10xk.h @@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_10xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_10xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_10xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_12xk.c b/frame/1m/packm/ukernels/bli_packm_ref_12xk.c index 4a608b5bd..50ade8e99 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_12xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_12xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -55,18 +55,18 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - 
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -76,18 +76,18 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), 
*(pi1 + 9) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -100,18 +100,18 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + 
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -121,18 +121,18 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + 
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -141,5 +141,262 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_12xk, packm_ref_12xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_12xk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( 
*(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 
9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; 
\ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = 
beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + 
pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( 
*beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), 
*(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_12xk.h b/frame/1m/packm/ukernels/bli_packm_ref_12xk.h index e4258ff48..80f82a001 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_12xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_12xk.h @@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_12xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_12xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( 
ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_12xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_14xk.c b/frame/1m/packm/ukernels/bli_packm_ref_14xk.c index be7632d23..5e0dd53f2 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_14xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_14xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -55,20 +55,20 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + 
PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -78,20 +78,20 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + 
PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -104,20 +104,20 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 
6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -127,20 +127,20 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, 
*(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -149,5 +149,278 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_14xk, packm_ref_14xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_14xk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) 
); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + 
PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + 
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, 
*beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), 
*(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 
6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 
6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), 
*(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_14xk.h b/frame/1m/packm/ukernels/bli_packm_ref_14xk.h index a2337722b..e9e546d41 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_14xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_14xk.h @@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_14xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_14xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ 
+\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_14xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_16xk.c b/frame/1m/packm/ukernels/bli_packm_ref_16xk.c index 5373ac41e..f6f8c5eff 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_16xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_16xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -55,22 +55,22 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 14*inca), *(pi1 + 14) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 15*inca), *(pi1 + 15) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( 
*(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -80,22 +80,22 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 13*inca), *(pi1 + 13) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 14*inca), *(pi1 + 14) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 15*inca), *(pi1 + 15) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( 
*(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -108,22 +108,22 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( 
*beta_cast, *(alpha1 + 14*inca), *(pi1 + 14) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 15*inca), *(pi1 + 15) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -133,22 +133,22 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 
7*inca), *(pi1 + 7) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 14*inca), *(pi1 + 14) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 15*inca), *(pi1 + 15) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -157,5 +157,294 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_16xk, packm_ref_16xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_16xk ) + + + +#undef 
GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + 
PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + 
PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2jris)( 
*beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), 
*(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + 
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i 
+ 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + 
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, 
*(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), 
*(pi1_ri +14) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_16xk.h b/frame/1m/packm/ukernels/bli_packm_ref_16xk.h index c572a4a7d..6b4169cbf 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_16xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_16xk.h @@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_16xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_16xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_16xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_2xk.c b/frame/1m/packm/ukernels/bli_packm_ref_2xk.c index 0db8c8b52..3c16a94a4 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_2xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_2xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -55,8 +55,8 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ 
pi1 += ldp; \ @@ -66,8 +66,8 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -80,8 +80,8 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -91,8 +91,8 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -101,5 +101,182 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_2xk, packm_ref_2xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_2xk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( 
PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = 
( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ +\ + 
alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_2xk.h b/frame/1m/packm/ukernels/bli_packm_ref_2xk.h index cd8d26928..e9b267f7e 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_2xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_2xk.h @@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_2xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_2xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_2xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_4xk.c b/frame/1m/packm/ukernels/bli_packm_ref_4xk.c index 162199d72..f17add370 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_4xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_4xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -109,5 +109,198 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_4xk, packm_ref_4xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_4xk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* 
restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i 
+ 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + 
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 
2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_4xk.h b/frame/1m/packm/ukernels/bli_packm_ref_4xk.h index a77b5f826..e39f5a894 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_4xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_4xk.h @@ -44,3 +44,32 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_4xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_4xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_4xk_ri3 ) + diff --git a/frame/1m/packm/ukernels/bli_packm_ref_6xk.c b/frame/1m/packm/ukernels/bli_packm_ref_6xk.c index 700129ebd..2392fb843 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_6xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_6xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -55,12 +55,12 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - 
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -70,12 +70,12 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -88,12 +88,12 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, 
*(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -103,12 +103,12 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -117,5 +117,214 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_6xk, packm_ref_6xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_6xk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* 
restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 
0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* 
beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 
5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, 
*beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_6xk.h b/frame/1m/packm/ukernels/bli_packm_ref_6xk.h index 17dbd5866..1f992ea06 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_6xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_6xk.h @@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_6xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_6xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_6xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_8xk.c b/frame/1m/packm/ukernels/bli_packm_ref_8xk.c index 912b4254d..d9c999fd1 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_8xk.c +++ b/frame/1m/packm/ukernels/bli_packm_ref_8xk.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, varname ) \ +#define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ @@ -55,14 +55,14 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - 
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -72,14 +72,14 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -92,14 +92,14 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) 
); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -109,14 +109,14 @@ void PASTEMAC(ch,varname)( \ { \ for ( ; n != 0; --n ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 
2) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ + PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -125,5 +125,230 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC( packm_ref_8xk, packm_ref_8xk ) +INSERT_GENTFUNC_BASIC0( packm_ref_8xk ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = ( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), 
*(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), 
*(pi1_i + 5) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ + PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri ) + + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ) \ +{ \ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ +\ + ctype* beta_cast = beta; \ + ctype_r* restrict beta_r = ( ctype_r* )beta; \ + ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \ + ctype_r* restrict alpha1_r = 
( ctype_r* )a; \ + ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \ +\ + if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), 
*(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + else \ + { \ + for ( ; n != 0; --n ) \ + { \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 
0) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \ + PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \ +\ + alpha1_r += lda2; \ + alpha1_i += lda2; \ + pi1_r += ldp; \ + pi1_i += ldp; \ + pi1_ri += ldp; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri3 ) diff --git a/frame/1m/packm/ukernels/bli_packm_ref_8xk.h b/frame/1m/packm/ukernels/bli_packm_ref_8xk.h index 442e68c27..572d04a0d 100644 --- a/frame/1m/packm/ukernels/bli_packm_ref_8xk.h +++ b/frame/1m/packm/ukernels/bli_packm_ref_8xk.h @@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( packm_ref_8xk ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + +INSERT_GENTPROTCO_BASIC( packm_ref_8xk_ri ) + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + conj_t conja, \ + dim_t n, \ + void* beta, \ + void* a, inc_t inca, inc_t lda, \ + void* p, inc_t psp, inc_t ldp \ + ); + 
+INSERT_GENTPROTCO_BASIC( packm_ref_8xk_ri3 ) diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index ab5a0fea0..f26755554 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -37,7 +37,7 @@ #include "bli_unpackm_int.h" #include "bli_unpackm_unb_var1.h" -#include "bli_unpackm_blk_var1.h" +//#include "bli_unpackm_blk_var1.h" #include "bli_unpackm_blk_var2.h" diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.c b/frame/1m/unpackm/bli_unpackm_blk_var2.c index 01efdaa3d..9e7869ef6 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var2.c @@ -44,7 +44,8 @@ typedef void (*FUNCPTR_T)( trans_t transc, dim_t m, dim_t n, - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p, + void* p, inc_t rs_p, inc_t cs_p, + inc_t pd_p, inc_t ps_p, void* c, inc_t rs_c, inc_t cs_c ); @@ -82,6 +83,7 @@ void bli_unpackm_blk_var2( obj_t* p, void* buf_p = bli_obj_buffer_at_off( *p ); inc_t rs_p = bli_obj_row_stride( *p ); inc_t cs_p = bli_obj_col_stride( *p ); + inc_t pd_p = bli_obj_panel_dim( *p ); inc_t ps_p = bli_obj_panel_stride( *p ); void* buf_c = bli_obj_buffer_at_off( *c ); @@ -102,7 +104,8 @@ void bli_unpackm_blk_var2( obj_t* p, transc, m_c, n_c, - buf_p, rs_p, cs_p, ps_p, + buf_p, rs_p, cs_p, + pd_p, ps_p, buf_c, rs_c, cs_c ); } @@ -118,25 +121,33 @@ void PASTEMAC(ch,varname )( \ trans_t transc, \ dim_t m, \ dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p, \ + void* p, inc_t rs_p, inc_t cs_p, \ + inc_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c \ ) \ { \ - ctype* one = PASTEMAC(ch,1); \ - ctype* c_cast = c; \ - ctype* p_cast = p; \ - ctype* c_begin; \ - ctype* p_begin; \ - dim_t panel_dim; \ - dim_t panel_len; \ - dim_t iter_dim; \ - doff_t diagoffc_i; \ - dim_t panel_dim_i; \ - dim_t ic, ip; \ - inc_t diagoffc_inc, vs_c; \ - inc_t incc, ldc; \ - dim_t* m_panel; \ - dim_t* n_panel; \ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict c_cast = c; \ + ctype* restrict 
p_cast = p; \ + ctype* restrict c_begin; \ + ctype* restrict p_begin; \ +\ + dim_t iter_dim; \ + dim_t num_iter; \ + dim_t it, ic, ip; \ + dim_t ic0, ip0; \ + doff_t ic_inc, ip_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ + dim_t panel_len; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + inc_t vs_c; \ + inc_t incc, ldc; \ + inc_t ldp; \ + dim_t* m_panel; \ + dim_t* n_panel; \ +\ \ /* If c needs a transposition, induce it so that we can more simply express the remaining parameters and code. */ \ @@ -154,39 +165,51 @@ void PASTEMAC(ch,varname )( \ if ( bli_is_row_stored_f( rs_p, cs_p ) ) \ { \ /* Prepare to unpack from column panels. */ \ - iter_dim = n; \ - panel_len = m; \ - panel_dim = rs_p; \ - incc = cs_c; \ - ldc = rs_c; \ - vs_c = cs_c; \ - diagoffc_inc = -( doff_t)panel_dim; \ - m_panel = &m; \ - n_panel = &panel_dim_i; \ + iter_dim = n; \ + panel_len = m; \ + panel_dim_max = pd_p; \ + incc = cs_c; \ + ldc = rs_c; \ + vs_c = cs_c; \ + diagoffc_inc = -( doff_t)panel_dim_max; \ + ldp = rs_p; \ + m_panel = &m; \ + n_panel = &panel_dim_i; \ } \ else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \ { \ /* Prepare to unpack from row panels. */ \ - iter_dim = m; \ - panel_len = n; \ - panel_dim = cs_p; \ - incc = rs_c; \ - ldc = cs_c; \ - vs_c = rs_c; \ - diagoffc_inc = ( doff_t )panel_dim; \ - m_panel = &panel_dim_i; \ - n_panel = &n; \ + iter_dim = m; \ + panel_len = n; \ + panel_dim_max = pd_p; \ + incc = rs_c; \ + ldc = cs_c; \ + vs_c = rs_c; \ + diagoffc_inc = ( doff_t )panel_dim_max; \ + ldp = cs_p; \ + m_panel = &panel_dim_i; \ + n_panel = &n; \ } \ \ -\ - for ( ic = 0, ip = 0, diagoffc_i = diagoffc; ic < iter_dim; \ - ic += panel_dim, ip += 1, diagoffc_i += diagoffc_inc ) \ + /* Compute the total number of iterations we'll need. */ \ + num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 
1 : 0 ); \ \ { \ - panel_dim_i = bli_min( panel_dim, iter_dim - ic ); \ + ic0 = 0; \ + ic_inc = panel_dim_max; \ + ip0 = 0; \ + ip_inc = 1; \ + } \ \ - p_begin = p_cast + ip * ps_p; \ - c_begin = c_cast + ic * vs_c; \ + for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \ + ic += ic_inc, ip += ip_inc, it += 1 ) \ + { \ + panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ +\ + diagoffc_i = diagoffc + (ip )*diagoffc_inc; \ +\ + p_begin = p_cast + ip * ps_p; \ + c_begin = c_cast + ic * vs_c; \ \ /* If the current panel of C intersects the diagonal AND is upper or lower stored, then we must call scal2m. Otherwise, we can use a @@ -212,7 +235,7 @@ void PASTEMAC(ch,varname )( \ panel_dim_i, \ panel_len, \ one, \ - p_begin, panel_dim, \ + p_begin, ldp, \ c_begin, incc, ldc ); \ } \ \ diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.h b/frame/1m/unpackm/bli_unpackm_blk_var2.h index 69f721136..c50144390 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.h +++ b/frame/1m/unpackm/bli_unpackm_blk_var2.h @@ -48,7 +48,8 @@ void PASTEMAC(ch,varname)( \ trans_t transc, \ dim_t m, \ dim_t n, \ - void* p, inc_t rs_p, inc_t cs_p, inc_t ps_p, \ + void* p, inc_t rs_p, inc_t cs_p, \ + inc_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c \ ); diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/unpackm/bli_unpackm_cxk.c index bc9fa1c6a..75f562db2 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/unpackm/bli_unpackm_cxk.c @@ -47,120 +47,106 @@ typedef void (*FUNCPTR_T)( #undef FUNCPTR_ARRAY_LENGTH #define FUNCPTR_ARRAY_LENGTH 18 -#undef GENARRAY -#define GENARRAY( kername2, kername4, kername6, kername8, \ - kername10, kername12, kername14, kername16 ) \ -\ -static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = \ -{ \ - /* panel width = 0 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 1 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 2 */ \ - { \ - PASTEMAC(s,kername2), \ - PASTEMAC(c,kername2), \ - 
PASTEMAC(d,kername2), \ - PASTEMAC(z,kername2), \ - }, \ - /* panel width = 3 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 4 */ \ - { \ - PASTEMAC(s,kername4), \ - PASTEMAC(c,kername4), \ - PASTEMAC(d,kername4), \ - PASTEMAC(z,kername4), \ - }, \ - /* panel width = 5 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 6 */ \ - { \ - PASTEMAC(s,kername6), \ - PASTEMAC(c,kername6), \ - PASTEMAC(d,kername6), \ - PASTEMAC(z,kername6), \ - }, \ - /* panel width = 7 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 8 */ \ - { \ - PASTEMAC(s,kername8), \ - PASTEMAC(c,kername8), \ - PASTEMAC(d,kername8), \ - PASTEMAC(z,kername8), \ - }, \ - /* panel width = 9 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 10 */ \ - { \ - PASTEMAC(s,kername10), \ - PASTEMAC(c,kername10), \ - PASTEMAC(d,kername10), \ - PASTEMAC(z,kername10), \ - }, \ - /* panel width = 11 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 12 */ \ - { \ - PASTEMAC(s,kername12), \ - PASTEMAC(c,kername12), \ - PASTEMAC(d,kername12), \ - PASTEMAC(z,kername12), \ - }, \ - /* panel width = 13 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 14 */ \ - { \ - PASTEMAC(s,kername14), \ - PASTEMAC(c,kername14), \ - PASTEMAC(d,kername14), \ - PASTEMAC(z,kername14), \ - }, \ - /* panel width = 15 */ \ - { \ - NULL, NULL, NULL, NULL, \ - }, \ - /* panel width = 16 */ \ - { \ - PASTEMAC(s,kername16), \ - PASTEMAC(c,kername16), \ - PASTEMAC(d,kername16), \ - PASTEMAC(z,kername16), \ - }, \ - /* panel width = 17 */ \ - { \ - NULL, NULL, NULL, NULL, \ - } \ +static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = +{ + /* panel width = 0 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 1 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 2 */ + { + BLIS_SUNPACKM_2XK_KERNEL, + BLIS_CUNPACKM_2XK_KERNEL, + BLIS_DUNPACKM_2XK_KERNEL, + BLIS_ZUNPACKM_2XK_KERNEL, + }, + /* panel width = 3 */ + { + NULL, NULL, NULL, 
NULL, + }, + /* panel width = 4 */ + { + BLIS_SUNPACKM_4XK_KERNEL, + BLIS_CUNPACKM_4XK_KERNEL, + BLIS_DUNPACKM_4XK_KERNEL, + BLIS_ZUNPACKM_4XK_KERNEL, + }, + /* panel width = 5 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 6 */ + { + BLIS_SUNPACKM_6XK_KERNEL, + BLIS_CUNPACKM_6XK_KERNEL, + BLIS_DUNPACKM_6XK_KERNEL, + BLIS_ZUNPACKM_6XK_KERNEL, + }, + /* panel width = 7 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 8 */ + { + BLIS_SUNPACKM_8XK_KERNEL, + BLIS_CUNPACKM_8XK_KERNEL, + BLIS_DUNPACKM_8XK_KERNEL, + BLIS_ZUNPACKM_8XK_KERNEL, + }, + /* panel width = 9 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 10 */ + { + BLIS_SUNPACKM_10XK_KERNEL, + BLIS_CUNPACKM_10XK_KERNEL, + BLIS_DUNPACKM_10XK_KERNEL, + BLIS_ZUNPACKM_10XK_KERNEL, + }, + /* panel width = 11 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 12 */ + { + BLIS_SUNPACKM_12XK_KERNEL, + BLIS_CUNPACKM_12XK_KERNEL, + BLIS_DUNPACKM_12XK_KERNEL, + BLIS_ZUNPACKM_12XK_KERNEL, + }, + /* panel width = 13 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 14 */ + { + BLIS_SUNPACKM_14XK_KERNEL, + BLIS_CUNPACKM_14XK_KERNEL, + BLIS_DUNPACKM_14XK_KERNEL, + BLIS_ZUNPACKM_14XK_KERNEL, + }, + /* panel width = 15 */ + { + NULL, NULL, NULL, NULL, + }, + /* panel width = 16 */ + { + BLIS_SUNPACKM_16XK_KERNEL, + BLIS_CUNPACKM_16XK_KERNEL, + BLIS_DUNPACKM_16XK_KERNEL, + BLIS_ZUNPACKM_16XK_KERNEL, + }, + /* panel width = 17 */ + { + NULL, NULL, NULL, NULL, + }, }; -GENARRAY( UNPACKM_2XK_KERNEL, - UNPACKM_4XK_KERNEL, - UNPACKM_6XK_KERNEL, - UNPACKM_8XK_KERNEL, - UNPACKM_10XK_KERNEL, - UNPACKM_12XK_KERNEL, - UNPACKM_14XK_KERNEL, - UNPACKM_16XK_KERNEL ) - - #undef GENTFUNC diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 3b2061d93..f94bbb423 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -43,7 +43,7 @@ typedef void (*FUNCPTR_T)( obj_t* p, static FUNCPTR_T vars[2][3] = { // unblocked optimized unblocked 
blocked - { bli_unpackm_unb_var1, NULL, bli_unpackm_blk_var1, }, + { bli_unpackm_unb_var1, NULL, NULL, }, { NULL, NULL, bli_unpackm_blk_var2, }, }; diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/old/bli_unpackm_blk_var1.c similarity index 100% rename from frame/1m/unpackm/bli_unpackm_blk_var1.c rename to frame/1m/unpackm/old/bli_unpackm_blk_var1.c diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.h b/frame/1m/unpackm/old/bli_unpackm_blk_var1.h similarity index 100% rename from frame/1m/unpackm/bli_unpackm_blk_var1.h rename to frame/1m/unpackm/old/bli_unpackm_blk_var1.h diff --git a/frame/3/gemm/3m/bli_gemm3m.c b/frame/3/gemm/3m/bli_gemm3m.c new file mode 100644 index 000000000..da2b211c3 --- /dev/null +++ b/frame/3/gemm/3m/bli_gemm3m.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3m_cntl; +extern gemm_t* gemm_cntl; + +// +// Define object-based interface. +// +void bli_gemm3m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + gemm_t* cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) cntl = gemm3m_cntl; + else cntl = gemm_cntl; + + bli_gemm_front( alpha, a, b, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( gemm3m, gemm3m ) + diff --git a/frame/3/gemm/3m/bli_gemm3m.h b/frame/3/gemm/3m/bli_gemm3m.h new file mode 100644 index 000000000..f07f9dbbf --- /dev/null +++ b/frame/3/gemm/3m/bli_gemm3m.h @@ -0,0 +1,70 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_gemm3m_cntl.h" + +#include "bli_gemm3m_ukr_ref.h" + + +// +// Prototype object-based interface. +// +void bli_gemm3m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( gemm3m ) + diff --git a/frame/3/gemm/3m/bli_gemm3m_cntl.c b/frame/3/gemm/3m/bli_gemm3m_cntl.c new file mode 100644 index 000000000..9f1f33fec --- /dev/null +++ b/frame/3/gemm/3m/bli_gemm3m_cntl.c @@ -0,0 +1,220 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +blksz_t* gemm3m_mc; +blksz_t* gemm3m_nc; +blksz_t* gemm3m_kc; +blksz_t* gemm3m_mr; +blksz_t* gemm3m_nr; +blksz_t* gemm3m_kr; + +func_t* gemm3m_ukrs; + +packm_t* gemm3m_packa_cntl; +packm_t* gemm3m_packb_cntl; + +gemm_t* gemm3m_cntl_bp_ke; +gemm_t* gemm3m_cntl_op_bp; +gemm_t* gemm3m_cntl_mm_op; +gemm_t* gemm3m_cntl_vl_mm; + +gemm_t* gemm3m_cntl; + + +void bli_gemm3m_cntl_init() +{ + // Create blocksize objects for each dimension. + gemm3m_mc + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_3M_MC_C, BLIS_EXTEND_3M_MC_C, + BLIS_DEFAULT_3M_MC_Z, BLIS_EXTEND_3M_MC_Z ); + + gemm3m_nc + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_3M_NC_C, BLIS_EXTEND_3M_NC_C, + BLIS_DEFAULT_3M_NC_Z, BLIS_EXTEND_3M_NC_Z ); + + gemm3m_kc + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_3M_KC_C, BLIS_EXTEND_3M_KC_C, + BLIS_DEFAULT_3M_KC_Z, BLIS_EXTEND_3M_KC_Z ); + + gemm3m_mr + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_3M_MR_C, BLIS_EXTEND_3M_MR_C, + BLIS_DEFAULT_3M_MR_Z, BLIS_EXTEND_3M_MR_Z ); + + gemm3m_nr + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_3M_NR_C, BLIS_EXTEND_3M_NR_C, + BLIS_DEFAULT_3M_NR_Z, BLIS_EXTEND_3M_NR_Z ); + + gemm3m_kr + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_3M_KR_C, BLIS_EXTEND_3M_KR_C, + BLIS_DEFAULT_3M_KR_Z, BLIS_EXTEND_3M_KR_Z ); + + + + // Create function pointer object for each datatype-specific gemm + // micro-kernel. + gemm3m_ukrs = bli_func_obj_create( NULL, + NULL, + BLIS_CGEMM3M_UKERNEL, + BLIS_ZGEMM3M_UKERNEL ); + + + // Create control tree objects for packm operations. + gemm3m_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_mr, + gemm3m_kr, + TRUE, // densify; used by hemm/symm + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ BLIS_PACKED_ROW_PANELS_3M, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm3m_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kr, + gemm3m_nr, + TRUE, // densify; used by hemm/symm + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_3M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // + // Create a control tree for packing A and B, and streaming C. + // + + // Create control tree object for lowest-level block-panel kernel. + gemm3m_cntl_bp_ke + = + bli_gemm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm3m_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem. + gemm3m_cntl_op_bp + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3m_mc, + NULL, + NULL, + gemm3m_packa_cntl, + gemm3m_packb_cntl, + NULL, + gemm3m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. + gemm3m_cntl_mm_op + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + gemm3m_cntl_op_bp, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. + gemm3m_cntl_vl_mm + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + gemm3m_cntl_mm_op, + NULL ); + + // Alias the "master" gemm control tree to a shorter name. 
+ gemm3m_cntl = gemm3m_cntl_vl_mm; + +} + +void bli_gemm3m_cntl_finalize() +{ + bli_blksz_obj_free( gemm3m_mc ); + bli_blksz_obj_free( gemm3m_nc ); + bli_blksz_obj_free( gemm3m_kc ); + bli_blksz_obj_free( gemm3m_mr ); + bli_blksz_obj_free( gemm3m_nr ); + bli_blksz_obj_free( gemm3m_kr ); + + bli_func_obj_free( gemm3m_ukrs ); + + bli_cntl_obj_free( gemm3m_packa_cntl ); + bli_cntl_obj_free( gemm3m_packb_cntl ); + + bli_cntl_obj_free( gemm3m_cntl_bp_ke ); + bli_cntl_obj_free( gemm3m_cntl_op_bp ); + bli_cntl_obj_free( gemm3m_cntl_mm_op ); + bli_cntl_obj_free( gemm3m_cntl_vl_mm ); + +} + diff --git a/frame/3/gemm/3m/bli_gemm3m_cntl.h b/frame/3/gemm/3m/bli_gemm3m_cntl.h new file mode 100644 index 000000000..3cef983fc --- /dev/null +++ b/frame/3/gemm/3m/bli_gemm3m_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm3m_cntl_init( void ); +void bli_gemm3m_cntl_finalize( void ); + diff --git a/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c new file mode 100644 index 000000000..93fba2a76 --- /dev/null +++ b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c @@ -0,0 +1,208 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ct_r[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + ctype_r ct_i[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ct = 1; \ + const inc_t cs_ct = PASTEMAC(chr,mr); \ +\ +\ + ctype_r ab_r[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + ctype_r ab_i[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = PASTEMAC(chr,mr); \ +\ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = ( bli_auxinfo_ps_a( data ) * 2 ) / 3; \ + const inc_t ps_b = ( bli_auxinfo_ps_b( data ) * 2 ) / 3; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ps_a; \ + ctype_r* restrict a_ri = ( ctype_r* )a + 2*ps_a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \ + ctype_r* restrict b_ri = ( ctype_r* )b + 2*ps_b; \ +\ +
ctype_r* restrict c_r = ( ctype_r* )c; \ + ctype_r* restrict c_i = ( ctype_r* )c + 1; \ +\ + const inc_t rs_c2 = 2 * rs_c; \ + const inc_t cs_c2 = 2 * cs_c; \ +\ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ + ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ +\ + void* a_next = bli_auxinfo_next_a( data ); \ + void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t i, j; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 3m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* Copy the contents of c to a temporary buffer ct. */ \ + if ( !PASTEMAC(chr,eq0)( *beta_i ) ) \ + { \ + /* We can handle a non-zero imaginary component on beta, but to do + so we have to manually scale c and then use beta == 1 for the + micro-kernel calls. */ \ + for ( i = 0; i < m; ++i ) \ + for ( j = 0; j < n; ++j ) \ + PASTEMAC(ch,scal2ris)( *beta_r, \ + *beta_i, \ + *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2), \ + *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct) ); \ +\ + /* Use beta.r == 1.0. */ \ + beta_r = one_r; \ + } \ + else \ + { \ + /* Copy c to ct without scaling. 
*/ \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2), \ + *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct) ); \ + } \ +\ +\ + /* c.r = beta.r * c.r + a.r * b.r - a.i * b.i; + c.i = beta.r * c.i + (a.r + a.i)(b.r + b.i) - a.r * b.r - a.i * b.i; */ \ +\ + bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ +\ + /* ab.r = a.r * b.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_r, \ + b_r, \ + zero_r, \ + ab_r, rs_ab, cs_ab, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_ri, b_ri, *data ); \ +\ + /* ab.i = a.i * b.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_i, \ + b_i, \ + zero_r, \ + ab_i, rs_ab, cs_ab, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ +\ + /* ct.i = a.ri * b.ri; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_ri, \ + b_ri, \ + beta_r, \ + ct_i, rs_ct, cs_ct, \ + data ); \ +\ +\ + /* ct.r = beta.r * ct.r + ab.r; + ct.r = ct.r - ab.i; + ct.i = ct.i - ab.r; + ct.i = ct.i - ab.i; */ \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ + ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ + ctype_r gammat_r = *(ct_r + i*rs_ct + j*cs_ct); \ + ctype_r gammat_i = *(ct_i + i*rs_ct + j*cs_ct); \ +\ + PASTEMAC(chr,scals)( *beta_r, gammat_r ); \ +\ + PASTEMAC(chr,adds)( alphabeta_r, gammat_r ); \ + PASTEMAC(chr,subs)( alphabeta_i, gammat_r ); \ + PASTEMAC(chr,subs)( alphabeta_r, gammat_i ); \ + PASTEMAC(chr,subs)( alphabeta_i, gammat_i ); \ +\ + /* Store the local values (from ct) back to c. 
*/ \ + PASTEMAC(ch,copyris)( gammat_r, \ + gammat_i, \ + *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2) ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm3m_ukr_ref, GEMM_UKERNEL ) + diff --git a/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.h b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.h similarity index 94% rename from kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.h rename to frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.h index f9349a215..228606348 100644 --- a/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.h +++ b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.h @@ -33,8 +33,8 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ dim_t k, \ @@ -46,5 +46,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemm_opt_d4x4 ) +INSERT_GENTPROTCO_BASIC( gemm3m_ukr_ref ) diff --git a/frame/3/gemm/4m/bli_gemm4m.c b/frame/3/gemm/4m/bli_gemm4m.c new file mode 100644 index 000000000..1634bf795 --- /dev/null +++ b/frame/3/gemm/4m/bli_gemm4m.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4m_cntl; +extern gemm_t* gemm_cntl; + +// +// Define object-based interface. +// +void bli_gemm4m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + gemm_t* cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) cntl = gemm4m_cntl; + else cntl = gemm_cntl; + + bli_gemm_front( alpha, a, b, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( gemm4m, gemm4m ) + diff --git a/frame/3/gemm/4m/bli_gemm4m.h b/frame/3/gemm/4m/bli_gemm4m.h new file mode 100644 index 000000000..47e35af0d --- /dev/null +++ b/frame/3/gemm/4m/bli_gemm4m.h @@ -0,0 +1,70 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_gemm4m_cntl.h" + +#include "bli_gemm4m_ukr_ref.h" + + +// +// Prototype object-based interface. +// +void bli_gemm4m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( gemm4m ) + diff --git a/frame/3/gemm/4m/bli_gemm4m_cntl.c b/frame/3/gemm/4m/bli_gemm4m_cntl.c new file mode 100644 index 000000000..1576d181c --- /dev/null +++ b/frame/3/gemm/4m/bli_gemm4m_cntl.c @@ -0,0 +1,219 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +blksz_t* gemm4m_mc; +blksz_t* gemm4m_nc; +blksz_t* gemm4m_kc; +blksz_t* gemm4m_mr; +blksz_t* gemm4m_nr; +blksz_t* gemm4m_kr; + +func_t* gemm4m_ukrs; + +packm_t* gemm4m_packa_cntl; +packm_t* gemm4m_packb_cntl; + +gemm_t* gemm4m_cntl_bp_ke; +gemm_t* gemm4m_cntl_op_bp; +gemm_t* gemm4m_cntl_mm_op; +gemm_t* gemm4m_cntl_vl_mm; + +gemm_t* gemm4m_cntl; + + +void bli_gemm4m_cntl_init() +{ + // Create blocksize objects for each dimension. + gemm4m_mc + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_4M_MC_C, BLIS_EXTEND_4M_MC_C, + BLIS_DEFAULT_4M_MC_Z, BLIS_EXTEND_4M_MC_Z ); + + gemm4m_nc + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_4M_NC_C, BLIS_EXTEND_4M_NC_C, + BLIS_DEFAULT_4M_NC_Z, BLIS_EXTEND_4M_NC_Z ); + + gemm4m_kc + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_4M_KC_C, BLIS_EXTEND_4M_KC_C, + BLIS_DEFAULT_4M_KC_Z, BLIS_EXTEND_4M_KC_Z ); + + gemm4m_mr + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_4M_MR_C, BLIS_EXTEND_4M_MR_C, + BLIS_DEFAULT_4M_MR_Z, BLIS_EXTEND_4M_MR_Z ); + + gemm4m_nr + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_4M_NR_C, BLIS_EXTEND_4M_NR_C, + BLIS_DEFAULT_4M_NR_Z, BLIS_EXTEND_4M_NR_Z ); + + gemm4m_kr + = + bli_blksz_obj_create( 0, 0, 0, 0, + BLIS_DEFAULT_4M_KR_C, BLIS_EXTEND_4M_KR_C, + BLIS_DEFAULT_4M_KR_Z, BLIS_EXTEND_4M_KR_Z ); + + + + // Create function pointer object for each datatype-specific gemm + // micro-kernel. + gemm4m_ukrs = bli_func_obj_create( NULL, + NULL, + BLIS_CGEMM4M_UKERNEL, + BLIS_ZGEMM4M_UKERNEL ); + + + // Create control tree objects for packm operations. + gemm4m_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + gemm4m_mr, + gemm4m_kr, + TRUE, // densify; used by hemm/symm + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ BLIS_PACKED_ROW_PANELS_4M, + BLIS_BUFFER_FOR_A_BLOCK ); + + gemm4m_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + gemm4m_kr, + gemm4m_nr, + TRUE, // densify; used by hemm/symm + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_4M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // + // Create a control tree for packing A and B, and streaming C. + // + + // Create control tree object for lowest-level block-panel kernel. + gemm4m_cntl_bp_ke + = + bli_gemm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm4m_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem. + gemm4m_cntl_op_bp + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4m_mc, + NULL, + NULL, + gemm4m_packa_cntl, + gemm4m_packb_cntl, + NULL, + gemm4m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. + gemm4m_cntl_mm_op + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + gemm4m_cntl_op_bp, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. + gemm4m_cntl_vl_mm + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + gemm4m_cntl_mm_op, + NULL ); + + // Alias the "master" gemm control tree to a shorter name. 
+ gemm4m_cntl = gemm4m_cntl_vl_mm; + +} + +void bli_gemm4m_cntl_finalize() +{ + bli_blksz_obj_free( gemm4m_mc ); + bli_blksz_obj_free( gemm4m_nc ); + bli_blksz_obj_free( gemm4m_kc ); + bli_blksz_obj_free( gemm4m_mr ); + bli_blksz_obj_free( gemm4m_nr ); + bli_blksz_obj_free( gemm4m_kr ); + + bli_func_obj_free( gemm4m_ukrs ); + + bli_cntl_obj_free( gemm4m_packa_cntl ); + bli_cntl_obj_free( gemm4m_packb_cntl ); + + bli_cntl_obj_free( gemm4m_cntl_bp_ke ); + bli_cntl_obj_free( gemm4m_cntl_op_bp ); + bli_cntl_obj_free( gemm4m_cntl_mm_op ); + bli_cntl_obj_free( gemm4m_cntl_vl_mm ); +} + diff --git a/frame/3/gemm/4m/bli_gemm4m_cntl.h b/frame/3/gemm/4m/bli_gemm4m_cntl.h new file mode 100644 index 000000000..87cb6c05d --- /dev/null +++ b/frame/3/gemm/4m/bli_gemm4m_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_gemm4m_cntl_init( void ); +void bli_gemm4m_cntl_finalize( void ); + diff --git a/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c new file mode 100644 index 000000000..246750ca5 --- /dev/null +++ b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c @@ -0,0 +1,192 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ct_r[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + ctype_r ct_i[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ct = 1; \ + const inc_t cs_ct = PASTEMAC(chr,mr); \ +\ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = bli_auxinfo_ps_a( data ); \ + const inc_t ps_b = bli_auxinfo_ps_b( data ); \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ + ctype_r* restrict a_i = ( ctype_r* )a + ps_a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ + ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \ +\ + ctype_r* restrict c_r = ( ctype_r* )c; \ + ctype_r* restrict c_i = ( ctype_r* )c + 1; \ +\ + const inc_t rs_c2 = 2 * rs_c; \ + const inc_t cs_c2 = 2 * cs_c; \ +\ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ + ctype_r* restrict beta_i = 
&PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \ +\ + void* a_next = bli_auxinfo_next_a( data ); \ + void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t i, j; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 4m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* Copy the contents of c to a temporary buffer ct. */ \ + if ( !PASTEMAC(chr,eq0)( *beta_i ) ) \ + { \ + /* We can handle a non-zero imaginary component on beta, but to do + so we have to manually scale c and then use beta == 1 for the + micro-kernel calls. */ \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + PASTEMAC(ch,scal2ris)( *beta_r, \ + *beta_i, \ + *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2), \ + *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct) ); \ +\ + /* Use beta.r == 1.0. */ \ + beta_r = one_r; \ + } \ + else \ + { \ + /* Copy c to ct without scaling. 
*/ \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2), \ + *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct) ); \ + } \ +\ +\ + /* c.r = beta.r * c.r + alpha.r * a.r * b.r + - alpha.r * a.i * b.i; + c.i = beta.r * c.i + alpha.r * a.r * b.i + + alpha.r * a.i * b.r; */ \ +\ + bli_auxinfo_set_next_ab( a_r, b_i, *data ); \ +\ + /* c.r = beta * c.r + a.r * b.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_r, \ + b_r, \ + beta_r, \ + ct_r, rs_ct, cs_ct, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_i, b_r, *data ); \ +\ + /* c.i = beta * c.i + a.r * b.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_r, \ + b_i, \ + beta_r, \ + ct_i, rs_ct, cs_ct, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ +\ + /* c.i = 1.0 * c.i + a.i * b.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + alpha_r, \ + a_i, \ + b_r, \ + one_r, \ + ct_i, rs_ct, cs_ct, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ +\ + /* c.r = 1.0 * c.r - a.i * b.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + &m_alpha_r, \ + a_i, \ + b_i, \ + one_r, \ + ct_r, rs_ct, cs_ct, \ + data ); \ +\ +\ + /* Copy the final result in ct back to c. 
*/ \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct), \ + *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2) ); \ +} + +INSERT_GENTFUNCCO_BASIC( gemm4m_ukr_ref, GEMM_UKERNEL ) + diff --git a/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.h similarity index 94% rename from kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h rename to frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.h index 7e3491938..4049be320 100644 --- a/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h +++ b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.h @@ -33,8 +33,8 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ dim_t k, \ @@ -46,5 +46,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemm_opt_8x4_ref_u4_nodupl_avx1 ) +INSERT_GENTPROTCO_BASIC( gemm4m_ukr_ref ) diff --git a/frame/3/gemm/bli_gemm.c b/frame/3/gemm/bli_gemm.c index 4ee188ff4..e028ba0ee 100644 --- a/frame/3/gemm/bli_gemm.c +++ b/frame/3/gemm/bli_gemm.c @@ -45,6 +45,17 @@ void bli_gemm( obj_t* alpha, obj_t* beta, obj_t* c ) { + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *c ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *c ) || +#endif + FALSE + ) + return bli_gemm4m( alpha, a, b, beta, c ); + bli_gemm_front( alpha, a, b, beta, c, gemm_cntl ); } @@ -88,11 +99,11 @@ void PASTEMAC(ch,opname)( \ bli_obj_set_conjtrans( transa, ao ); \ bli_obj_set_conjtrans( transb, bo ); \ \ - PASTEMAC0(varname)( &alphao, \ - &ao, \ - &bo, \ - &betao, \ - &co ); \ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ } INSERT_GENTFUNC_BASIC( gemm, gemm ) diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index 64a97777d..5c19af504 100644 --- 
a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -45,7 +45,10 @@ #include "bli_gemm_ker_var2.h" #include "bli_gemm_ker_var5.h" -#include "bli_gemm_ref_mxn.h" +#include "bli_gemm_ukr_ref.h" + +#include "bli_gemm4m.h" +#include "bli_gemm3m.h" // diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 19458e52e..42891c924 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -95,11 +95,13 @@ void bli_gemm_cntl_init() BLIS_DEFAULT_NR_C, BLIS_EXTEND_NR_C, BLIS_DEFAULT_NR_Z, BLIS_EXTEND_NR_Z ); - gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, BLIS_EXTEND_KR_S, - BLIS_DEFAULT_KR_D, BLIS_EXTEND_KR_D, - BLIS_DEFAULT_KR_C, BLIS_EXTEND_KR_C, - BLIS_DEFAULT_KR_Z, BLIS_EXTEND_KR_Z ); + gemm_kr = bli_blksz_obj_create( BLIS_DEFAULT_KR_S, 0, + BLIS_DEFAULT_KR_D, 0, + BLIS_DEFAULT_KR_C, 0, + BLIS_DEFAULT_KR_Z, 0 ); + // Create function pointer object for each datatype-specific gemm + // micro-kernel. gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_DGEMM_UKERNEL, BLIS_CGEMM_UKERNEL, diff --git a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c b/frame/3/gemm/ukernels/bli_gemm_ukr_ref.c similarity index 99% rename from frame/3/gemm/ukernels/bli_gemm_ref_mxn.c rename to frame/3/gemm/ukernels/bli_gemm_ukr_ref.c index 2a4f8818c..f1c383d1e 100644 --- a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.c +++ b/frame/3/gemm/ukernels/bli_gemm_ukr_ref.c @@ -122,5 +122,5 @@ void PASTEMAC(ch,varname)( \ } \ } -INSERT_GENTFUNC_BASIC0( gemm_ref_mxn ) +INSERT_GENTFUNC_BASIC0( gemm_ukr_ref ) diff --git a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.h b/frame/3/gemm/ukernels/bli_gemm_ukr_ref.h similarity index 98% rename from frame/3/gemm/ukernels/bli_gemm_ref_mxn.h rename to frame/3/gemm/ukernels/bli_gemm_ukr_ref.h index cad431ee8..797439b09 100644 --- a/frame/3/gemm/ukernels/bli_gemm_ref_mxn.h +++ b/frame/3/gemm/ukernels/bli_gemm_ukr_ref.h @@ -49,5 +49,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemm_ref_mxn ) 
+INSERT_GENTPROT_BASIC( gemm_ukr_ref ) diff --git a/frame/3/hemm/3m/bli_hemm3m.c b/frame/3/hemm/3m/bli_hemm3m.c new file mode 100644 index 000000000..f7a96ea7a --- /dev/null +++ b/frame/3/hemm/3m/bli_hemm3m.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3m_cntl; + +// +// Define object-based interface. 
+// +void bli_hemm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_hemm_front( side, alpha, a, b, beta, c, + gemm3m_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( hemm3m, hemm3m ) + diff --git a/frame/3/hemm/3m/bli_hemm3m.h b/frame/3/hemm/3m/bli_hemm3m.h new file mode 100644 index 000000000..88e9fcdfb --- /dev/null +++ b/frame/3/hemm/3m/bli_hemm3m.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_hemm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+//
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          side_t  side, \
+                          uplo_t  uploa, \
+                          conj_t  conja, \
+                          trans_t transb, \
+                          dim_t   m, \
+                          dim_t   n, \
+                          ctype*  alpha, \
+                          ctype*  a, inc_t rs_a, inc_t cs_a, \
+                          ctype*  b, inc_t rs_b, inc_t cs_b, \
+                          ctype*  beta, \
+                          ctype*  c, inc_t rs_c, inc_t cs_c \
+                        );
+
+INSERT_GENTPROT_BASIC( hemm3m )
+
diff --git a/frame/3/hemm/4m/bli_hemm4m.c b/frame/3/hemm/4m/bli_hemm4m.c
new file mode 100644
index 000000000..1a54e2380
--- /dev/null
+++ b/frame/3/hemm/4m/bli_hemm4m.c
@@ -0,0 +1,121 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+extern gemm_t* gemm4m_cntl;
+extern gemm_t* gemm_cntl;
+
+//
+// Define object-based interface.
+//
+// bli_hemm4m() forwards the hemm operands to bli_hemm_front() together
+// with a gemm control tree selected from the domain of c.
+//
+void bli_hemm4m( side_t  side,
+                 obj_t*  alpha,
+                 obj_t*  a,
+                 obj_t*  b,
+                 obj_t*  beta,
+                 obj_t*  c )
+{
+	gemm_t* cntl;
+
+	// Since 4m only applies to the complex domain, we use the regular
+	// control tree for real domain cases.
+	if ( bli_obj_is_complex( *c ) ) cntl = gemm4m_cntl;
+	else                            cntl = gemm_cntl;
+
+	bli_hemm_front( side, alpha, a, b, beta, c,
+	                cntl );
+}
+
+//
+// Define BLAS-like interfaces with homogeneous-typed operands.
+//
+// Each instance wraps the caller's buffers in obj_t's, sets the uplo and
+// conj properties on a and the conjtrans property on b, marks a as
+// Hermitian, and hands the objects to PASTEMAC0(opname) -- presumably the
+// object-based function above. The dimension of a is set from side, m, n.
+//
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, varname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          side_t  side, \
+                          uplo_t  uploa, \
+                          conj_t  conja, \
+                          trans_t transb, \
+                          dim_t   m, \
+                          dim_t   n, \
+                          ctype*  alpha, \
+                          ctype*  a, inc_t rs_a, inc_t cs_a, \
+                          ctype*  b, inc_t rs_b, inc_t cs_b, \
+                          ctype*  beta, \
+                          ctype*  c, inc_t rs_c, inc_t cs_c \
+                        ) \
+{ \
+	const num_t dt = PASTEMAC(ch,type); \
+\
+	obj_t       alphao, ao, bo, betao, co; \
+\
+	dim_t       mn_a; \
+	dim_t       m_b, n_b; \
+\
+	bli_set_dim_with_side(   side,   m, n, mn_a ); \
+	bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
+\
+	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
+	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao  ); \
+\
+	bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
+	bli_obj_create_with_attached_buffer( dt, m_b,  n_b,  b, rs_b, cs_b, &bo ); \
+	bli_obj_create_with_attached_buffer( dt, m,    n,    c, rs_c, cs_c, &co ); \
+\
+	bli_obj_set_uplo( uploa, ao ); \
+	bli_obj_set_conj( conja, ao ); \
+	bli_obj_set_conjtrans( transb, bo ); \
+\
+	bli_obj_set_struc( BLIS_HERMITIAN, ao ); \
+\
+	PASTEMAC0(opname)( side, \
+	                   &alphao, \
+	                   &ao, \
+	                   &bo, \
+	                   &betao, \
+	                   &co ); \
+}
+
+INSERT_GENTFUNC_BASIC( hemm4m, hemm4m )
+
diff --git a/frame/3/hemm/4m/bli_hemm4m.h b/frame/3/hemm/4m/bli_hemm4m.h
new file mode 100644
index 000000000..2fa4cdbc0
--- /dev/null
+++ b/frame/3/hemm/4m/bli_hemm4m.h
@@ -0,0 +1,67 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+
+//
+// Prototype object-based interface.
+//
+void bli_hemm4m( side_t  side,
+                 obj_t*  alpha,
+                 obj_t*  a,
+                 obj_t*  b,
+                 obj_t*  beta,
+                 obj_t*  c );
+
+//
+// Prototype BLAS-like interfaces with homogeneous-typed operands.
+//
+#undef GENTPROT
+#define GENTPROT( ctype, ch, opname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          side_t  side, \
+                          uplo_t  uploa, \
+                          conj_t  conja, \
+                          trans_t transb, \
+                          dim_t   m, \
+                          dim_t   n, \
+                          ctype*  alpha, \
+                          ctype*  a, inc_t rs_a, inc_t cs_a, \
+                          ctype*  b, inc_t rs_b, inc_t cs_b, \
+                          ctype*  beta, \
+                          ctype*  c, inc_t rs_c, inc_t cs_c \
+                        );
+
+INSERT_GENTPROT_BASIC( hemm4m )
+
diff --git a/frame/3/hemm/bli_hemm.c b/frame/3/hemm/bli_hemm.c
index 5516b222c..5cc32f6aa 100644
--- a/frame/3/hemm/bli_hemm.c
+++ b/frame/3/hemm/bli_hemm.c
@@ -46,6 +46,26 @@ void bli_hemm( side_t  side,
                obj_t*  beta,
                obj_t*  c )
 {
+	// Route the problem to the 4m implementation when c has a complex
+	// datatype whose BLIS_ENABLE_[SD]COMPLEX_VIA_4M macro is defined. The
+	// trailing FALSE terminates the || chain, so the condition still
+	// compiles (and is always false) when neither macro is defined.
+	if (
+#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M
+	     bli_obj_is_scomplex( *c ) ||
+#endif
+#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M
+	     bli_obj_is_dcomplex( *c ) ||
+#endif
+	     FALSE
+	   )
+	{
+		// bli_hemm4m() returns void, so call it and return separately;
+		// ISO C does not allow "return <expr>;" in a void function.
+		bli_hemm4m( side, alpha, a, b, beta, c );
+		return;
+	}
+
 	bli_hemm_front( side, alpha, a, b, beta, c,
 	                gemm_cntl );
 }
diff --git a/frame/3/hemm/bli_hemm.h b/frame/3/hemm/bli_hemm.h
index 7e94c9a5d..4fbd1de6f 100644
--- a/frame/3/hemm/bli_hemm.h
+++ b/frame/3/hemm/bli_hemm.h
@@ -35,6 +35,9 @@
 #include "bli_hemm_check.h"
 #include "bli_hemm_front.h"
 
+#include "bli_hemm4m.h"
+#include "bli_hemm3m.h"
+
 
 //
 // Prototype object-based interface.
diff --git a/frame/3/her2k/3m/bli_her2k3m.c b/frame/3/her2k/3m/bli_her2k3m.c
new file mode 100644
index 000000000..d8b31fbf3
--- /dev/null
+++ b/frame/3/her2k/3m/bli_her2k3m.c
@@ -0,0 +1,111 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk3m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_her2k3m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. 
+ if ( bli_obj_is_complex( *c ) ) cntl = herk3m_cntl; + else cntl = herk_cntl; + + bli_her2k_front( alpha, a, b, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( her2k3m, her2k3m ) + diff --git a/frame/3/her2k/3m/bli_her2k3m.h b/frame/3/her2k/3m/bli_her2k3m.h new file mode 100644 index 000000000..4a4dbd3cf --- /dev/null +++ b/frame/3/her2k/3m/bli_her2k3m.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_her2k3m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROTR +#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROTR_BASIC( her2k3m ) + diff --git a/frame/3/her2k/4m/bli_her2k4m.c b/frame/3/her2k/4m/bli_her2k4m.c new file mode 100644 index 000000000..43aaaca05 --- /dev/null +++ b/frame/3/her2k/4m/bli_her2k4m.c @@ -0,0 +1,111 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk4m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_her2k4m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) cntl = herk4m_cntl; + else cntl = herk_cntl; + + bli_her2k_front( alpha, a, b, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( her2k4m, her2k4m ) + diff --git a/frame/3/her2k/4m/bli_her2k4m.h b/frame/3/her2k/4m/bli_her2k4m.h new file mode 100644 index 000000000..eb5678ac7 --- /dev/null +++ b/frame/3/her2k/4m/bli_her2k4m.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_her2k4m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+//
+#undef GENTPROTR
+#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
+\
+void PASTEMAC(ch,opname)( \
+                          uplo_t   uploc, \
+                          trans_t  transa, \
+                          trans_t  transb, \
+                          dim_t    m, \
+                          dim_t    k, \
+                          ctype*   alpha, \
+                          ctype*   a, inc_t rs_a, inc_t cs_a, \
+                          ctype*   b, inc_t rs_b, inc_t cs_b, \
+                          ctype_r* beta, \
+                          ctype*   c, inc_t rs_c, inc_t cs_c \
+                        );
+
+INSERT_GENTPROTR_BASIC( her2k4m )
+
diff --git a/frame/3/her2k/bli_her2k.c b/frame/3/her2k/bli_her2k.c
index 700d4176c..8c8834b79 100644
--- a/frame/3/her2k/bli_her2k.c
+++ b/frame/3/her2k/bli_her2k.c
@@ -46,6 +46,26 @@ void bli_her2k( obj_t*  alpha,
                 obj_t*  beta,
                 obj_t*  c )
 {
+	// Route the problem to the 4m implementation when c has a complex
+	// datatype whose BLIS_ENABLE_[SD]COMPLEX_VIA_4M macro is defined. The
+	// trailing FALSE terminates the || chain, so the condition still
+	// compiles (and is always false) when neither macro is defined.
+	if (
+#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M
+	     bli_obj_is_scomplex( *c ) ||
+#endif
+#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M
+	     bli_obj_is_dcomplex( *c ) ||
+#endif
+	     FALSE
+	   )
+	{
+		// bli_her2k4m() returns void, so call it and return separately;
+		// ISO C does not allow "return <expr>;" in a void function.
+		bli_her2k4m( alpha, a, b, beta, c );
+		return;
+	}
+
 	bli_her2k_front( alpha, a, b, beta, c,
 	                 herk_cntl );
 }
diff --git a/frame/3/her2k/bli_her2k.h b/frame/3/her2k/bli_her2k.h
index 3748e34fc..6c3b0d2c7 100644
--- a/frame/3/her2k/bli_her2k.h
+++ b/frame/3/her2k/bli_her2k.h
@@ -49,6 +49,9 @@
 #include "bli_her2k_u_ker_var2.h"
 */
 
+#include "bli_her2k4m.h"
+#include "bli_her2k3m.h"
+
 
 //
 // Prototype object-based interface.
diff --git a/frame/3/herk/3m/bli_herk3m.c b/frame/3/herk/3m/bli_herk3m.c
new file mode 100644
index 000000000..62530faf8
--- /dev/null
+++ b/frame/3/herk/3m/bli_herk3m.c
@@ -0,0 +1,103 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk3m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_herk3m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) cntl = herk3m_cntl; + else cntl = herk_cntl; + + bli_herk_front( alpha, a, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype_r* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( herk3m, herk3m ) + diff --git a/config/template/kernels/1/bli_dotv_opt_var1.h b/frame/3/herk/3m/bli_herk3m.h similarity index 67% rename from config/template/kernels/1/bli_dotv_opt_var1.h rename to frame/3/herk/3m/bli_herk3m.h index 488cd95b4..71f83f2bf 100644 --- a/config/template/kernels/1/bli_dotv_opt_var1.h +++ b/frame/3/herk/3m/bli_herk3m.h @@ -32,28 +32,34 @@ */ +#include "bli_herk3m_cntl.h" + // -// Prototype dotv kernel interfaces. +// Prototype object-based interface. // -#undef GENTPROT3 -#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \ +void bli_herk3m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROTR +#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy, \ - ctype_r* restrict rho \ - ); +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype_r* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); -INSERT_GENTPROT3_BASIC( dotv_opt_var1 ) +INSERT_GENTPROTR_BASIC( herk3m ) -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( dotv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( dotv_opt_var1 ) -#endif diff --git a/frame/3/herk/3m/bli_herk3m_cntl.c b/frame/3/herk/3m/bli_herk3m_cntl.c new file mode 100644 index 000000000..0dffc9820 --- /dev/null +++ b/frame/3/herk/3m/bli_herk3m_cntl.c @@ -0,0 +1,158 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern blksz_t* gemm3m_mc; +extern blksz_t* gemm3m_nc; +extern blksz_t* gemm3m_kc; +extern blksz_t* gemm3m_mr; +extern blksz_t* gemm3m_nr; +extern blksz_t* gemm3m_kr; + +extern func_t* gemm3m_ukrs; + +packm_t* herk3m_packa_cntl; +packm_t* herk3m_packb_cntl; + +herk_t* herk3m_cntl_bp_ke; +herk_t* herk3m_cntl_op_bp; +herk_t* herk3m_cntl_mm_op; +herk_t* herk3m_cntl_vl_mm; + +herk_t* herk3m_cntl; + + +void bli_herk3m_cntl_init() +{ + // Create control tree objects for packm operations. + herk3m_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_mr, + gemm3m_kr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_3M, + BLIS_BUFFER_FOR_A_BLOCK ); + + herk3m_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kr, + gemm3m_nr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ BLIS_PACKED_COL_PANELS_3M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + herk3m_cntl_bp_ke + = + bli_herk_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm3m_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem. + herk3m_cntl_op_bp + = + bli_herk_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3m_mc, + NULL, + NULL, + herk3m_packa_cntl, + herk3m_packb_cntl, + NULL, + herk3m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. + herk3m_cntl_mm_op + = + bli_herk_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + herk3m_cntl_op_bp, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. + herk3m_cntl_vl_mm + = + bli_herk_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + herk3m_cntl_mm_op, + NULL ); + + // Alias the "master" herk control tree to a shorter name. + herk3m_cntl = herk3m_cntl_vl_mm; +} + +void bli_herk3m_cntl_finalize() +{ + bli_cntl_obj_free( herk3m_packa_cntl ); + bli_cntl_obj_free( herk3m_packb_cntl ); + + bli_cntl_obj_free( herk3m_cntl_bp_ke ); + bli_cntl_obj_free( herk3m_cntl_op_bp ); + bli_cntl_obj_free( herk3m_cntl_mm_op ); + bli_cntl_obj_free( herk3m_cntl_vl_mm ); +} + diff --git a/frame/3/herk/3m/bli_herk3m_cntl.h b/frame/3/herk/3m/bli_herk3m_cntl.h new file mode 100644 index 000000000..91ea940f1 --- /dev/null +++ b/frame/3/herk/3m/bli_herk3m_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_herk3m_cntl_init( void ); +void bli_herk3m_cntl_finalize( void ); + diff --git a/frame/3/herk/4m/bli_herk4m.c b/frame/3/herk/4m/bli_herk4m.c new file mode 100644 index 000000000..33868cbc4 --- /dev/null +++ b/frame/3/herk/4m/bli_herk4m.c @@ -0,0 +1,103 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk4m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_herk4m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. 
+ if ( bli_obj_is_complex( *c ) ) cntl = herk4m_cntl; + else cntl = herk_cntl; + + bli_herk_front( alpha, a, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype_r* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_HERMITIAN, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNCR_BASIC( herk4m, herk4m ) + diff --git a/frame/3/herk/4m/bli_herk4m.h b/frame/3/herk/4m/bli_herk4m.h new file mode 100644 index 000000000..d77d3353f --- /dev/null +++ b/frame/3/herk/4m/bli_herk4m.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_herk4m_cntl.h" + + +// +// Prototype object-based interface. +// +void bli_herk4m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROTR +#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype_r* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROTR_BASIC( herk4m ) + diff --git a/frame/3/herk/4m/bli_herk4m_cntl.c b/frame/3/herk/4m/bli_herk4m_cntl.c new file mode 100644 index 000000000..26de5aab9 --- /dev/null +++ b/frame/3/herk/4m/bli_herk4m_cntl.c @@ -0,0 +1,158 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern blksz_t* gemm4m_mc; +extern blksz_t* gemm4m_nc; +extern blksz_t* gemm4m_kc; +extern blksz_t* gemm4m_mr; +extern blksz_t* gemm4m_nr; +extern blksz_t* gemm4m_kr; + +extern func_t* gemm4m_ukrs; + +packm_t* herk4m_packa_cntl; +packm_t* herk4m_packb_cntl; + +herk_t* herk4m_cntl_bp_ke; +herk_t* herk4m_cntl_op_bp; +herk_t* herk4m_cntl_mm_op; +herk_t* herk4m_cntl_vl_mm; + +herk_t* herk4m_cntl; + + +void bli_herk4m_cntl_init() +{ + // Create control tree objects for packm operations. + herk4m_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + gemm4m_mr, + gemm4m_kr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_4M, + BLIS_BUFFER_FOR_A_BLOCK ); + + herk4m_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + gemm4m_kr, + gemm4m_nr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_4M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. 
+ herk4m_cntl_bp_ke + = + bli_herk_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm4m_ukrs, + NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem. + herk4m_cntl_op_bp + = + bli_herk_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4m_mc, + NULL, + NULL, + herk4m_packa_cntl, + herk4m_packb_cntl, + NULL, + herk4m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates. + herk4m_cntl_mm_op + = + bli_herk_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + herk4m_cntl_op_bp, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems. + herk4m_cntl_vl_mm + = + bli_herk_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + herk4m_cntl_mm_op, + NULL ); + + // Alias the "master" herk control tree to a shorter name. + herk4m_cntl = herk4m_cntl_vl_mm; +} + +void bli_herk4m_cntl_finalize() +{ + bli_cntl_obj_free( herk4m_packa_cntl ); + bli_cntl_obj_free( herk4m_packb_cntl ); + + bli_cntl_obj_free( herk4m_cntl_bp_ke ); + bli_cntl_obj_free( herk4m_cntl_op_bp ); + bli_cntl_obj_free( herk4m_cntl_mm_op ); + bli_cntl_obj_free( herk4m_cntl_vl_mm ); +} + diff --git a/frame/3/herk/4m/bli_herk4m_cntl.h b/frame/3/herk/4m/bli_herk4m_cntl.h new file mode 100644 index 000000000..c45c5ff19 --- /dev/null +++ b/frame/3/herk/4m/bli_herk4m_cntl.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void bli_herk4m_cntl_init( void ); +void bli_herk4m_cntl_finalize( void ); + diff --git a/frame/3/herk/bli_herk.c b/frame/3/herk/bli_herk.c index 4078ebebf..5a089a9b3 100644 --- a/frame/3/herk/bli_herk.c +++ b/frame/3/herk/bli_herk.c @@ -44,6 +44,17 @@ void bli_herk( obj_t* alpha, obj_t* beta, obj_t* c ) { + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *c ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *c ) || +#endif + FALSE + ) + return bli_herk4m( alpha, a, beta, c ); + bli_herk_front( alpha, a, beta, c, herk_cntl ); } diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h index f0024d29d..b010fd037 100644 --- a/frame/3/herk/bli_herk.h +++ b/frame/3/herk/bli_herk.h @@ -47,6 +47,9 @@ #include "bli_herk_l_ker_var2.h" #include "bli_herk_u_ker_var2.h" +#include "bli_herk4m.h" +#include "bli_herk3m.h" + // // Prototype object-based interface. diff --git a/frame/3/symm/3m/bli_symm3m.c b/frame/3/symm/3m/bli_symm3m.c new file mode 100644 index 000000000..5d170f8b6 --- /dev/null +++ b/frame/3/symm/3m/bli_symm3m.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm3m_cntl; + +// +// Define object-based interface. +// +void bli_symm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_symm_front( side, alpha, a, b, beta, c, + gemm3m_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( symm3m, symm3m ) + diff --git a/frame/3/symm/3m/bli_symm3m.h b/frame/3/symm/3m/bli_symm3m.h new file mode 100644 index 000000000..cd5dcd4c9 --- /dev/null +++ b/frame/3/symm/3m/bli_symm3m.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_symm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( symm3m ) + diff --git a/frame/3/symm/4m/bli_symm4m.c b/frame/3/symm/4m/bli_symm4m.c new file mode 100644 index 000000000..8525d4e2f --- /dev/null +++ b/frame/3/symm/4m/bli_symm4m.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern gemm_t* gemm4m_cntl; + +// +// Define object-based interface. +// +void bli_symm4m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + bli_symm_front( side, alpha, a, b, beta, c, + gemm4m_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_conj( conja, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \ +\ + 
PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( symm4m, symm4m ) + diff --git a/frame/3/symm/4m/bli_symm4m.h b/frame/3/symm/4m/bli_symm4m.h new file mode 100644 index 000000000..0cd0dc091 --- /dev/null +++ b/frame/3/symm/4m/bli_symm4m.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. 
+// +void bli_symm4m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( symm4m ) + diff --git a/frame/3/symm/bli_symm.c b/frame/3/symm/bli_symm.c index a70f22d34..000d5de3a 100644 --- a/frame/3/symm/bli_symm.c +++ b/frame/3/symm/bli_symm.c @@ -46,6 +46,17 @@ void bli_symm( side_t side, obj_t* beta, obj_t* c ) { + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *c ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *c ) || +#endif + FALSE + ) + return bli_symm4m( side, alpha, a, b, beta, c ); + bli_symm_front( side, alpha, a, b, beta, c, gemm_cntl ); } diff --git a/frame/3/symm/bli_symm.h b/frame/3/symm/bli_symm.h index 5f9d7d558..d5165e5b2 100644 --- a/frame/3/symm/bli_symm.h +++ b/frame/3/symm/bli_symm.h @@ -35,6 +35,9 @@ #include "bli_symm_check.h" #include "bli_symm_front.h" +#include "bli_symm4m.h" +#include "bli_symm3m.h" + // // Prototype object-based interface. diff --git a/frame/3/syr2k/3m/bli_syr2k3m.c b/frame/3/syr2k/3m/bli_syr2k3m.c new file mode 100644 index 000000000..e5acf56c0 --- /dev/null +++ b/frame/3/syr2k/3m/bli_syr2k3m.c @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk3m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_syr2k3m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. 
+ if ( bli_obj_is_complex( *c ) ) cntl = herk3m_cntl; + else cntl = herk_cntl; + + bli_syr2k_front( alpha, a, b, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syr2k3m, syr2k3m ) + diff --git a/frame/3/syr2k/3m/bli_syr2k3m.h b/frame/3/syr2k/3m/bli_syr2k3m.h new file mode 100644 index 000000000..cdea4f196 --- /dev/null +++ b/frame/3/syr2k/3m/bli_syr2k3m.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_syr2k3m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syr2k3m ) + diff --git a/frame/3/syr2k/4m/bli_syr2k4m.c b/frame/3/syr2k/4m/bli_syr2k4m.c new file mode 100644 index 000000000..ebc1e453b --- /dev/null +++ b/frame/3/syr2k/4m/bli_syr2k4m.c @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk4m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_syr2k4m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) cntl = herk4m_cntl; + else cntl = herk_cntl; + + bli_syr2k_front( alpha, a, b, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t m_a, n_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ + bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syr2k4m, syr2k4m ) + diff --git a/frame/3/syr2k/4m/bli_syr2k4m.h b/frame/3/syr2k/4m/bli_syr2k4m.h new file mode 100644 index 000000000..45df1a95e --- /dev/null +++ b/frame/3/syr2k/4m/bli_syr2k4m.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_syr2k4m( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syr2k4m ) + diff --git a/frame/3/syr2k/bli_syr2k.c b/frame/3/syr2k/bli_syr2k.c index ddd853a2e..0f88529a9 100644 --- a/frame/3/syr2k/bli_syr2k.c +++ b/frame/3/syr2k/bli_syr2k.c @@ -46,6 +46,17 @@ void bli_syr2k( obj_t* alpha, obj_t* beta, obj_t* c ) { + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *c ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *c ) || +#endif + FALSE + ) + return bli_syr2k4m( alpha, a, b, beta, c ); + bli_syr2k_front( alpha, a, b, beta, c, herk_cntl ); } diff --git a/frame/3/syr2k/bli_syr2k.h b/frame/3/syr2k/bli_syr2k.h index dd12c1905..7d48b79a2 100644 --- a/frame/3/syr2k/bli_syr2k.h +++ b/frame/3/syr2k/bli_syr2k.h @@ -35,6 +35,9 @@ #include "bli_syr2k_check.h" #include "bli_syr2k_front.h" +#include "bli_syr2k4m.h" +#include "bli_syr2k3m.h" + // // Prototype object-based interface. diff --git a/frame/3/syrk/3m/bli_syrk3m.c b/frame/3/syrk/3m/bli_syrk3m.c new file mode 100644 index 000000000..7f672deb2 --- /dev/null +++ b/frame/3/syrk/3m/bli_syrk3m.c @@ -0,0 +1,102 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk3m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_syrk3m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) cntl = herk3m_cntl; + else cntl = herk_cntl; + + bli_syrk_front( alpha, a, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syrk3m, syrk3m ) + diff --git a/frame/3/syrk/3m/bli_syrk3m.h b/frame/3/syrk/3m/bli_syrk3m.h new file mode 100644 index 000000000..ba5eaa6b5 --- /dev/null +++ b/frame/3/syrk/3m/bli_syrk3m.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_syrk3m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syrk3m ) + diff --git a/frame/3/syrk/4m/bli_syrk4m.c b/frame/3/syrk/4m/bli_syrk4m.c new file mode 100644 index 000000000..a311bafd1 --- /dev/null +++ b/frame/3/syrk/4m/bli_syrk4m.c @@ -0,0 +1,102 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern herk_t* herk4m_cntl; +extern herk_t* herk_cntl; + +// +// Define object-based interface. +// +void bli_syrk4m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ) +{ + herk_t* cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) cntl = herk4m_cntl; + else cntl = herk_cntl; + + bli_syrk_front( alpha, a, beta, c, + cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, betao, co; \ +\ + dim_t m_a, n_a; \ +\ + bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploc, co ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_SYMMETRIC, co ); \ +\ + PASTEMAC0(opname)( &alphao, \ + &ao, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( syrk4m, syrk4m ) + diff --git a/frame/3/syrk/4m/bli_syrk4m.h b/frame/3/syrk/4m/bli_syrk4m.h new file mode 100644 index 000000000..75a8fcae5 --- /dev/null +++ b/frame/3/syrk/4m/bli_syrk4m.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_syrk4m( obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( syrk4m ) + diff --git a/frame/3/syrk/bli_syrk.c b/frame/3/syrk/bli_syrk.c index ec098087a..ec474b102 100644 --- a/frame/3/syrk/bli_syrk.c +++ b/frame/3/syrk/bli_syrk.c @@ -44,6 +44,17 @@ void bli_syrk( obj_t* alpha, obj_t* beta, obj_t* c ) { + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *c ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *c ) || +#endif + FALSE + ) + return bli_syrk4m( alpha, a, beta, c ); + bli_syrk_front( alpha, a, beta, c, herk_cntl ); } diff --git a/frame/3/syrk/bli_syrk.h b/frame/3/syrk/bli_syrk.h index 864d37648..2db0fa09f 100644 --- a/frame/3/syrk/bli_syrk.h +++ b/frame/3/syrk/bli_syrk.h @@ -35,6 +35,9 @@ #include "bli_syrk_check.h" 
#include "bli_syrk_front.h" +#include "bli_syrk4m.h" +#include "bli_syrk3m.h" + // // Prototype object-based interface. diff --git a/frame/3/trmm/3m/bli_trmm3m.c b/frame/3/trmm/3m/bli_trmm3m.c new file mode 100644 index 000000000..62d06b7a1 --- /dev/null +++ b/frame/3/trmm/3m/bli_trmm3m.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern trmm_t* trmm3m_l_cntl; +extern trmm_t* trmm3m_r_cntl; +extern trmm_t* trmm_l_cntl; +extern trmm_t* trmm_r_cntl; + +// +// Define object-based interface. +// +void bli_trmm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ) +{ + trmm_t* l_cntl; + trmm_t* r_cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *b ) ) { l_cntl = trmm3m_l_cntl; r_cntl = trmm3m_r_cntl; } + else { l_cntl = trmm_l_cntl; r_cntl = trmm_r_cntl; } + + bli_trmm_front( side, alpha, a, b, + l_cntl, + r_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo; \ +\ + dim_t mn_a; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, n, b, rs_b, cs_b, &bo ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo ); \ +} + +INSERT_GENTFUNC_BASIC( trmm3m, trmm3m ) + diff --git a/frame/3/trmm/3m/bli_trmm3m.h b/frame/3/trmm/3m/bli_trmm3m.h new file mode 100644 index 000000000..846140b3f --- /dev/null +++ b/frame/3/trmm/3m/bli_trmm3m.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trmm3m_cntl.h" + + +// +// Prototype object-based interface. +// +void bli_trmm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ); + +INSERT_GENTPROT_BASIC( trmm3m ) + diff --git a/frame/3/trmm/3m/bli_trmm3m_cntl.c b/frame/3/trmm/3m/bli_trmm3m_cntl.c new file mode 100644 index 000000000..d5291f08e --- /dev/null +++ b/frame/3/trmm/3m/bli_trmm3m_cntl.c @@ -0,0 +1,262 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern blksz_t* gemm3m_mc; +extern blksz_t* gemm3m_nc; +extern blksz_t* gemm3m_kc; +extern blksz_t* gemm3m_mr; +extern blksz_t* gemm3m_nr; +extern blksz_t* gemm3m_kr; + +extern func_t* gemm3m_ukrs; + +extern gemm_t* gemm3m_cntl_bp_ke; + +packm_t* trmm3m_l_packa_cntl; +packm_t* trmm3m_l_packb_cntl; + +packm_t* trmm3m_r_packa_cntl; +packm_t* trmm3m_r_packb_cntl; + +trmm_t* trmm3m_cntl_bp_ke; + +trmm_t* trmm3m_l_cntl_op_bp; +trmm_t* trmm3m_l_cntl_mm_op; +trmm_t* trmm3m_l_cntl_vl_mm; + +trmm_t* trmm3m_r_cntl_op_bp; +trmm_t* trmm3m_r_cntl_mm_op; +trmm_t* trmm3m_r_cntl_vl_mm; + +trmm_t* trmm3m_l_cntl; +trmm_t* trmm3m_r_cntl; + + +void bli_trmm3m_cntl_init() +{ + // Create control tree objects for packm operations (left side). + trmm3m_l_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + // IMPORTANT: for consistency with trsm, "k" dim + // multiple is set to mr. + gemm3m_mr, + gemm3m_mr, + TRUE, // densify + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_3M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trmm3m_l_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + // IMPORTANT: m dim multiple here must be mr + // since "k" dim multiple is set to mr above. 
+ gemm3m_mr, + gemm3m_nr, + FALSE, // already dense + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_3M, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (right side). + trmm3m_r_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + // IMPORTANT: for consistency with trsm, "k" dim + // multiple is set to nr. + gemm3m_mr, + gemm3m_nr, + FALSE, // already dense + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_3M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trmm3m_r_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + // IMPORTANT: m dim multiple here must be nr + // since "k" dim multiple is set to nr above. + gemm3m_nr, + gemm3m_nr, + TRUE, // densify + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_3M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + trmm3m_cntl_bp_ke + = + bli_trmm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm3m_ukrs, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (left side). + trmm3m_l_cntl_op_bp + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3m_mc, + NULL, + NULL, + trmm3m_l_packa_cntl, + trmm3m_l_packb_cntl, + NULL, + trmm3m_cntl_bp_ke, + gemm3m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (left side). 
+ trmm3m_l_cntl_mm_op + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm3m_l_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (left side). + trmm3m_l_cntl_vl_mm + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm3m_l_cntl_mm_op, + NULL, + NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (right side). + trmm3m_r_cntl_op_bp + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3m_mc, + NULL, + NULL, + trmm3m_r_packa_cntl, + trmm3m_r_packb_cntl, + NULL, + trmm3m_cntl_bp_ke, + gemm3m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (right side). + trmm3m_r_cntl_mm_op + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm3m_r_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (right side). + trmm3m_r_cntl_vl_mm + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm3m_r_cntl_mm_op, + NULL, + NULL ); + + // Alias the "master" trmm control trees to shorter names. 
+ trmm3m_l_cntl = trmm3m_l_cntl_vl_mm; + trmm3m_r_cntl = trmm3m_r_cntl_vl_mm; +} + +void bli_trmm3m_cntl_finalize() +{ + bli_cntl_obj_free( trmm3m_l_packa_cntl ); + bli_cntl_obj_free( trmm3m_l_packb_cntl ); + bli_cntl_obj_free( trmm3m_r_packa_cntl ); + bli_cntl_obj_free( trmm3m_r_packb_cntl ); + + bli_cntl_obj_free( trmm3m_cntl_bp_ke ); + + bli_cntl_obj_free( trmm3m_l_cntl_op_bp ); + bli_cntl_obj_free( trmm3m_l_cntl_mm_op ); + bli_cntl_obj_free( trmm3m_l_cntl_vl_mm ); + bli_cntl_obj_free( trmm3m_r_cntl_op_bp ); + bli_cntl_obj_free( trmm3m_r_cntl_mm_op ); + bli_cntl_obj_free( trmm3m_r_cntl_vl_mm ); +} + diff --git a/frame/3/trmm/3m/bli_trmm3m_cntl.h b/frame/3/trmm/3m/bli_trmm3m_cntl.h new file mode 100644 index 000000000..eac804f32 --- /dev/null +++ b/frame/3/trmm/3m/bli_trmm3m_cntl.h @@ -0,0 +1,36 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trmm3m_cntl_init( void ); +void bli_trmm3m_cntl_finalize( void ); diff --git a/frame/3/trmm/4m/bli_trmm4m.c b/frame/3/trmm/4m/bli_trmm4m.c new file mode 100644 index 000000000..64586ec67 --- /dev/null +++ b/frame/3/trmm/4m/bli_trmm4m.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern trmm_t* trmm4m_l_cntl; +extern trmm_t* trmm4m_r_cntl; +extern trmm_t* trmm_l_cntl; +extern trmm_t* trmm_r_cntl; + +// +// Define object-based interface. +// +void bli_trmm4m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ) +{ + trmm_t* l_cntl; + trmm_t* r_cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *b ) ) { l_cntl = trmm4m_l_cntl; r_cntl = trmm4m_r_cntl; } + else { l_cntl = trmm_l_cntl; r_cntl = trmm_r_cntl; } + + bli_trmm_front( side, alpha, a, b, + l_cntl, + r_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo; \ +\ + dim_t mn_a; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, n, b, rs_b, cs_b, &bo ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo ); \ +} + +INSERT_GENTFUNC_BASIC( trmm4m, trmm4m ) + diff --git a/frame/3/trmm/4m/bli_trmm4m.h b/frame/3/trmm/4m/bli_trmm4m.h new file mode 100644 index 000000000..1f86b0ea6 --- /dev/null +++ b/frame/3/trmm/4m/bli_trmm4m.h @@ -0,0 +1,66 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trmm4m_cntl.h" + + +// +// Prototype object-based interface. +// +void bli_trmm4m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ); + +INSERT_GENTPROT_BASIC( trmm4m ) + diff --git a/frame/3/trmm/4m/bli_trmm4m_cntl.c b/frame/3/trmm/4m/bli_trmm4m_cntl.c new file mode 100644 index 000000000..3f98bccfe --- /dev/null +++ b/frame/3/trmm/4m/bli_trmm4m_cntl.c @@ -0,0 +1,262 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern blksz_t* gemm4m_mc; +extern blksz_t* gemm4m_nc; +extern blksz_t* gemm4m_kc; +extern blksz_t* gemm4m_mr; +extern blksz_t* gemm4m_nr; +extern blksz_t* gemm4m_kr; + +extern func_t* gemm4m_ukrs; + +extern gemm_t* gemm4m_cntl_bp_ke; + +packm_t* trmm4m_l_packa_cntl; +packm_t* trmm4m_l_packb_cntl; + +packm_t* trmm4m_r_packa_cntl; +packm_t* trmm4m_r_packb_cntl; + +trmm_t* trmm4m_cntl_bp_ke; + +trmm_t* trmm4m_l_cntl_op_bp; +trmm_t* trmm4m_l_cntl_mm_op; +trmm_t* trmm4m_l_cntl_vl_mm; + +trmm_t* trmm4m_r_cntl_op_bp; +trmm_t* trmm4m_r_cntl_mm_op; +trmm_t* trmm4m_r_cntl_vl_mm; + +trmm_t* trmm4m_l_cntl; +trmm_t* trmm4m_r_cntl; + + +void bli_trmm4m_cntl_init() +{ + // Create control tree objects for packm operations (left side). 
+ trmm4m_l_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + // IMPORTANT: for consistency with trsm, "k" dim + // multiple is set to mr. + gemm4m_mr, + gemm4m_mr, + TRUE, // densify + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_4M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trmm4m_l_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + // IMPORTANT: m dim multiple here must be mr + // since "k" dim multiple is set to mr above. + gemm4m_mr, + gemm4m_nr, + FALSE, // already dense + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_4M, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (right side). + trmm4m_r_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + // IMPORTANT: for consistency with trsm, "k" dim + // multiple is set to nr. + gemm4m_mr, + gemm4m_nr, + FALSE, // already dense + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_4M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trmm4m_r_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + // IMPORTANT: m dim multiple here must be nr + // since "k" dim multiple is set to nr above. + gemm4m_nr, + gemm4m_nr, + TRUE, // densify + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_4M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + trmm4m_cntl_bp_ke + = + bli_trmm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm4m_ukrs, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (left side). 
+ trmm4m_l_cntl_op_bp + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4m_mc, + NULL, + NULL, + trmm4m_l_packa_cntl, + trmm4m_l_packb_cntl, + NULL, + trmm4m_cntl_bp_ke, + gemm4m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (left side). + trmm4m_l_cntl_mm_op + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm4m_l_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (left side). + trmm4m_l_cntl_vl_mm + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm4m_l_cntl_mm_op, + NULL, + NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (right side). + trmm4m_r_cntl_op_bp + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4m_mc, + NULL, + NULL, + trmm4m_r_packa_cntl, + trmm4m_r_packb_cntl, + NULL, + trmm4m_cntl_bp_ke, + gemm4m_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (right side). + trmm4m_r_cntl_mm_op + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4m_kc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm4m_r_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (right side). + trmm4m_r_cntl_vl_mm + = + bli_trmm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4m_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + trmm4m_r_cntl_mm_op, + NULL, + NULL ); + + // Alias the "master" trmm control trees to shorter names. 
+ trmm4m_l_cntl = trmm4m_l_cntl_vl_mm; + trmm4m_r_cntl = trmm4m_r_cntl_vl_mm; +} + +void bli_trmm4m_cntl_finalize() +{ + bli_cntl_obj_free( trmm4m_l_packa_cntl ); + bli_cntl_obj_free( trmm4m_l_packb_cntl ); + bli_cntl_obj_free( trmm4m_r_packa_cntl ); + bli_cntl_obj_free( trmm4m_r_packb_cntl ); + + bli_cntl_obj_free( trmm4m_cntl_bp_ke ); + + bli_cntl_obj_free( trmm4m_l_cntl_op_bp ); + bli_cntl_obj_free( trmm4m_l_cntl_mm_op ); + bli_cntl_obj_free( trmm4m_l_cntl_vl_mm ); + bli_cntl_obj_free( trmm4m_r_cntl_op_bp ); + bli_cntl_obj_free( trmm4m_r_cntl_mm_op ); + bli_cntl_obj_free( trmm4m_r_cntl_vl_mm ); +} + diff --git a/frame/3/trmm/4m/bli_trmm4m_cntl.h b/frame/3/trmm/4m/bli_trmm4m_cntl.h new file mode 100644 index 000000000..68f7fbff2 --- /dev/null +++ b/frame/3/trmm/4m/bli_trmm4m_cntl.h @@ -0,0 +1,36 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trmm4m_cntl_init( void ); +void bli_trmm4m_cntl_finalize( void ); diff --git a/frame/3/trmm/bli_trmm.c b/frame/3/trmm/bli_trmm.c index 9a97acbd0..b44a1c061 100644 --- a/frame/3/trmm/bli_trmm.c +++ b/frame/3/trmm/bli_trmm.c @@ -45,7 +45,17 @@ void bli_trmm( side_t side, obj_t* a, obj_t* b ) { - // Invoke the internal back-end. + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *b ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *b ) || +#endif + FALSE + ) + { bli_trmm4m( side, alpha, a, b ); return; } /* Dispatch to the 4m front-end; both functions are void, so call and then return without a value. */ + bli_trmm_front( side, alpha, a, b, trmm_l_cntl, trmm_r_cntl ); diff --git a/frame/3/trmm/bli_trmm.h b/frame/3/trmm/bli_trmm.h index c53893245..f64a89b28 100644 --- a/frame/3/trmm/bli_trmm.h +++ b/frame/3/trmm/bli_trmm.h @@ -51,6 +51,9 @@ #include "bli_trmm_rl_ker_var2.h" #include "bli_trmm_ru_ker_var2.h" +#include "bli_trmm4m.h" +#include "bli_trmm3m.h" + // // Prototype object-based interface. diff --git a/frame/3/trmm/bli_trmm_cntl.c b/frame/3/trmm/bli_trmm_cntl.c index a6f1acc5d..a1a70ee22 100644 --- a/frame/3/trmm/bli_trmm_cntl.c +++ b/frame/3/trmm/bli_trmm_cntl.c @@ -73,7 +73,7 @@ void bli_trmm_cntl_init() trmm_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // pack panels of A compactly + BLIS_VARIANT1, // IMPORTANT: for consistency with trsm, "k" dim // multiple is set to mr.
gemm_mr, @@ -119,7 +119,7 @@ void bli_trmm_cntl_init() trmm_r_packb_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // pack panels of B compactly + BLIS_VARIANT1, // IMPORTANT: m dim multiple here must be nr // since "k" dim multiple is set to nr above. gemm_nr, diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index a808d423b..01fc281ee 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -105,6 +105,16 @@ void bli_trmm_ll_ker_var2( obj_t* a, // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. @@ -154,7 +164,6 @@ void PASTEMAC(ch,varname)( \ /* Alias some constants to simpler names. 
*/ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ ctype* restrict one = PASTEMAC(ch,1); \ @@ -168,6 +177,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -178,6 +188,7 @@ void PASTEMAC(ch,varname)( \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ + inc_t ss_a; \ auxinfo_t aux; \ \ /* @@ -200,6 +211,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If the current block of A is entirely above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix A, which is + usually PACKMR. However, in the case of 3m, the storage stride + captures the (PACKMR * 3/2) factor embedded in the panel stride. + Notice that we must first inflate k up to a multiple of MR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ + ss_a = ps_a / k_full; \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as @@ -290,7 +310,7 @@ void PASTEMAC(ch,varname)( \ b1_i = b1 + off_a1011 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1011 * PACKMR; \ + a2 = a1 + k_a1011 * ss_a; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ @@ -306,7 +326,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the panel stride of the current panel of A to the auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( k_a1011 * PACKMR, aux ); \ + bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ \ /* Handle interior and edge cases separately. 
*/ \ if ( m_cur == MR && n_cur == NR ) \ @@ -342,7 +362,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1011 * PACKMR; \ + a1 += k_a1011 * ss_a; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index c914bfe6e..867809da0 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -105,6 +105,16 @@ void bli_trmm_lu_ker_var2( obj_t* a, // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. @@ -154,7 +164,6 @@ void PASTEMAC(ch,varname)( \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ ctype* restrict one = PASTEMAC(ch,1); \ @@ -168,6 +177,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -178,6 +188,7 @@ void PASTEMAC(ch,varname)( \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ + inc_t ss_a; \ auxinfo_t aux; \ \ /* @@ -200,6 +211,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If the current block of A is entirely below the diagonal, it is implicitly zero. 
So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix A, which is + usually PACKMR. However, in the case of 3m, the storage stride + captures the (PACKMR * 3/2) factor embedded in the panel stride. + Notice that we must first inflate k up to a multiple of MR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ + ss_a = ps_a / k_full; \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and @@ -298,7 +318,7 @@ void PASTEMAC(ch,varname)( \ b1_i = b1 + off_a1112 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1112 * PACKMR; \ + a2 = a1 + k_a1112 * ss_a; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ @@ -314,7 +334,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the panel stride of the current panel of A to the auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( k_a1112 * PACKMR, aux ); \ + bli_auxinfo_set_ps_a( k_a1112 * ss_a, aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ @@ -350,7 +370,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1112 * PACKMR; \ + a1 += k_a1112 * ss_a; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index ae628ee60..ae4b4b1d2 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -105,6 +105,16 @@ void bli_trmm_rl_ker_var2( obj_t* a, // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. 
This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. @@ -155,7 +165,6 @@ void PASTEMAC(ch,varname)( \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ @@ -168,6 +177,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -178,6 +188,7 @@ void PASTEMAC(ch,varname)( \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ + inc_t ss_b; \ auxinfo_t aux; \ \ /* @@ -200,6 +211,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If the current panel of B is entirely above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix B, which is + usually PACKNR. However, in the case of 3m, the storage stride + captures the (PACKNR * 3/2) factor embedded in the panel stride. + Notice that we must first inflate k up to a multiple of NR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % NR != 0 ? 
k + NR - ( k % NR ) : k ); \ + ss_b = ps_b / k_full; \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this @@ -282,7 +302,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the panel stride of the current panel of B to the auxinfo_t object. */ \ - bli_auxinfo_set_ps_b( k_b1121 * PACKNR, aux ); \ + bli_auxinfo_set_ps_b( k_b1121 * ss_b, aux ); \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. @@ -305,7 +325,7 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b1121 * PACKNR; \ + b2 = b1 + k_b1121 * ss_b; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -411,7 +431,7 @@ void PASTEMAC(ch,varname)( \ } \ } \ \ - b1 += k_b1121 * PACKNR; \ + b1 += k_b1121 * ss_b; \ c1 += cstep_c; \ } \ \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index e199004b5..57d112ce5 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -105,6 +105,16 @@ void bli_trmm_ru_ker_var2( obj_t* a, // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t object containing // the gemm micro-kernel function addresses, and then query the // function address corresponding to the current datatype. 
@@ -155,7 +165,6 @@ void PASTEMAC(ch,varname)( \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ @@ -168,6 +177,7 @@ void PASTEMAC(ch,varname)( \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -178,6 +188,7 @@ void PASTEMAC(ch,varname)( \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ + inc_t ss_b; \ auxinfo_t aux; \ \ /* @@ -200,6 +211,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix B, which is + usually PACKNR. However, in the case of 3m, the storage stride + captures the (PACKNR * 3/2) factor embedded in the panel stride. + Notice that we must first inflate k up to a multiple of NR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ + ss_b = ps_b / k_full; \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and @@ -282,7 +302,7 @@ void PASTEMAC(ch,varname)( \ \ /* Save the panel stride of the current panel of B to the auxinfo_t object. */ \ - bli_auxinfo_set_ps_b( k_b0111 * PACKNR, aux ); \ + bli_auxinfo_set_ps_b( k_b0111 * ss_b, aux ); \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. 
@@ -305,7 +325,7 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b0111 * PACKNR; \ + b2 = b1 + k_b0111 * ss_b; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -411,7 +431,7 @@ void PASTEMAC(ch,varname)( \ } \ } \ \ - b1 += k_b0111 * PACKNR; \ + b1 += k_b0111 * ss_b; \ c1 += cstep_c; \ } \ \ diff --git a/frame/3/trmm3/3m/bli_trmm33m.c b/frame/3/trmm3/3m/bli_trmm33m.c new file mode 100644 index 000000000..6aa0bc59b --- /dev/null +++ b/frame/3/trmm3/3m/bli_trmm33m.c @@ -0,0 +1,119 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern trmm_t* trmm3m_l_cntl; +extern trmm_t* trmm3m_r_cntl; +extern trmm_t* trmm_l_cntl; +extern trmm_t* trmm_r_cntl; + +// +// Define object-based interface. +// +void bli_trmm33m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + trmm_t* l_cntl; + trmm_t* r_cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) { l_cntl = trmm3m_l_cntl; r_cntl = trmm3m_r_cntl; } + else { l_cntl = trmm_l_cntl; r_cntl = trmm_r_cntl; } + + bli_trmm3_front( side, alpha, a, b, beta, c, + l_cntl, + r_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( trmm33m, trmm33m ) + diff --git a/frame/3/trmm3/3m/bli_trmm33m.h b/frame/3/trmm3/3m/bli_trmm33m.h new file mode 100644 index 000000000..1038aa00c --- /dev/null +++ b/frame/3/trmm3/3m/bli_trmm33m.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_trmm33m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( trmm33m ) + diff --git a/frame/3/trmm3/4m/bli_trmm34m.c b/frame/3/trmm3/4m/bli_trmm34m.c new file mode 100644 index 000000000..e9d6c54f5 --- /dev/null +++ b/frame/3/trmm3/4m/bli_trmm34m.c @@ -0,0 +1,119 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern trmm_t* trmm4m_l_cntl; +extern trmm_t* trmm4m_r_cntl; +extern trmm_t* trmm_l_cntl; +extern trmm_t* trmm_r_cntl; + +// +// Define object-based interface. +// +void bli_trmm34m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ) +{ + trmm_t* l_cntl; + trmm_t* r_cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *c ) ) { l_cntl = trmm4m_l_cntl; r_cntl = trmm4m_r_cntl; } + else { l_cntl = trmm_l_cntl; r_cntl = trmm_r_cntl; } + + bli_trmm3_front( side, alpha, a, b, beta, c, + l_cntl, + r_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo, betao, co; \ +\ + dim_t mn_a; \ + dim_t m_b, n_b; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ + bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ + bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ + bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ + bli_obj_set_conjtrans( transb, bo ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo, \ + &betao, \ + &co ); \ +} + +INSERT_GENTFUNC_BASIC( trmm34m, trmm34m ) + diff --git a/frame/3/trmm3/4m/bli_trmm34m.h b/frame/3/trmm3/4m/bli_trmm34m.h new file mode 100644 index 000000000..469620296 --- /dev/null +++ b/frame/3/trmm3/4m/bli_trmm34m.h @@ -0,0 +1,69 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_trmm34m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( trmm34m ) + diff --git a/frame/3/trmm3/bli_trmm3.c b/frame/3/trmm3/bli_trmm3.c index db58325db..9a915fd53 100644 --- a/frame/3/trmm3/bli_trmm3.c +++ b/frame/3/trmm3/bli_trmm3.c @@ -47,6 +47,17 @@ void bli_trmm3( side_t side, obj_t* beta, obj_t* c ) { + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *c ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *c ) || +#endif + FALSE + ) + return bli_trmm34m( side, alpha, a, b, beta, c ); + bli_trmm3_front( side, alpha, a, b, beta, c, trmm_l_cntl, trmm_r_cntl ); diff --git a/frame/3/trmm3/bli_trmm3.h b/frame/3/trmm3/bli_trmm3.h index 483c76a26..dba409cb5 100644 --- a/frame/3/trmm3/bli_trmm3.h +++ b/frame/3/trmm3/bli_trmm3.h @@ -35,6 +35,9 @@ #include "bli_trmm3_check.h" #include "bli_trmm3_front.h" +#include "bli_trmm34m.h" +#include "bli_trmm33m.h" + // // Prototype object-based interface. diff --git a/frame/3/trsm/3m/bli_trsm3m.c b/frame/3/trsm/3m/bli_trsm3m.c new file mode 100644 index 000000000..307bef013 --- /dev/null +++ b/frame/3/trsm/3m/bli_trsm3m.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern trsm_t* trsm3m_l_cntl; +extern trsm_t* trsm3m_r_cntl; +extern trsm_t* trsm_l_cntl; +extern trsm_t* trsm_r_cntl; + +// +// Define object-based interface. +// +void bli_trsm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ) +{ + trsm_t* l_cntl; + trsm_t* r_cntl; + + // Since 3m only applies to the complex domain, we use the regular + // control tree for real domain cases. + if ( bli_obj_is_complex( *b ) ) { l_cntl = trsm3m_l_cntl; r_cntl = trsm3m_r_cntl; } + else { l_cntl = trsm_l_cntl; r_cntl = trsm_r_cntl; } + + bli_trsm_front( side, alpha, a, b, + l_cntl, + r_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo; \ +\ + dim_t mn_a; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, n, b, rs_b, cs_b, &bo ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo ); \ +} + +INSERT_GENTFUNC_BASIC( trsm3m, trsm3m ) + diff --git a/frame/3/trsm/3m/bli_trsm3m.h b/frame/3/trsm/3m/bli_trsm3m.h new file mode 100644 index 000000000..81b7b5601 --- /dev/null +++ b/frame/3/trsm/3m/bli_trsm3m.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trsm3m_cntl.h" + +#include "bli_gemmtrsm3m_l_ukr_ref.h" +#include "bli_gemmtrsm3m_u_ukr_ref.h" + +#include "bli_trsm3m_l_ukr_ref.h" +#include "bli_trsm3m_u_ukr_ref.h" + + +// +// Prototype object-based interface. +// +void bli_trsm3m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ); + +INSERT_GENTPROT_BASIC( trsm3m ) + diff --git a/frame/3/trsm/3m/bli_trsm3m_cntl.c b/frame/3/trsm/3m/bli_trsm3m_cntl.c new file mode 100644 index 000000000..52ade0617 --- /dev/null +++ b/frame/3/trsm/3m/bli_trsm3m_cntl.c @@ -0,0 +1,278 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern blksz_t* gemm3m_mc; +extern blksz_t* gemm3m_nc; +extern blksz_t* gemm3m_kc; +extern blksz_t* gemm3m_mr; +extern blksz_t* gemm3m_nr; +extern blksz_t* gemm3m_kr; + +extern func_t* gemm3m_ukrs; + +func_t* gemmtrsm3m_l_ukrs; +func_t* gemmtrsm3m_u_ukrs; + +packm_t* trsm3m_l_packa_cntl; +packm_t* trsm3m_l_packb_cntl; + +packm_t* trsm3m_r_packa_cntl; +packm_t* trsm3m_r_packb_cntl; + +trsm_t* trsm3m_cntl_bp_ke; + +trsm_t* trsm3m_l_cntl_op_bp; +trsm_t* trsm3m_l_cntl_mm_op; +trsm_t* trsm3m_l_cntl_vl_mm; + +trsm_t* trsm3m_r_cntl_op_bp; +trsm_t* trsm3m_r_cntl_mm_op; +trsm_t* trsm3m_r_cntl_vl_mm; + +trsm_t* trsm3m_l_cntl; +trsm_t* trsm3m_r_cntl; + + +void bli_trsm3m_cntl_init() +{ + + // Create function pointer objects for each datatype-specific + // gemmtrsm3m_l and gemmtrsm3m_u micro-kernel. + gemmtrsm3m_l_ukrs = bli_func_obj_create( NULL, + NULL, + BLIS_CGEMMTRSM3M_L_UKERNEL, + BLIS_ZGEMMTRSM3M_L_UKERNEL ); + + gemmtrsm3m_u_ukrs = bli_func_obj_create( NULL, + NULL, + BLIS_CGEMMTRSM3M_U_UKERNEL, + BLIS_ZGEMMTRSM3M_U_UKERNEL ); + + + // Create control tree objects for packm operations (left side). + trsm3m_l_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + // IMPORTANT: n dim multiple must be mr to + // support right and bottom-right edge cases + gemm3m_mr, + gemm3m_mr, + TRUE, // densify + TRUE, // invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_3M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm3m_l_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + // IMPORTANT: m dim multiple must be mr since + // B_pack is updated (ie: serves as C) in trsm + gemm3m_mr, + gemm3m_nr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? 
+ BLIS_PACKED_COL_PANELS_3M, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (right side). + trsm3m_r_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_nr, + gemm3m_mr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_3M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm3m_r_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_mr, + gemm3m_mr, + TRUE, // densify + TRUE, // invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_3M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + trsm3m_cntl_bp_ke + = + bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm3m_ukrs, + gemmtrsm3m_l_ukrs, + gemmtrsm3m_u_ukrs, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (left side). + trsm3m_l_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3m_mc, + NULL, NULL, NULL, + NULL, + trsm3m_l_packa_cntl, + trsm3m_l_packb_cntl, + NULL, + trsm3m_cntl_bp_ke, + NULL, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (left side). + trsm3m_l_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm3m_l_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (left side). 
+ trsm3m_l_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3m_nc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm3m_l_cntl_mm_op, + NULL, + NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (right side). + trsm3m_r_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm3m_mc, + NULL, NULL, NULL, + NULL, + trsm3m_r_packa_cntl, + trsm3m_r_packb_cntl, + NULL, + trsm3m_cntl_bp_ke, + NULL, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (right side). + trsm3m_r_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm3m_kc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm3m_r_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (right side). + trsm3m_r_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm3m_nc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm3m_r_cntl_mm_op, + NULL, + NULL ); + + // Alias the "master" trsm control trees to shorter names. 
+ trsm3m_l_cntl = trsm3m_l_cntl_vl_mm; + trsm3m_r_cntl = trsm3m_r_cntl_vl_mm; +} + +void bli_trsm3m_cntl_finalize() +{ + bli_func_obj_free( gemmtrsm3m_l_ukrs ); + bli_func_obj_free( gemmtrsm3m_u_ukrs ); + + bli_cntl_obj_free( trsm3m_l_packa_cntl ); + bli_cntl_obj_free( trsm3m_l_packb_cntl ); + bli_cntl_obj_free( trsm3m_r_packa_cntl ); + bli_cntl_obj_free( trsm3m_r_packb_cntl ); + + bli_cntl_obj_free( trsm3m_cntl_bp_ke ); + + bli_cntl_obj_free( trsm3m_l_cntl_op_bp ); + bli_cntl_obj_free( trsm3m_l_cntl_mm_op ); + bli_cntl_obj_free( trsm3m_l_cntl_vl_mm ); + bli_cntl_obj_free( trsm3m_r_cntl_op_bp ); + bli_cntl_obj_free( trsm3m_r_cntl_mm_op ); + bli_cntl_obj_free( trsm3m_r_cntl_vl_mm ); +} + diff --git a/frame/3/trsm/3m/bli_trsm3m_cntl.h b/frame/3/trsm/3m/bli_trsm3m_cntl.h new file mode 100644 index 000000000..495ecaf41 --- /dev/null +++ b/frame/3/trsm/3m/bli_trsm3m_cntl.h @@ -0,0 +1,36 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trsm3m_cntl_init( void ); +void bli_trsm3m_cntl_finalize( void ); diff --git a/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_l_ukr_ref.c b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_l_ukr_ref.c new file mode 100644 index 000000000..c32e2b527 --- /dev/null +++ b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_l_ukr_ref.c @@ -0,0 +1,211 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr, trsmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a10, \ + ctype* restrict a11, \ + ctype* restrict b01, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ab_r[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + ctype_r ab_i[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = PASTEMAC(chr,mr); \ +\ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = ( bli_auxinfo_ps_a( data ) * 2 ) / 3; \ + const inc_t ps_b = ( bli_auxinfo_ps_b( data ) * 2 ) / 3; \ +\ + ctype_r* restrict a10_r = ( ctype_r* )a10; \ + ctype_r* restrict a10_i = ( ctype_r* )a10 + ps_a; \ + ctype_r* restrict a10_ri = ( ctype_r* )a10 + 2*ps_a; \ +\ + ctype_r* restrict a11_r = ( ctype_r* )a11; \ +\ + ctype_r* restrict b01_r = ( ctype_r* )b01; \ + ctype_r* restrict b01_i = ( ctype_r* )b01 + ps_b; \ + ctype_r* restrict b01_ri = ( ctype_r* )b01 + 2*ps_b; \ +\ + ctype_r* restrict b11_r = ( ctype_r* )b11; \ + ctype_r* restrict b11_i = ( ctype_r* )b11 + ps_b; \ + ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*ps_b; \ +\ + const inc_t rs_b = 
PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + void* a_next = bli_auxinfo_next_a( data ); \ + void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t i, j; \ +\ +\ + /* If alpha has a non-zero imaginary component, pre-scale b11 by alpha. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + { \ + /* We can handle a non-zero imaginary component on alpha, but to do + so we have to manually scale b and then use alpha == 1 for the + micro-kernel calls. */ \ + for ( i = 0; i < m; ++i ) \ + for ( j = 0; j < n; ++j ) \ + PASTEMAC(ch,scalris)( *alpha_r, \ + *alpha_i, \ + *(b11_r + i*rs_b + j*cs_b), \ + *(b11_i + i*rs_b + j*cs_b) ); \ +\ + /* Use alpha.r == 1.0. */ \ + alpha_r = one_r; \ + } \ +\ +\ + /* b11.r = alpha.r * b11.r - ( + a10.r * b01.r - a10.i * b01.i ); + b11.i = alpha.r * b11.i - ( a10.ri * b01.ri - a10.r * b01.r - a10.i * b01.i ); */ \ +\ + bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ +\ + /* ab.r = a10.r * b01.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + one_r, \ + a10_r, \ + b01_r, \ + zero_r, \ + ab_r, rs_ab, cs_ab, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a10_ri, b01_ri, *data ); \ +\ + /* ab.i = a10.i * b01.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + one_r, \ + a10_i, \ + b01_i, \ + zero_r, \ + ab_i, rs_ab, cs_ab, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ +\ + /* b11.i = alpha.r * b11.i - a10.ri * b01.ri; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a10_ri, \ + b01_ri, \ + alpha_r, \ + b11_i, rs_b, cs_b, \ + data ); \ +\ +\ + /* b11.r = alpha.r * b11.r - ab.r; + b11.r = b11.r + ab.i; + b11.i = b11.i + ab.r; + b11.i = b11.i + ab.i; */ \ + for ( i = 0; i < m; ++i ) \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ + 
ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ + ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ + ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ +\ + PASTEMAC(chr,scals)( *alpha_r, beta11_r ); \ +\ + PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ + PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ + PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ + PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,copyris)( beta11_r, \ + beta11_i, \ + *(b11_r + i*rs_b + j*cs_b), \ + *(b11_i + i*rs_b + j*cs_b) ); \ +\ + /* Update the ri part of b11. */ \ + PASTEMAC(chr,add3s)( beta11_r, \ + beta11_i, \ + *(b11_ri + i*rs_b + j*cs_b) ); \ + } \ +\ +\ + /* b11 = inv(a11) * b11; + c11 = b11; */ \ + PASTEMAC(ch,trsmukr)( a11_r, \ + b11_r, \ + c11, rs_c, cs_c, \ + data ); \ +\ +\ +/* +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m_l_ukr: b11_r after", m, n, \ + b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m_l_ukr: b11_i after", m, n, \ + b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +*/ \ +/* +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m_l_ukr: b01_r", k, n, \ + b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m_l_ukr: b01_i", k, n, \ + b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m_l_ukr: b11_r", m, n, \ + b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m_l_ukr: b11_i", m, n, \ + b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \ +*/ \ +} + +INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m_l_ukr_ref, GEMM_UKERNEL, TRSM3M_L_UKERNEL ) + diff --git a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.h b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_l_ukr_ref.h similarity index 94% rename from kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.h rename to frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_l_ukr_ref.h index 2f900672d..105891eb2 100644 --- 
a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.h +++ b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_l_ukr_ref.h @@ -33,8 +33,8 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ dim_t k, \ @@ -47,5 +47,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_d4x4 ) +INSERT_GENTPROTCO_BASIC( gemmtrsm3m_l_ukr_ref ) diff --git a/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_u_ukr_ref.c b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_u_ukr_ref.c new file mode 100644 index 000000000..60c3b609d --- /dev/null +++ b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_u_ukr_ref.c @@ -0,0 +1,193 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr, trsmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a12, \ + ctype* restrict a11, \ + ctype* restrict b21, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + ctype_r ab_r[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + ctype_r ab_i[ PASTEMAC(chr,mr) * \ + PASTEMAC(chr,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = PASTEMAC(chr,mr); \ +\ +\ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = ( bli_auxinfo_ps_a( data ) * 2 ) / 3; \ + const inc_t ps_b = ( bli_auxinfo_ps_b( data ) * 2 ) / 3; \ +\ + ctype_r* restrict a11_r = ( ctype_r* )a11; \ +\ + ctype_r* restrict a12_r = ( ctype_r* )a12; \ + ctype_r* restrict a12_i = ( ctype_r* )a12 + ps_a; \ + ctype_r* restrict a12_ri = ( ctype_r* )a12 + 2*ps_a; \ +\ + ctype_r* restrict b11_r = ( ctype_r* )b11; \ + ctype_r* restrict b11_i = ( ctype_r* )b11 + ps_b; \ + ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*ps_b; \ +\ + ctype_r* restrict b21_r = ( ctype_r* )b21; \ + ctype_r* restrict b21_i = ( ctype_r* )b21 + ps_b; \ + ctype_r* restrict b21_ri = ( ctype_r* )b21 + 2*ps_b; \ +\ + const inc_t rs_b = 
PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + void* a_next = bli_auxinfo_next_a( data ); \ + void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t i, j; \ +\ +\ + /* If alpha has a non-zero imaginary component, pre-scale b11 by alpha. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + { \ + /* We can handle a non-zero imaginary component on alpha, but to do + so we have to manually scale b and then use alpha == 1 for the + micro-kernel calls. */ \ + for ( i = 0; i < m; ++i ) \ + for ( j = 0; j < n; ++j ) \ + PASTEMAC(ch,scalris)( *alpha_r, \ + *alpha_i, \ + *(b11_r + i*rs_b + j*cs_b), \ + *(b11_i + i*rs_b + j*cs_b) ); \ +\ + /* Use alpha.r == 1.0. */ \ + alpha_r = one_r; \ + } \ +\ +\ + /* b11.r = alpha.r * b11.r - ( + a12.r * b21.r - a12.i * b21.i ); + b11.i = alpha.r * b11.i - ( a12.ri * b21.ri - a12.r * b21.r - a12.i * b21.i ); */ \ +\ + bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ +\ + /* ab.r = a12.r * b21.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + one_r, \ + a12_r, \ + b21_r, \ + zero_r, \ + ab_r, rs_ab, cs_ab, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a12_ri, b21_ri, *data ); \ +\ + /* ab.i = a12.i * b21.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + one_r, \ + a12_i, \ + b21_i, \ + zero_r, \ + ab_i, rs_ab, cs_ab, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ +\ + /* b11.i = alpha.r * b11.i - a12.ri * b21.ri; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a12_ri, \ + b21_ri, \ + alpha_r, \ + b11_i, rs_b, cs_b, \ + data ); \ +\ +\ + /* b11.r = alpha.r * b11.r - ab.r; + b11.r = b11.r + ab.i; + b11.i = b11.i + ab.r; + b11.i = b11.i + ab.i; */ \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \ + 
ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \ + ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \ + ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \ +\ + PASTEMAC(chr,scals)( *alpha_r, beta11_r ); \ +\ + PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \ + PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \ + PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \ + PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(ch,copyris)( beta11_r, \ + beta11_i, \ + *(b11_r + i*rs_b + j*cs_b), \ + *(b11_i + i*rs_b + j*cs_b) ); \ +\ + /* Update the ri part of b11. */ \ + PASTEMAC(chr,add3s)( beta11_r, \ + beta11_i, \ + *(b11_ri + i*rs_b + j*cs_b) ); \ + } \ +\ +\ + /* b11 = inv(a11) * b11; + c11 = b11; */ \ + PASTEMAC(ch,trsmukr)( a11_r, \ + b11_r, \ + c11, rs_c, cs_c, \ + data ); \ +} + +INSERT_GENTFUNCCO_BASIC2( gemmtrsm3m_u_ukr_ref, GEMM_UKERNEL, TRSM3M_U_UKERNEL ) + diff --git a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.h b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_u_ukr_ref.h similarity index 94% rename from kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.h rename to frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_u_ukr_ref.h index a7ba87e2e..58cc7bdd6 100644 --- a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.h +++ b/frame/3/trsm/3m/ukernels/bli_gemmtrsm3m_u_ukr_ref.h @@ -33,8 +33,8 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ dim_t k, \ @@ -47,5 +47,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_d4x4 ) +INSERT_GENTPROTCO_BASIC( gemmtrsm3m_u_ukr_ref ) diff --git a/frame/3/trsm/3m/ukernels/bli_trsm3m_l_ukr_ref.c b/frame/3/trsm/3m/ukernels/bli_trsm3m_l_ukr_ref.c new file mode 100644 index 000000000..ccb3445d8 --- /dev/null +++ b/frame/3/trsm/3m/ukernels/bli_trsm3m_l_ukr_ref.c @@ -0,0 +1,150 @@ +/* + + BLIS + An object-based framework for 
developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = ( bli_auxinfo_ps_a( data ) * 2 ) / 3; \ + const inc_t ps_b = ( bli_auxinfo_ps_b( data ) * 2 ) / 3; \ +\ + ctype_r* restrict a_r = ( ctype_r* )ar; \ + ctype_r* restrict a_i = ( ctype_r* )ar + ps_a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )br; \ + ctype_r* restrict b_i = ( ctype_r* )br + ps_b; \ + ctype_r* restrict b_ri = ( ctype_r* )br + 2*ps_b; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = PASTEMAC(chr,packmr); \ +\ + const inc_t rs_b = PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict a10t_r = a_r + (i )*rs_a + (0 )*cs_a; \ + ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ + ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j 
)*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(chr,set0s)( rho11_r ); \ + PASTEMAC(chr,set0s)( rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ + ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ + ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ + ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ + PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ +\ + /* Update the ri part of the packed panel. 
*/ \ + PASTEMAC(chr,add3s)( beta11c_r, \ + beta11c_i, \ + *beta11_ri ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm3m_l_ukr_ref ) + diff --git a/kernels/c99/3/bli_trsm_u_ref_4x4.h b/frame/3/trsm/3m/ukernels/bli_trsm3m_l_ukr_ref.h similarity index 83% rename from kernels/c99/3/bli_trsm_u_ref_4x4.h rename to frame/3/trsm/3m/ukernels/bli_trsm3m_l_ukr_ref.h index 6ae9e77e0..c5bc390ac 100644 --- a/kernels/c99/3/bli_trsm_u_ref_4x4.h +++ b/frame/3/trsm/3m/ukernels/bli_trsm3m_l_ukr_ref.h @@ -33,15 +33,15 @@ */ -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( trsm_u_ref_4x4 ) +INSERT_GENTPROTCO_BASIC( trsm3m_l_ukr_ref ) diff --git a/frame/3/trsm/3m/ukernels/bli_trsm3m_u_ukr_ref.c b/frame/3/trsm/3m/ukernels/bli_trsm3m_u_ukr_ref.c new file mode 100644 index 000000000..6133cfbad --- /dev/null +++ b/frame/3/trsm/3m/ukernels/bli_trsm3m_u_ukr_ref.c @@ -0,0 +1,150 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = ( bli_auxinfo_ps_a( data ) * 2 ) / 3; \ + const inc_t ps_b = ( bli_auxinfo_ps_b( data ) * 2 ) / 3; \ +\ + ctype_r* restrict a_r = ( ctype_r* )ar; \ + ctype_r* restrict a_i = ( ctype_r* )ar + ps_a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )br; \ + ctype_r* restrict b_i = ( ctype_r* )br + ps_b; \ + ctype_r* restrict b_ri = ( ctype_r* )br + 2*ps_b; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = PASTEMAC(chr,packmr); \ +\ + const inc_t rs_b = PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + 
ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict a12t_r = a_r + (i )*rs_a + (i+1)*cs_a; \ + ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ + ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(chr,set0s)( rho11_r ); \ + PASTEMAC(chr,set0s)( rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ + ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ + ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. 
*/ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ + PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ +\ + /* Update the ri part of the packed panel. */ \ + PASTEMAC(chr,add3s)( beta11c_r, \ + beta11c_i, \ + *beta11_ri ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm3m_u_ukr_ref ) + diff --git a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h b/frame/3/trsm/3m/ukernels/bli_trsm3m_u_ukr_ref.h similarity index 82% rename from frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h rename to frame/3/trsm/3m/ukernels/bli_trsm3m_u_ukr_ref.h index 99eb0ce88..2595b5caf 100644 --- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.h +++ b/frame/3/trsm/3m/ukernels/bli_trsm3m_u_ukr_ref.h @@ -33,18 +33,15 @@ */ -// -// Prototype micro-kernel interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( trsm_l_ref_mxn ) +INSERT_GENTPROTCO_BASIC( trsm3m_u_ukr_ref ) diff --git a/frame/3/trsm/4m/bli_trsm4m.c b/frame/3/trsm/4m/bli_trsm4m.c new file mode 100644 index 000000000..011594010 --- /dev/null +++ b/frame/3/trsm/4m/bli_trsm4m.c @@ -0,0 +1,107 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern trsm_t* trsm4m_l_cntl; +extern trsm_t* trsm4m_r_cntl; +extern trsm_t* trsm_l_cntl; +extern trsm_t* trsm_r_cntl; + +// +// Define object-based interface. +// +void bli_trsm4m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ) +{ + trsm_t* l_cntl; + trsm_t* r_cntl; + + // Since 4m only applies to the complex domain, we use the regular + // control tree for real domain cases. 
+ if ( bli_obj_is_complex( *b ) ) { l_cntl = trsm4m_l_cntl; r_cntl = trsm4m_r_cntl; } + else { l_cntl = trsm_l_cntl; r_cntl = trsm_r_cntl; } + + bli_trsm_front( side, alpha, a, b, + l_cntl, + r_cntl ); +} + +// +// Define BLAS-like interfaces with homogeneous-typed operands. +// +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, varname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + obj_t alphao, ao, bo; \ +\ + dim_t mn_a; \ +\ + bli_set_dim_with_side( side, m, n, mn_a ); \ +\ + bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ +\ + bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ + bli_obj_create_with_attached_buffer( dt, m, n, b, rs_b, cs_b, &bo ); \ +\ + bli_obj_set_uplo( uploa, ao ); \ + bli_obj_set_diag( diaga, ao ); \ + bli_obj_set_conjtrans( transa, ao ); \ +\ + bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \ +\ + PASTEMAC0(opname)( side, \ + &alphao, \ + &ao, \ + &bo ); \ +} + +INSERT_GENTFUNC_BASIC( trsm4m, trsm4m ) + diff --git a/frame/3/trsm/4m/bli_trsm4m.h b/frame/3/trsm/4m/bli_trsm4m.h new file mode 100644 index 000000000..06374c3c6 --- /dev/null +++ b/frame/3/trsm/4m/bli_trsm4m.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "bli_trsm4m_cntl.h" + +#include "bli_gemmtrsm4m_l_ukr_ref.h" +#include "bli_gemmtrsm4m_u_ukr_ref.h" + +#include "bli_trsm4m_l_ukr_ref.h" +#include "bli_trsm4m_u_ukr_ref.h" + + +// +// Prototype object-based interface. +// +void bli_trsm4m( side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b ); + + +// +// Prototype BLAS-like interfaces with homogeneous-typed operands. 
+// +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname)( \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ + ); + +INSERT_GENTPROT_BASIC( trsm4m ) + diff --git a/frame/3/trsm/4m/bli_trsm4m_cntl.c b/frame/3/trsm/4m/bli_trsm4m_cntl.c new file mode 100644 index 000000000..facf7334f --- /dev/null +++ b/frame/3/trsm/4m/bli_trsm4m_cntl.c @@ -0,0 +1,278 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern blksz_t* gemm4m_mc; +extern blksz_t* gemm4m_nc; +extern blksz_t* gemm4m_kc; +extern blksz_t* gemm4m_mr; +extern blksz_t* gemm4m_nr; +extern blksz_t* gemm4m_kr; + +extern func_t* gemm4m_ukrs; + +func_t* gemmtrsm4m_l_ukrs; +func_t* gemmtrsm4m_u_ukrs; + +packm_t* trsm4m_l_packa_cntl; +packm_t* trsm4m_l_packb_cntl; + +packm_t* trsm4m_r_packa_cntl; +packm_t* trsm4m_r_packb_cntl; + +trsm_t* trsm4m_cntl_bp_ke; + +trsm_t* trsm4m_l_cntl_op_bp; +trsm_t* trsm4m_l_cntl_mm_op; +trsm_t* trsm4m_l_cntl_vl_mm; + +trsm_t* trsm4m_r_cntl_op_bp; +trsm_t* trsm4m_r_cntl_mm_op; +trsm_t* trsm4m_r_cntl_vl_mm; + +trsm_t* trsm4m_l_cntl; +trsm_t* trsm4m_r_cntl; + + +void bli_trsm4m_cntl_init() +{ + + // Create function pointer objects for each datatype-specific + // gemmtrsm4m_l and gemmtrsm4m_u micro-kernel. + gemmtrsm4m_l_ukrs = bli_func_obj_create( NULL, + NULL, + BLIS_CGEMMTRSM4M_L_UKERNEL, + BLIS_ZGEMMTRSM4M_L_UKERNEL ); + + gemmtrsm4m_u_ukrs = bli_func_obj_create( NULL, + NULL, + BLIS_CGEMMTRSM4M_U_UKERNEL, + BLIS_ZGEMMTRSM4M_U_UKERNEL ); + + + // Create control tree objects for packm operations (left side). + trsm4m_l_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + // IMPORTANT: n dim multiple must be mr to + // support right and bottom-right edge cases + gemm4m_mr, + gemm4m_mr, + TRUE, // densify + TRUE, // invert diagonal + TRUE, // reverse iteration if upper? 
+ FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_4M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm4m_l_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + // IMPORTANT: m dim multiple must be mr since + // B_pack is updated (ie: serves as C) in trsm + gemm4m_mr, + gemm4m_nr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_4M, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (right side). + trsm4m_r_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + gemm4m_nr, + gemm4m_mr, + FALSE, // already dense; densify not necessary + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS_4M, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm4m_r_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT4, + gemm4m_mr, + gemm4m_mr, + TRUE, // densify + TRUE, // invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS_4M, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + trsm4m_cntl_bp_ke + = + bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + NULL, + gemm4m_ukrs, + gemmtrsm4m_l_ukrs, + gemmtrsm4m_u_ukrs, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (left side). + trsm4m_l_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4m_mc, + NULL, NULL, NULL, + NULL, + trsm4m_l_packa_cntl, + trsm4m_l_packb_cntl, + NULL, + trsm4m_cntl_bp_ke, + NULL, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (left side). 
+ trsm4m_l_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4m_kc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm4m_l_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (left side). + trsm4m_l_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4m_nc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm4m_l_cntl_mm_op, + NULL, + NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (right side). + trsm4m_r_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm4m_mc, + NULL, NULL, NULL, + NULL, + trsm4m_r_packa_cntl, + trsm4m_r_packb_cntl, + NULL, + trsm4m_cntl_bp_ke, + NULL, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (right side). + trsm4m_r_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm4m_kc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm4m_r_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (right side). + trsm4m_r_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm4m_nc, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + trsm4m_r_cntl_mm_op, + NULL, + NULL ); + + // Alias the "master" trsm control trees to shorter names. 
+ trsm4m_l_cntl = trsm4m_l_cntl_vl_mm; + trsm4m_r_cntl = trsm4m_r_cntl_vl_mm; +} + +void bli_trsm4m_cntl_finalize() +{ + bli_func_obj_free( gemmtrsm4m_l_ukrs ); + bli_func_obj_free( gemmtrsm4m_u_ukrs ); + + bli_cntl_obj_free( trsm4m_l_packa_cntl ); + bli_cntl_obj_free( trsm4m_l_packb_cntl ); + bli_cntl_obj_free( trsm4m_r_packa_cntl ); + bli_cntl_obj_free( trsm4m_r_packb_cntl ); + + bli_cntl_obj_free( trsm4m_cntl_bp_ke ); + + bli_cntl_obj_free( trsm4m_l_cntl_op_bp ); + bli_cntl_obj_free( trsm4m_l_cntl_mm_op ); + bli_cntl_obj_free( trsm4m_l_cntl_vl_mm ); + bli_cntl_obj_free( trsm4m_r_cntl_op_bp ); + bli_cntl_obj_free( trsm4m_r_cntl_mm_op ); + bli_cntl_obj_free( trsm4m_r_cntl_vl_mm ); +} + diff --git a/frame/3/trsm/4m/bli_trsm4m_cntl.h b/frame/3/trsm/4m/bli_trsm4m_cntl.h new file mode 100644 index 000000000..2646efbab --- /dev/null +++ b/frame/3/trsm/4m/bli_trsm4m_cntl.h @@ -0,0 +1,36 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +void bli_trsm4m_cntl_init( void ); +void bli_trsm4m_cntl_finalize( void ); diff --git a/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_l_ukr_ref.c b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_l_ukr_ref.c new file mode 100644 index 000000000..fc188568c --- /dev/null +++ b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_l_ukr_ref.c @@ -0,0 +1,158 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr, trsmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a10, \ + ctype* restrict a11, \ + ctype* restrict b01, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = bli_auxinfo_ps_a( data ); \ + const inc_t ps_b = bli_auxinfo_ps_b( data ); \ +\ + ctype_r* restrict a10_r = ( ctype_r* )a10; \ + ctype_r* restrict a10_i = ( ctype_r* )a10 + ps_a; \ +\ + ctype_r* restrict a11_r = ( ctype_r* )a11; \ +\ + ctype_r* restrict b01_r = ( ctype_r* )b01; \ + ctype_r* restrict b01_i = ( ctype_r* )b01 + ps_b; \ +\ + ctype_r* restrict b11_r = ( ctype_r* )b11; \ + ctype_r* restrict b11_i = ( ctype_r* )b11 + ps_b; \ +\ + const inc_t rs_b = PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype_r* alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + void* a_next = bli_auxinfo_next_a( data ); \ + void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t i, j; \ +\ +\ + /* If alpha has a non-zero imaginary component, pre-scale b11 by alpha.
*/ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + { \ + /* We can handle a non-zero imaginary component on alpha, but to do + so we have to manually scale b and then use alpha == 1 for the + micro-kernel calls. */ \ + for ( i = 0; i < m; ++i ) \ + for ( j = 0; j < n; ++j ) \ + PASTEMAC(ch,scalris)( *alpha_r, \ + *alpha_i, \ + *(b11_r + i*rs_b + j*cs_b), \ + *(b11_i + i*rs_b + j*cs_b) ); \ +\ + /* Use alpha.r == 1.0. */ \ + alpha_r = one_r; \ + } \ +\ +\ + /* b11.r = alpha.r * b11.r - ( a10.r * b01.r - a10.i * b01.i ); + b11.i = alpha.r * b11.i - ( a10.r * b01.i + a10.i * b01.r ); */ \ +\ + bli_auxinfo_set_next_ab( a10_r, b01_i, *data ); \ +\ + /* b11.r = alpha.r * b11.r - a10.r * b01.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a10_r, \ + b01_r, \ + alpha_r, \ + b11_r, rs_b, cs_b, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a10_i, b01_r, *data ); \ +\ + /* b11.i = alpha.r * b11.i - a10.r * b01.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a10_r, \ + b01_i, \ + alpha_r, \ + b11_i, rs_b, cs_b, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a10_i, b01_i, *data ); \ +\ + /* b11.i = 1.0 * b11.i - a10.i * b01.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a10_i, \ + b01_r, \ + one_r, \ + b11_i, rs_b, cs_b, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ +\ + /* b11.r = 1.0 * b11.r + a10.i * b01.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + one_r, \ + a10_i, \ + b01_i, \ + one_r, \ + b11_r, rs_b, cs_b, \ + data ); \ +\ +\ + /* b11 = inv(a11) * b11; + c11 = b11; */ \ + PASTEMAC(ch,trsmukr)( a11_r, \ + b11_r, \ + c11, rs_c, cs_c, \ + data ); \ +} + +INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m_l_ukr_ref, GEMM_UKERNEL, TRSM4M_L_UKERNEL ) + diff --git a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_l_ukr_ref.h similarity index 90% rename from config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h rename to frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_l_ukr_ref.h index dec7d4a17..5d0ddaa17 100644 ---
a/config/template/kernels/3/bli_gemmtrsm_l_opt_mxn.h +++ b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_l_ukr_ref.h @@ -26,18 +26,15 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// -// Prototype micro-kernel interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ dim_t k, \ @@ -50,5 +47,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemmtrsm_l_opt_mxn ) +INSERT_GENTPROTCO_BASIC( gemmtrsm4m_l_ukr_ref ) diff --git a/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_u_ukr_ref.c b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_u_ukr_ref.c new file mode 100644 index 000000000..6318b6526 --- /dev/null +++ b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_u_ukr_ref.c @@ -0,0 +1,158 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr, trsmukr ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a12, \ + ctype* restrict a11, \ + ctype* restrict b21, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = bli_auxinfo_ps_a( data ); \ + const inc_t ps_b = bli_auxinfo_ps_b( data ); \ +\ + ctype_r* restrict a11_r = ( ctype_r* )a11; \ +\ + ctype_r* restrict a12_r = ( ctype_r* )a12; \ + ctype_r* restrict a12_i = ( ctype_r* )a12 + ps_a; \ +\ + ctype_r* restrict b11_r = ( ctype_r* )b11; \ + ctype_r* restrict b11_i = ( ctype_r* )b11 + ps_b; \ +\ + ctype_r* restrict b21_r = ( ctype_r* )b21; \ + ctype_r* restrict b21_i = ( ctype_r* )b21 + ps_b; \ +\ + const inc_t rs_b = PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + ctype_r* restrict
one_r = PASTEMAC(chr,1); \ + ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ +\ + ctype_r* alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + void* a_next = bli_auxinfo_next_a( data ); \ + void* b_next = bli_auxinfo_next_b( data ); \ +\ + dim_t i, j; \ +\ +\ + /* Handle the case where alpha has a non-zero imaginary component. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + { \ + /* We can handle a non-zero imaginary component on alpha, but to do + so we have to manually scale b and then use alpha == 1 for the + micro-kernel calls. */ \ + for ( i = 0; i < m; ++i ) \ + for ( j = 0; j < n; ++j ) \ + PASTEMAC(ch,scalris)( *alpha_r, \ + *alpha_i, \ + *(b11_r + i*rs_b + j*cs_b), \ + *(b11_i + i*rs_b + j*cs_b) ); \ +\ + /* Use alpha.r == 1.0. */ \ + alpha_r = one_r; \ + } \ +\ +\ + /* b11.r = alpha.r * b11.r - ( a12.r * b21.r - a12.i * b21.i ); + b11.i = alpha.r * b11.i - ( a12.r * b21.i + a12.i * b21.r ); */ \ +\ + bli_auxinfo_set_next_ab( a12_r, b21_i, *data ); \ +\ + /* b11.r = alpha.r * b11.r - a12.r * b21.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a12_r, \ + b21_r, \ + alpha_r, \ + b11_r, rs_b, cs_b, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a12_i, b21_r, *data ); \ +\ + /* b11.i = alpha.r * b11.i - a12.r * b21.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a12_r, \ + b21_i, \ + alpha_r, \ + b11_i, rs_b, cs_b, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a12_i, b21_i, *data ); \ +\ + /* b11.i = 1.0 * b11.i - a12.i * b21.r; */ \ + PASTEMAC(chr,gemmukr)( k, \ + minus_one_r, \ + a12_i, \ + b21_r, \ + one_r, \ + b11_i, rs_b, cs_b, \ + data ); \ +\ + bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ +\ + /* b11.r = 1.0 * b11.r + a12.i * b21.i; */ \ + PASTEMAC(chr,gemmukr)( k, \ + one_r, \ + a12_i, \ + b21_i, \ + one_r, \ + b11_r, rs_b, cs_b, \ + data ); \ +\ +\ + /* b11 = inv(a11) * b11; + c11 = b11; */ \ + PASTEMAC(ch,trsmukr)( a11_r, \ + b11_r, \ + c11, rs_c, cs_c, \ + data ); \ +} +
+INSERT_GENTFUNCCO_BASIC2( gemmtrsm4m_u_ukr_ref, GEMM_UKERNEL, TRSM4M_U_UKERNEL ) + diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_u_ukr_ref.h similarity index 90% rename from frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h rename to frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_u_ukr_ref.h index 3b50841f8..1e79bed3b 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.h +++ b/frame/3/trsm/4m/ukernels/bli_gemmtrsm4m_u_ukr_ref.h @@ -26,18 +26,15 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -// -// Prototype micro-kernel interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname)( \ dim_t k, \ @@ -50,5 +47,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_mxn ) +INSERT_GENTPROTCO_BASIC( gemmtrsm4m_u_ukr_ref ) diff --git a/frame/3/trsm/4m/ukernels/bli_trsm4m_l_ukr_ref.c b/frame/3/trsm/4m/ukernels/bli_trsm4m_l_ukr_ref.c new file mode 100644 index 000000000..4fafd4fa9 --- /dev/null +++ b/frame/3/trsm/4m/ukernels/bli_trsm4m_l_ukr_ref.c @@ -0,0 +1,142 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = bli_auxinfo_ps_a( data ); \ + const inc_t ps_b = bli_auxinfo_ps_b( data ); \ +\ + ctype_r* restrict a_r = ( ctype_r* )ar; \ + ctype_r* restrict a_i = ( ctype_r* )ar + ps_a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )br; \ + ctype_r* restrict b_i = ( ctype_r* )br + ps_b; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = PASTEMAC(chr,packmr); \ +\ + const inc_t rs_b = PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = iter; \ + n_behind = i; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict a10t_r = a_r + (i )*rs_a + (0 )*cs_a; \ + ctype_r* restrict a10t_i = a_i + (i )*rs_a + (0 )*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict B0_r = b_r + (0 )*rs_b + (0 )*cs_b; \ + ctype_r* restrict B0_i = b_i + (0 )*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a10t * B0; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b01_r = B0_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b01_i = B0_i + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a10t * b01; */ \ + PASTEMAC(chr,set0s)( rho11_r ); \ + 
PASTEMAC(chr,set0s)( rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a; \ + ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a; \ + ctype_r* restrict beta01_r = b01_r + (l )*rs_b; \ + ctype_r* restrict beta01_i = b01_i + (l )*rs_b; \ +\ + PASTEMAC(ch,axpyris)( *alpha10_r, \ + *alpha10_i, \ + *beta01_r, \ + *beta01_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ + PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm4m_l_ukr_ref ) + diff --git a/frame/3/trsm/4m/ukernels/bli_trsm4m_l_ukr_ref.h b/frame/3/trsm/4m/ukernels/bli_trsm4m_l_ukr_ref.h new file mode 100644 index 000000000..a15fffc7d --- /dev/null +++ b/frame/3/trsm/4m/ukernels/bli_trsm4m_l_ukr_ref.h @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( trsm4m_l_ukr_ref ) + diff --git a/frame/3/trsm/4m/ukernels/bli_trsm4m_u_ukr_ref.c b/frame/3/trsm/4m/ukernels/bli_trsm4m_u_ukr_ref.c new file mode 100644 index 000000000..1228f0711 --- /dev/null +++ b/frame/3/trsm/4m/ukernels/bli_trsm4m_u_ukr_ref.c @@ -0,0 +1,142 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ) \ +{ \ + const dim_t m = PASTEMAC(chr,mr); \ + const dim_t n = PASTEMAC(chr,nr); \ +\ + const inc_t ps_a = bli_auxinfo_ps_a( data ); \ + const inc_t ps_b = bli_auxinfo_ps_b( data ); \ +\ + ctype_r* restrict a_r = ( ctype_r* )ar; \ + ctype_r* restrict a_i = ( ctype_r* )ar + ps_a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )br; \ + ctype_r* restrict b_i = ( ctype_r* )br + ps_b; \ +\ + const inc_t rs_a = 1; \ + const inc_t cs_a = PASTEMAC(chr,packmr); \ +\ + const inc_t rs_b = PASTEMAC(chr,packnr); \ + const inc_t cs_b = 1; \ +\ + dim_t iter, i, j, l; \ + dim_t n_behind; \ +\ +\ + for ( iter = 0; iter < m; ++iter ) \ + { \ + i = m - iter - 1; \ + n_behind = iter; \ +\ + ctype_r* restrict alpha11_r = a_r + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict alpha11_i = a_i + (i )*rs_a + (i )*cs_a; \ + ctype_r* restrict a12t_r = a_r + (i )*rs_a + (i+1)*cs_a; \ + ctype_r* restrict a12t_i = a_i + (i )*rs_a + (i+1)*cs_a; \ + ctype_r* restrict b1_r = b_r + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_i = b_i + (i )*rs_b + (0 )*cs_b; \ + ctype_r* restrict B2_r = b_r + (i+1)*rs_b + (0 )*cs_b; \ + ctype_r* restrict B2_i = b_i + (i+1)*rs_b + (0 )*cs_b; \ +\ + /* b1 = b1 - a12t * B2; */ \ + /* b1 = b1 / alpha11; */ \ + for ( j = 0; j < n; ++j ) \ + { \ + ctype_r* restrict beta11_r = b1_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict beta11_i = b1_i + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b21_r = B2_r + (0 )*rs_b + (j )*cs_b; \ + ctype_r* restrict b21_i = B2_i + (0 )*rs_b + (j )*cs_b; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_r; \ + ctype_r beta11c_i = *beta11_i; \ + ctype_r rho11_r; \ + ctype_r rho11_i; \ +\ + /* beta11 = beta11 - a12t * b21; */ \ + PASTEMAC(chr,set0s)( 
rho11_r ); \ + PASTEMAC(chr,set0s)( rho11_i ); \ + for ( l = 0; l < n_behind; ++l ) \ + { \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a; \ + ctype_r* restrict beta21_r = b21_r + (l )*rs_b; \ + ctype_r* restrict beta21_i = b21_i + (l )*rs_b; \ +\ + PASTEMAC(ch,axpyris)( *alpha12_r, \ + *alpha12_i, \ + *beta21_r, \ + *beta21_i, \ + rho11_r, \ + rho11_i ); \ + } \ + PASTEMAC(ch,subris)( rho11_r, \ + rho11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* beta11 = beta11 / alpha11; */ \ + /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead + of alpha11, so we can multiply rather than divide. We store + the inverse of alpha11 intentionally to avoid expensive + division instructions within the micro-kernel. */ \ + PASTEMAC(ch,scalris)( *alpha11_r, \ + *alpha11_i, \ + beta11c_r, \ + beta11c_i ); \ +\ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,sets)( beta11c_r, \ + beta11c_i, *gamma11 ); \ +\ + /* Store the local values back to b11. */ \ + PASTEMAC(chr,copys)( beta11c_r, *beta11_r ); \ + PASTEMAC(chr,copys)( beta11c_i, *beta11_i ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC0( trsm4m_u_ukr_ref ) + diff --git a/frame/3/trsm/4m/ukernels/bli_trsm4m_u_ukr_ref.h b/frame/3/trsm/4m/ukernels/bli_trsm4m_u_ukr_ref.h new file mode 100644 index 000000000..774e3660f --- /dev/null +++ b/frame/3/trsm/4m/ukernels/bli_trsm4m_u_ukr_ref.h @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + ctype_r* restrict ar, \ + ctype_r* restrict br, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( trsm4m_u_ukr_ref ) + diff --git a/frame/3/trsm/bli_trsm.c b/frame/3/trsm/bli_trsm.c index 5a00adc28..a9dad8b0f 100644 --- a/frame/3/trsm/bli_trsm.c +++ b/frame/3/trsm/bli_trsm.c @@ -45,6 +45,17 @@ void bli_trsm( side_t side, obj_t* a, obj_t* b ) { + if ( +#ifdef BLIS_ENABLE_SCOMPLEX_VIA_4M + bli_obj_is_scomplex( *b ) || +#endif +#ifdef BLIS_ENABLE_DCOMPLEX_VIA_4M + bli_obj_is_dcomplex( *b ) || +#endif + FALSE + ) + return bli_trsm4m( side, alpha, a, b ); + bli_trsm_front( side, alpha, a, b, trsm_l_cntl, trsm_r_cntl ); diff --git a/frame/3/trsm/bli_trsm.h b/frame/3/trsm/bli_trsm.h index 453695747..5fa21cccc 100644 --- a/frame/3/trsm/bli_trsm.h +++ b/frame/3/trsm/bli_trsm.h @@ -51,11 +51,14 @@ #include "bli_trsm_rl_ker_var2.h" #include "bli_trsm_ru_ker_var2.h" -#include "bli_gemmtrsm_l_ref_mxn.h" -#include "bli_gemmtrsm_u_ref_mxn.h" +#include "bli_gemmtrsm_l_ukr_ref.h" +#include "bli_gemmtrsm_u_ukr_ref.h" -#include "bli_trsm_l_ref_mxn.h" -#include "bli_trsm_u_ref_mxn.h" +#include "bli_trsm_l_ukr_ref.h" +#include "bli_trsm_u_ukr_ref.h" + +#include "bli_trsm4m.h" +#include "bli_trsm3m.h" // diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index 3b9ae3478..ed53ad1a9 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -107,10 +107,11 @@ void bli_trsm_blk_var3b( obj_t* a, cntl_sub_trsm( cntl ) ); // This variant executes multiple rank-k updates. Therefore, if the - // internal alpha scalar on matrix A/B is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). + // internal alpha scalars on A/B and C are non-zero, we must ensure + // that they are only used in the first iteration. 
if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); } + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( &c_pack ); } } // Unpack C (if C was packed). diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 625909d23..5a680accd 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -107,10 +107,11 @@ void bli_trsm_blk_var3f( obj_t* a, cntl_sub_trsm( cntl ) ); // This variant executes multiple rank-k updates. Therefore, if the - // internal alpha scalar on matrix A/B is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). + // internal alpha scalars on A/B and C are non-zero, we must ensure + // that they are only used in the first iteration. if ( i == 0 ) { bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); } + bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( &c_pack ); } } // Unpack C (if C was packed). diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index ae283c50c..a9bec29bd 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -73,6 +73,8 @@ trsm_t* trsm_r_cntl; void bli_trsm_cntl_init() { + // Create function pointer objects for each datatype-specific + // gemmtrsm_l and gemmtrsm_u micro-kernel. 
gemmtrsm_l_ukrs = bli_func_obj_create( BLIS_SGEMMTRSM_L_UKERNEL, BLIS_DGEMMTRSM_L_UKERNEL, BLIS_CGEMMTRSM_L_UKERNEL, @@ -88,7 +90,7 @@ void bli_trsm_cntl_init() trsm_l_packa_cntl = bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // pack panels of A compactly + BLIS_VARIANT1, // IMPORTANT: n dim multiple must be mr to // support right and bottom-right edge cases gemm_mr, diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 8ed6ec8d9..1dd67ece5 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -129,7 +129,7 @@ void bli_trsm_front( side_t side, bli_trsm_int( alpha, &a_local, &b_local, - &BLIS_ZERO, + alpha, &c_local, cntl ); } diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index ceead3b44..11b39cc59 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -41,9 +41,10 @@ typedef void (*FUNCPTR_T)( dim_t m, dim_t n, dim_t k, - void* alpha, + void* alpha1, void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, + void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, void* gemm_ukr @@ -79,7 +80,8 @@ void bli_trsm_ll_ker_var2( obj_t* a, inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); - void* buf_alpha; + void* buf_alpha1; + void* buf_alpha2; FUNCPTR_T f; @@ -90,13 +92,34 @@ void bli_trsm_ll_ker_var2( obj_t* a, // Grab the address of the internal scalar buffer for the scalar - // attached to B. - buf_alpha = bli_obj_internal_scalar_buffer( *b ); + // attached to B. This will be the alpha scalar used in the gemmtrsm + // subproblems (ie: the scalar that would be applied to the packed + // copy of B prior to it being updated by the trsm subproblem). This + // scalar may be unit, if for example it was applied during packing. 
+ buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. 
@@ -110,9 +133,10 @@ void bli_trsm_ll_ker_var2( obj_t* a, m, n, k, - buf_alpha, + buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, + buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr ); @@ -127,9 +151,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ @@ -152,16 +177,18 @@ void PASTEMAC(ch,varname)( \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -171,9 +198,10 @@ void PASTEMAC(ch,varname)( \ dim_t off_a10; \ dim_t off_a11; \ dim_t i, j; \ - dim_t rstep_a; \ - dim_t cstep_b; \ - dim_t rstep_c, cstep_c; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t ss_a; \ auxinfo_t aux; \ \ /* @@ -196,6 +224,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix A, which is + usually PACKMR. However, in the case of 3m, the storage stride + captures the (PACKMR * 3/2) factor embedded in the panel stride. 
+ Notice that we must first inflate k up to a multiple of MR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ + ss_a = ps_a / k_full; \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as @@ -309,7 +346,7 @@ void PASTEMAC(ch,varname)( \ b11 = b1 + off_a11 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1011 * PACKMR; \ + a2 = a1 + k_a1011 * ss_a; \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ @@ -325,14 +362,14 @@ void PASTEMAC(ch,varname)( \ \ /* Save the panel stride of the current panel of A to the auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( k_a1011 * PACKMR, aux ); \ + bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_a10, \ - alpha_cast, \ + alpha1_cast, \ a10, \ a11, \ b01, \ @@ -344,7 +381,7 @@ void PASTEMAC(ch,varname)( \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_a10, \ - alpha_cast, \ + alpha1_cast, \ a10, \ a11, \ b01, \ @@ -358,7 +395,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1011 * PACKMR; \ + a1 += k_a1011 * ss_a; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ @@ -391,7 +428,7 @@ void PASTEMAC(ch,varname)( \ minus_one, \ a1, \ b1, \ - alpha_cast, \ + alpha2_cast, \ c11, rs_c, cs_c, \ &aux ); \ } \ @@ -409,14 +446,14 @@ void PASTEMAC(ch,varname)( \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ - alpha_cast, \ + alpha2_cast, \ c11, rs_c, cs_c ); \ } \ \ a1 += rstep_a; \ } \ \ - c11 += rstep_c; \ + c11 += rstep_c; \ } \ \ b1 += cstep_b; \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.h b/frame/3/trsm/bli_trsm_ll_ker_var2.h index 1b6aa95fd..59e8e576b 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.h @@ -53,9 +53,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 7062b0cd6..d86a87ca0 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -41,9 +41,10 @@ typedef void (*FUNCPTR_T)( dim_t m, dim_t n, dim_t k, - void* alpha, + void* alpha1, void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, + void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, void* gemm_ukr @@ -79,7 +80,8 @@ void bli_trsm_lu_ker_var2( obj_t* a, inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); - void* buf_alpha; + void* buf_alpha1; + void* buf_alpha2; FUNCPTR_T f; @@ -90,13 +92,34 @@ void bli_trsm_lu_ker_var2( obj_t* a, // Grab the address of the internal scalar buffer for the scalar - // attached to B. - buf_alpha = bli_obj_internal_scalar_buffer( *b ); + // attached to B. This will be the alpha scalar used in the gemmtrsm + // subproblems (ie: the scalar that would be applied to the packed + // copy of B prior to it being updated by the trsm subproblem). This + // scalar may be unit, if for example it was applied during packing. 
+ buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. 
@@ -110,9 +133,10 @@ void bli_trsm_lu_ker_var2( obj_t* a, m, n, k, - buf_alpha, + buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, + buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr ); @@ -127,9 +151,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ @@ -152,16 +177,18 @@ void PASTEMAC(ch,varname)( \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* b_cast = b; \ - ctype* c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -172,9 +199,10 @@ void PASTEMAC(ch,varname)( \ dim_t off_a11; \ dim_t off_a12; \ dim_t i, j, ib; \ - dim_t rstep_a; \ - dim_t cstep_b; \ - dim_t rstep_c, cstep_c; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t ss_a; \ auxinfo_t aux; \ \ /* @@ -197,6 +225,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix A, which is + usually PACKMR. However, in the case of 3m, the storage stride + captures the (PACKMR * 3/2) factor embedded in the panel stride. 
+ Notice that we must first inflate k up to a multiple of MR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ + ss_a = ps_a / k_full; \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and @@ -320,7 +357,7 @@ void PASTEMAC(ch,varname)( \ b21 = b1 + off_a12 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + k_a1112 * PACKMR; \ + a2 = a1 + k_a1112 * ss_a; \ if ( bli_is_last_iter( ib, m_iter ) ) \ { \ a2 = a_cast; \ @@ -336,14 +373,14 @@ void PASTEMAC(ch,varname)( \ \ /* Save the panel stride of the current panel of A to the auxinfo_t object. */ \ - bli_auxinfo_set_ps_a( k_a1112 * PACKMR, aux ); \ + bli_auxinfo_set_ps_a( k_a1112 * ss_a, aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_a12, \ - alpha_cast, \ + alpha1_cast, \ a12, \ a11, \ b21, \ @@ -355,7 +392,7 @@ void PASTEMAC(ch,varname)( \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_a12, \ - alpha_cast, \ + alpha1_cast, \ a12, \ a11, \ b21, \ @@ -369,7 +406,7 @@ void PASTEMAC(ch,varname)( \ c11, rs_c, cs_c ); \ } \ \ - a1 += k_a1112 * PACKMR; \ + a1 += k_a1112 * ss_a; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ @@ -402,7 +439,7 @@ void PASTEMAC(ch,varname)( \ minus_one, \ a1, \ b1, \ - alpha_cast, \ + alpha2_cast, \ c11, rs_c, cs_c, \ &aux ); \ } \ @@ -420,7 +457,7 @@ void PASTEMAC(ch,varname)( \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ - alpha_cast, \ + alpha2_cast, \ c11, rs_c, cs_c ); \ } \ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.h b/frame/3/trsm/bli_trsm_lu_ker_var2.h index 8b3f0848e..50b18cf79 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.h +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.h @@ -53,9 +53,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 837c76626..ffa41aa9a 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -41,9 +41,10 @@ typedef void (*FUNCPTR_T)( dim_t m, dim_t n, dim_t k, - void* alpha, + void* alpha1, void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, + void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, void* gemm_ukr @@ -79,7 +80,8 @@ void bli_trsm_rl_ker_var2( obj_t* a, inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); - void* buf_alpha; + void* buf_alpha1; + void* buf_alpha2; FUNCPTR_T f; @@ -90,13 +92,34 @@ void bli_trsm_rl_ker_var2( obj_t* a, // Grab the address of the internal scalar buffer for the scalar - // attached to B. - buf_alpha = bli_obj_internal_scalar_buffer( *a ); + // attached to A. This will be the alpha scalar used in the gemmtrsm + // subproblems (ie: the scalar that would be applied to the packed + // copy of A prior to it being updated by the trsm subproblem). This + // scalar may be unit, if for example it was applied during packing. + buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. 
This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. 
@@ -110,9 +133,10 @@ void bli_trsm_rl_ker_var2( obj_t* a, m, n, k, - buf_alpha, + buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, + buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr ); @@ -127,9 +151,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ @@ -152,16 +177,18 @@ void PASTEMAC(ch,varname)( \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* b_cast = b; \ - ctype* c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -172,9 +199,10 @@ void PASTEMAC(ch,varname)( \ dim_t off_b11; \ dim_t off_b21; \ dim_t i, j, jb; \ - dim_t rstep_a; \ - dim_t cstep_b; \ - dim_t rstep_c, cstep_c; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t ss_b; \ auxinfo_t aux; \ \ /* @@ -205,6 +233,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If the current panel of B is entirely above its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix B, which is + usually PACKNR. However, in the case of 3m, the storage stride + captures the (PACKNR * 3/2) factor embedded in the panel stride. 
+ Notice that we must first inflate k up to a multiple of NR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ + ss_b = ps_b / k_full; \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this @@ -314,7 +351,7 @@ void PASTEMAC(ch,varname)( \ /* Save the panel stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_ps_a( k_b1121 * PACKNR, aux ); \ + bli_auxinfo_set_ps_a( k_b1121 * ss_b, aux ); \ \ /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. @@ -342,7 +379,7 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b1121 * PACKNR; \ + b2 = b1 + k_b1121 * ss_b; \ if ( bli_is_last_iter( jb, n_iter ) ) \ b2 = b_cast; \ } \ @@ -358,7 +395,7 @@ void PASTEMAC(ch,varname)( \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_b21, \ - alpha_cast, \ + alpha1_cast, \ b21, \ b11, \ a12, \ @@ -370,7 +407,7 @@ void PASTEMAC(ch,varname)( \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_b21, \ - alpha_cast, \ + alpha1_cast, \ b21, \ b11, \ a12, \ @@ -421,7 +458,7 @@ void PASTEMAC(ch,varname)( \ minus_one, \ b1, \ a1, \ - alpha_cast, \ + alpha2_cast, \ c11, cs_c, rs_c, \ &aux ); \ } \ @@ -439,7 +476,7 @@ void PASTEMAC(ch,varname)( \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ - alpha_cast, \ + alpha2_cast, \ c11, rs_c, cs_c ); \ } \ \ @@ -448,7 +485,7 @@ void PASTEMAC(ch,varname)( \ } \ } \ \ - b1 += k_b1121 * PACKNR; \ + b1 += k_b1121 * ss_b; \ c1 -= cstep_c; \ } \ } diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.h b/frame/3/trsm/bli_trsm_rl_ker_var2.h index 125b2a8eb..a0605a7b7 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.h +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.h @@ -53,9 +53,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 0977705bd..44fe387a6 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -41,9 +41,10 @@ typedef void (*FUNCPTR_T)( dim_t m, dim_t n, dim_t k, - void* alpha, + void* alpha1, void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, + void* alpha2, void* c, inc_t rs_c, inc_t cs_c, void* gemmtrsm_ukr, void* gemm_ukr @@ -79,7 +80,8 @@ void bli_trsm_ru_ker_var2( obj_t* a, inc_t rs_c = bli_obj_row_stride( *c ); inc_t cs_c = bli_obj_col_stride( *c ); - void* buf_alpha; + void* buf_alpha1; + void* buf_alpha2; FUNCPTR_T f; @@ -90,13 +92,34 @@ void bli_trsm_ru_ker_var2( obj_t* a, // Grab the address of the internal scalar buffer for the scalar - // attached to B. - buf_alpha = bli_obj_internal_scalar_buffer( *a ); + // attached to A. This will be the alpha scalar used in the gemmtrsm + // subproblems (ie: the scalar that would be applied to the packed + // copy of A prior to it being updated by the trsm subproblem). This + // scalar may be unit, if for example it was applied during packing. 
+ buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); + + // Grab the address of the internal scalar buffer for the scalar + // attached to C. This will be the "beta" scalar used in the gemm-only + // subproblems that correspond to micro-panels that do not intersect + // the diagonal. We need this separate scalar because it's possible + // that the alpha attached to B was reset, if it was applied during + // packing. + buf_alpha2 = bli_obj_internal_scalar_buffer( *c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; + // Adjust cs_a and rs_b if A and B were packed for 4m or 3m. This + // is needed because cs_a and rs_b are used to index into the + // micro-panels of A and B, respectively, and since the pointer + // types in the macro-kernel (scomplex or dcomplex) will result + // in pointer arithmetic that moves twice as far as it should, + // given the datatypes actually stored (float or double), we must + // halve the strides to compensate. + if ( bli_obj_is_panel_packed_4m( *a ) || + bli_obj_is_panel_packed_3m( *a ) ) { cs_a /= 2; rs_b /= 2; } + // Extract from the control tree node the func_t objects containing // the gemmtrsm and gemm micro-kernel function addresses, and then // query the function addresses corresponding to the current datatype. 
@@ -110,9 +133,10 @@ void bli_trsm_ru_ker_var2( obj_t* a, m, n, k, - buf_alpha, + buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, + buf_alpha2, buf_c, rs_c, cs_c, gemmtrsm_ukr, gemm_ukr ); @@ -127,9 +151,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ @@ -152,16 +177,18 @@ void PASTEMAC(ch,varname)( \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict minus_one = PASTEMAC(ch,m1); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha1_cast = alpha1; \ + ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ + dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ @@ -171,9 +198,10 @@ void PASTEMAC(ch,varname)( \ dim_t off_b01; \ dim_t off_b11; \ dim_t i, j; \ - dim_t rstep_a; \ - dim_t cstep_b; \ - dim_t rstep_c, cstep_c; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + inc_t ss_b; \ auxinfo_t aux; \ \ /* @@ -204,6 +232,15 @@ void PASTEMAC(ch,varname)( \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ +\ + /* Compute the storage stride for the triangular matrix B, which is + usually PACKNR. However, in the case of 3m, the storage stride + captures the (PACKNR * 3/2) factor embedded in the panel stride. 
+ Notice that we must first inflate k up to a multiple of NR, since + the panel stride was originally computed using this inflated k + dimension. */ \ + k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ + ss_b = ps_b / k_full; \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and @@ -308,7 +345,7 @@ void PASTEMAC(ch,varname)( \ /* Save the panel stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_ps_a( k_b0111 * PACKNR, aux ); \ + bli_auxinfo_set_ps_a( k_b0111 * ss_b, aux ); \ \ /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. @@ -336,7 +373,7 @@ void PASTEMAC(ch,varname)( \ if ( bli_is_last_iter( i, m_iter ) ) \ { \ a2 = a_cast; \ - b2 = b1 + k_b0111 * PACKNR; \ + b2 = b1 + k_b0111 * ss_b; \ if ( bli_is_last_iter( j, n_iter ) ) \ b2 = b_cast; \ } \ @@ -352,7 +389,7 @@ void PASTEMAC(ch,varname)( \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_b01, \ - alpha_cast, \ + alpha1_cast, \ b01, \ b11, \ a10, \ @@ -364,7 +401,7 @@ void PASTEMAC(ch,varname)( \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr_cast( k_b01, \ - alpha_cast, \ + alpha1_cast, \ b01, \ b11, \ a10, \ @@ -415,7 +452,7 @@ void PASTEMAC(ch,varname)( \ minus_one, \ b1, \ a1, \ - alpha_cast, \ + alpha2_cast, \ c11, cs_c, rs_c, \ &aux ); \ } \ @@ -433,7 +470,7 @@ void PASTEMAC(ch,varname)( \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ - alpha_cast, \ + alpha2_cast, \ c11, rs_c, cs_c ); \ } \ \ @@ -442,7 +479,7 @@ void PASTEMAC(ch,varname)( \ } \ } \ \ - b1 += k_b0111 * PACKNR; \ + b1 += k_b0111 * ss_b; \ c1 += cstep_c; \ } \ } diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.h b/frame/3/trsm/bli_trsm_ru_ker_var2.h index 6bbe323f1..ebb24b81f 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.h +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.h @@ -53,9 +53,10 @@ void PASTEMAC(ch,varname)( \ dim_t m, \ dim_t n, \ dim_t k, \ - void* alpha, \ + void* alpha1, \ void* a, inc_t cs_a, inc_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \ + void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ void* gemmtrsm_ukr, \ void* gemm_ukr \ diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.c similarity index 97% rename from frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c rename to frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.c index ce75a0268..a5f89de8f 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.c @@ -71,5 +71,5 @@ void PASTEMAC(ch,varname)( \ data ); \ } -INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ref_mxn, GEMM_UKERNEL, TRSM_L_UKERNEL ) +INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ukr_ref, GEMM_UKERNEL, TRSM_L_UKERNEL ) diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.h similarity index 98% rename from frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h rename to frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.h index f30b67368..d60689d49 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_l_ref_mxn.h +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_l_ukr_ref.h @@ -50,5 +50,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_mxn ) +INSERT_GENTPROT_BASIC( gemmtrsm_l_ukr_ref ) diff --git a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c 
b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ukr_ref.c similarity index 97% rename from frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c rename to frame/3/trsm/ukernels/bli_gemmtrsm_u_ukr_ref.c index 69eb8d9a7..e548ba33a 100644 --- a/frame/3/trsm/ukernels/bli_gemmtrsm_u_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ukr_ref.c @@ -71,5 +71,5 @@ void PASTEMAC(ch,varname)( \ data ); \ } -INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ref_mxn, GEMM_UKERNEL, TRSM_U_UKERNEL ) +INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ukr_ref, GEMM_UKERNEL, TRSM_U_UKERNEL ) diff --git a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ukr_ref.h similarity index 98% rename from config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h rename to frame/3/trsm/ukernels/bli_gemmtrsm_u_ukr_ref.h index f196d3433..e17880937 100644 --- a/config/template/kernels/3/bli_gemmtrsm_u_opt_mxn.h +++ b/frame/3/trsm/ukernels/bli_gemmtrsm_u_ukr_ref.h @@ -50,5 +50,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( gemmtrsm_u_opt_mxn ) +INSERT_GENTPROT_BASIC( gemmtrsm_u_ukr_ref ) diff --git a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c b/frame/3/trsm/ukernels/bli_trsm_l_ukr_ref.c similarity index 90% rename from frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c rename to frame/3/trsm/ukernels/bli_trsm_l_ukr_ref.c index ba3d688cc..93d80f3d9 100644 --- a/frame/3/trsm/ukernels/bli_trsm_l_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_trsm_l_ukr_ref.c @@ -56,8 +56,6 @@ void PASTEMAC(ch,varname)( \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ -\ - ctype rho11; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -76,6 +74,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype beta11c = *beta11; \ + ctype rho11; \ \ /* beta11 = beta11 - a10t * b01; */ \ PASTEMAC(ch,set0s)( rho11 ); \ @@ -86,20 +86,23 @@ void PASTEMAC(ch,varname)( \ \ 
PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \ } \ - PASTEMAC(ch,subs)( rho11, *beta11 ); \ + PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead of alpha11, so we can multiply rather than divide. We store the inverse of alpha11 intentionally to avoid expensive division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scals)( *alpha11, *beta11 ); \ + PASTEMAC(ch,scals)( *alpha11, beta11c ); \ \ - /* Output final result to matrix C. */ \ - PASTEMAC(ch,copys)( *beta11, *gamma11 ); \ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ +\ + /* Store the local value back to b11. */ \ + PASTEMAC(ch,copys)( beta11c, *beta11 ); \ } \ } \ } -INSERT_GENTFUNC_BASIC0( trsm_l_ref_mxn ) +INSERT_GENTFUNC_BASIC0( trsm_l_ukr_ref ) diff --git a/config/template/kernels/3/bli_trsm_l_opt_mxn.h b/frame/3/trsm/ukernels/bli_trsm_l_ukr_ref.h similarity index 98% rename from config/template/kernels/3/bli_trsm_l_opt_mxn.h rename to frame/3/trsm/ukernels/bli_trsm_l_ukr_ref.h index 9578199e7..3ffecfbbc 100644 --- a/config/template/kernels/3/bli_trsm_l_opt_mxn.h +++ b/frame/3/trsm/ukernels/bli_trsm_l_ukr_ref.h @@ -46,5 +46,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( trsm_l_opt_mxn ) +INSERT_GENTPROT_BASIC( trsm_l_ukr_ref ) diff --git a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h b/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h deleted file mode 100644 index 68db71094..000000000 --- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype micro-kernel interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( trsm_u_ref_mxn ) - diff --git a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c b/frame/3/trsm/ukernels/bli_trsm_u_ukr_ref.c similarity index 90% rename from frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c rename to frame/3/trsm/ukernels/bli_trsm_u_ukr_ref.c index a492dda09..d10d2df9b 100644 --- a/frame/3/trsm/ukernels/bli_trsm_u_ref_mxn.c +++ b/frame/3/trsm/ukernels/bli_trsm_u_ukr_ref.c @@ -56,8 +56,6 @@ void PASTEMAC(ch,varname)( \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ -\ - ctype rho11; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -76,6 +74,8 @@ void PASTEMAC(ch,varname)( \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype beta11c = *beta11; \ + ctype rho11; \ \ /* beta11 = beta11 - a12t * b21; */ \ PASTEMAC(ch,set0s)( rho11 ); \ @@ -86,20 +86,23 @@ void PASTEMAC(ch,varname)( \ \ PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \ } \ - PASTEMAC(ch,subs)( rho11, *beta11 ); \ + PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead of alpha11, so we can multiply rather than divide. We store the inverse of alpha11 intentionally to avoid expensive division instructions within the micro-kernel. */ \ - PASTEMAC(ch,scals)( *alpha11, *beta11 ); \ + PASTEMAC(ch,scals)( *alpha11, beta11c ); \ \ - /* Output final result to matrix C. */ \ - PASTEMAC(ch,copys)( *beta11, *gamma11 ); \ + /* Output final result to matrix c. */ \ + PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ +\ + /* Store the local value back to b11. 
*/ \ + PASTEMAC(ch,copys)( beta11c, *beta11 ); \ } \ } \ } -INSERT_GENTFUNC_BASIC0( trsm_u_ref_mxn ) +INSERT_GENTFUNC_BASIC0( trsm_u_ukr_ref ) diff --git a/config/template/kernels/3/bli_trsm_u_opt_mxn.h b/frame/3/trsm/ukernels/bli_trsm_u_ukr_ref.h similarity index 98% rename from config/template/kernels/3/bli_trsm_u_opt_mxn.h rename to frame/3/trsm/ukernels/bli_trsm_u_ukr_ref.h index 2f9bd8302..46d63e513 100644 --- a/config/template/kernels/3/bli_trsm_u_opt_mxn.h +++ b/frame/3/trsm/ukernels/bli_trsm_u_ukr_ref.h @@ -46,5 +46,5 @@ void PASTEMAC(ch,varname)( \ auxinfo_t* data \ ); -INSERT_GENTPROT_BASIC( trsm_u_opt_mxn ) +INSERT_GENTPROT_BASIC( trsm_u_ukr_ref ) diff --git a/frame/base/bli_obj_scalar.c b/frame/base/bli_obj_scalar.c index aa1f194f1..5492aa997 100644 --- a/frame/base/bli_obj_scalar.c +++ b/frame/base/bli_obj_scalar.c @@ -134,7 +134,7 @@ void bli_obj_scalar_reset( obj_t* a ) else if ( bli_is_dcomplex( dt ) ) *(( dcomplex* )scalar_a) = *(( dcomplex* )one); // Alternate implementation: - //bli_obj_scalar_attach( &BLIS_ONE, a ); + //bli_obj_scalar_attach( BLIS_NO_CONJUGATE, &BLIS_ONE, a ); } bool_t bli_obj_scalar_has_nonzero_imag( obj_t* a ) diff --git a/frame/cntl/bli_cntl_init.c b/frame/cntl/bli_cntl_init.c index c0e9c7ecd..8ee9caefd 100644 --- a/frame/cntl/bli_cntl_init.c +++ b/frame/cntl/bli_cntl_init.c @@ -60,6 +60,18 @@ void bli_cntl_init( void ) bli_herk_cntl_init(); bli_trmm_cntl_init(); bli_trsm_cntl_init(); + + // Level-3 via 4m + bli_gemm4m_cntl_init(); + bli_herk4m_cntl_init(); + bli_trmm4m_cntl_init(); + bli_trsm4m_cntl_init(); + + // Level-3 via 3m + bli_gemm3m_cntl_init(); + bli_herk3m_cntl_init(); + bli_trmm3m_cntl_init(); + bli_trsm3m_cntl_init(); } void bli_cntl_finalize( void ) @@ -88,5 +100,17 @@ void bli_cntl_finalize( void ) bli_herk_cntl_finalize(); bli_trmm_cntl_finalize(); bli_trsm_cntl_finalize(); + + // Level-3 via 4m + bli_gemm4m_cntl_finalize(); + bli_herk4m_cntl_finalize(); + bli_trmm4m_cntl_finalize(); + 
bli_trsm4m_cntl_finalize(); + + // Level-3 via 3m + bli_gemm3m_cntl_finalize(); + bli_herk3m_cntl_finalize(); + bli_trmm3m_cntl_finalize(); + bli_trsm3m_cntl_finalize(); } diff --git a/frame/include/bli_auxinfo_macro_defs.h b/frame/include/bli_auxinfo_macro_defs.h index 6f37d20b5..0279b7cd2 100644 --- a/frame/include/bli_auxinfo_macro_defs.h +++ b/frame/include/bli_auxinfo_macro_defs.h @@ -50,6 +50,12 @@ #define bli_auxinfo_set_next_a( a_p, auxinfo ) { (auxinfo).a_next = a_p; } #define bli_auxinfo_set_next_b( b_p, auxinfo ) { (auxinfo).b_next = b_p; } +#define bli_auxinfo_set_next_ab( a_p, b_p, auxinfo ) \ +{ \ + bli_auxinfo_set_next_a( a_p, auxinfo ); \ + bli_auxinfo_set_next_b( b_p, auxinfo ); \ +} + #define bli_auxinfo_set_ps_a( a_p, auxinfo ) { (auxinfo).ps_a = a_p; } #define bli_auxinfo_set_ps_b( b_p, auxinfo ) { (auxinfo).ps_b = b_p; } diff --git a/frame/include/bli_complex_macro_defs.h b/frame/include/bli_complex_macro_defs.h index 8bbe2d4b1..c107dffdf 100644 --- a/frame/include/bli_complex_macro_defs.h +++ b/frame/include/bli_complex_macro_defs.h @@ -45,16 +45,7 @@ #define bli_dimag( x ) ( 0.0 ) -#ifdef BLIS_ENABLE_C99_COMPLEX - - -#define bli_creal( x ) ( crealf(x) ) -#define bli_cimag( x ) ( cimagf(x) ) -#define bli_zreal( x ) ( creal(x) ) -#define bli_zimag( x ) ( cimag(x) ) - - -#else // ifndef BLIS_ENABLE_C99_COMPLEX +#ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) @@ -63,6 +54,15 @@ #define bli_zimag( x ) ( (x).imag ) +#else // ifdef BLIS_ENABLE_C99_COMPLEX + + +#define bli_creal( x ) ( crealf(x) ) +#define bli_cimag( x ) ( cimagf(x) ) +#define bli_zreal( x ) ( creal(x) ) +#define bli_zimag( x ) ( cimag(x) ) + + #endif // BLIS_ENABLE_C99_COMPLEX diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index bf9537936..19dc97e6b 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -194,6 +194,33 @@ GENTFUNCR( scomplex, float, c, s, tfuncname, 
varname ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname ) +// -- Basic one-operand macro with complex domain only and real projection (with no auxiliary arguments) -- + + +#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ +\ +GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ +GENTFUNCCO( dcomplex, double, z, d, tfuncname ) + + +// -- Basic one-operand macro with complex domain only and real projection -- + + +#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ +\ +GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ +GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) + + +// -- Basic one-operand macro with complex domain only and real projection (with two auxiliary arguments) -- + + +#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ +\ +GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ +GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) + + // -- Basic one-operand with real and integer projections -- diff --git a/frame/include/bli_gentprot_macro_defs.h b/frame/include/bli_gentprot_macro_defs.h index 02f79c356..4d0f211f4 100644 --- a/frame/include/bli_gentprot_macro_defs.h +++ b/frame/include/bli_gentprot_macro_defs.h @@ -171,6 +171,15 @@ GENTPROTR( scomplex, float, c, s, funcname ) \ GENTPROTR( dcomplex, double, z, d, funcname ) +// -- Basic one-operand macro with complex domain only and real projection -- + + +#define INSERT_GENTPROTCO_BASIC( funcname ) \ +\ +GENTPROTCO( scomplex, float, c, s, funcname ) \ +GENTPROTCO( dcomplex, double, z, d, funcname ) + + // -- Basic one-operand with real and integer projections -- diff --git a/frame/include/bli_kernel_3m_macro_defs.h b/frame/include/bli_kernel_3m_macro_defs.h new file mode 100644 index 000000000..20d96ebc7 --- /dev/null +++ b/frame/include/bli_kernel_3m_macro_defs.h @@ -0,0 +1,413 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_3M_MACRO_DEFS_H +#define BLIS_KERNEL_3M_MACRO_DEFS_H + + +// -- Define datatype-agnostic base 3m kernel names ---------------------------- + +// +// Level-3 3m +// + +// gemm3m micro-kernels + +#ifndef BLIS_CGEMM3M_UKERNEL +#define BLIS_CGEMM3M_UKERNEL BLIS_CGEMM3M_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM3M_UKERNEL +#define BLIS_ZGEMM3M_UKERNEL BLIS_ZGEMM3M_UKERNEL_REF +#endif + +// gemmtrsm3m_l micro-kernels + +#ifndef BLIS_CGEMMTRSM3M_L_UKERNEL +#define BLIS_CGEMMTRSM3M_L_UKERNEL BLIS_CGEMMTRSM3M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM3M_L_UKERNEL +#define BLIS_ZGEMMTRSM3M_L_UKERNEL BLIS_ZGEMMTRSM3M_L_UKERNEL_REF +#endif + +// gemmtrsm3m_u micro-kernels + +#ifndef BLIS_CGEMMTRSM3M_U_UKERNEL +#define BLIS_CGEMMTRSM3M_U_UKERNEL BLIS_CGEMMTRSM3M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM3M_U_UKERNEL +#define BLIS_ZGEMMTRSM3M_U_UKERNEL BLIS_ZGEMMTRSM3M_U_UKERNEL_REF +#endif + +// trsm3m_l micro-kernels + +#ifndef BLIS_CTRSM3M_L_UKERNEL +#define BLIS_CTRSM3M_L_UKERNEL BLIS_CTRSM3M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM3M_L_UKERNEL +#define BLIS_ZTRSM3M_L_UKERNEL BLIS_ZTRSM3M_L_UKERNEL_REF +#endif + +// trsm3m_u micro-kernels + +#ifndef BLIS_CTRSM3M_U_UKERNEL +#define BLIS_CTRSM3M_U_UKERNEL BLIS_CTRSM3M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM3M_U_UKERNEL +#define BLIS_ZTRSM3M_U_UKERNEL BLIS_ZTRSM3M_U_UKERNEL_REF +#endif + +// +// Level-1m +// + +// packm_2xk_ri3 kernels + +#ifndef BLIS_SPACKM_2XK_RI3_KERNEL +#define BLIS_SPACKM_2XK_RI3_KERNEL BLIS_SPACKM_2XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_2XK_RI3_KERNEL +#define BLIS_DPACKM_2XK_RI3_KERNEL BLIS_DPACKM_2XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_2XK_RI3_KERNEL +#define BLIS_CPACKM_2XK_RI3_KERNEL BLIS_CPACKM_2XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_RI3_KERNEL +#define BLIS_ZPACKM_2XK_RI3_KERNEL BLIS_ZPACKM_2XK_RI3_KERNEL_REF +#endif + +// packm_4xk_ri3 kernels + +#ifndef BLIS_SPACKM_4XK_RI3_KERNEL +#define BLIS_SPACKM_4XK_RI3_KERNEL 
BLIS_SPACKM_4XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_4XK_RI3_KERNEL +#define BLIS_DPACKM_4XK_RI3_KERNEL BLIS_DPACKM_4XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_4XK_RI3_KERNEL +#define BLIS_CPACKM_4XK_RI3_KERNEL BLIS_CPACKM_4XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_RI3_KERNEL +#define BLIS_ZPACKM_4XK_RI3_KERNEL BLIS_ZPACKM_4XK_RI3_KERNEL_REF +#endif + +// packm_6xk_ri3 kernels + +#ifndef BLIS_SPACKM_6XK_RI3_KERNEL +#define BLIS_SPACKM_6XK_RI3_KERNEL BLIS_SPACKM_6XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_6XK_RI3_KERNEL +#define BLIS_DPACKM_6XK_RI3_KERNEL BLIS_DPACKM_6XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_6XK_RI3_KERNEL +#define BLIS_CPACKM_6XK_RI3_KERNEL BLIS_CPACKM_6XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_RI3_KERNEL +#define BLIS_ZPACKM_6XK_RI3_KERNEL BLIS_ZPACKM_6XK_RI3_KERNEL_REF +#endif + +// packm_8xk_ri3 kernels + +#ifndef BLIS_SPACKM_8XK_RI3_KERNEL +#define BLIS_SPACKM_8XK_RI3_KERNEL BLIS_SPACKM_8XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_8XK_RI3_KERNEL +#define BLIS_DPACKM_8XK_RI3_KERNEL BLIS_DPACKM_8XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_8XK_RI3_KERNEL +#define BLIS_CPACKM_8XK_RI3_KERNEL BLIS_CPACKM_8XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_RI3_KERNEL +#define BLIS_ZPACKM_8XK_RI3_KERNEL BLIS_ZPACKM_8XK_RI3_KERNEL_REF +#endif + +// packm_10xk_ri3 kernels + +#ifndef BLIS_SPACKM_10XK_RI3_KERNEL +#define BLIS_SPACKM_10XK_RI3_KERNEL BLIS_SPACKM_10XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_10XK_RI3_KERNEL +#define BLIS_DPACKM_10XK_RI3_KERNEL BLIS_DPACKM_10XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_10XK_RI3_KERNEL +#define BLIS_CPACKM_10XK_RI3_KERNEL BLIS_CPACKM_10XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_RI3_KERNEL +#define BLIS_ZPACKM_10XK_RI3_KERNEL BLIS_ZPACKM_10XK_RI3_KERNEL_REF +#endif + +// packm_12xk_ri3 kernels + +#ifndef BLIS_SPACKM_12XK_RI3_KERNEL +#define BLIS_SPACKM_12XK_RI3_KERNEL BLIS_SPACKM_12XK_RI3_KERNEL_REF +#endif + +#ifndef 
BLIS_DPACKM_12XK_RI3_KERNEL +#define BLIS_DPACKM_12XK_RI3_KERNEL BLIS_DPACKM_12XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_12XK_RI3_KERNEL +#define BLIS_CPACKM_12XK_RI3_KERNEL BLIS_CPACKM_12XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_RI3_KERNEL +#define BLIS_ZPACKM_12XK_RI3_KERNEL BLIS_ZPACKM_12XK_RI3_KERNEL_REF +#endif + +// packm_14xk_ri3 kernels + +#ifndef BLIS_SPACKM_14XK_RI3_KERNEL +#define BLIS_SPACKM_14XK_RI3_KERNEL BLIS_SPACKM_14XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_14XK_RI3_KERNEL +#define BLIS_DPACKM_14XK_RI3_KERNEL BLIS_DPACKM_14XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_14XK_RI3_KERNEL +#define BLIS_CPACKM_14XK_RI3_KERNEL BLIS_CPACKM_14XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_RI3_KERNEL +#define BLIS_ZPACKM_14XK_RI3_KERNEL BLIS_ZPACKM_14XK_RI3_KERNEL_REF +#endif + +// packm_16xk_ri3 kernels + +#ifndef BLIS_SPACKM_16XK_RI3_KERNEL +#define BLIS_SPACKM_16XK_RI3_KERNEL BLIS_SPACKM_16XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_16XK_RI3_KERNEL +#define BLIS_DPACKM_16XK_RI3_KERNEL BLIS_DPACKM_16XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_16XK_RI3_KERNEL +#define BLIS_CPACKM_16XK_RI3_KERNEL BLIS_CPACKM_16XK_RI3_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_RI3_KERNEL +#define BLIS_ZPACKM_16XK_RI3_KERNEL BLIS_ZPACKM_16XK_RI3_KERNEL_REF +#endif + + + +// -- Define default 3m-specific blocksize macros ------------------------------ + +// Define complex 3m register blocksizes in terms of blocksizes used for +// real kernels. 
+ +// 3m register blocksizes +#define BLIS_DEFAULT_3M_MR_C BLIS_DEFAULT_MR_S +#define BLIS_DEFAULT_3M_KR_C BLIS_DEFAULT_KR_S +#define BLIS_DEFAULT_3M_NR_C BLIS_DEFAULT_NR_S + +#define BLIS_DEFAULT_3M_MR_Z BLIS_DEFAULT_MR_D +#define BLIS_DEFAULT_3M_KR_Z BLIS_DEFAULT_KR_D +#define BLIS_DEFAULT_3M_NR_Z BLIS_DEFAULT_NR_D + +// 3m register blocksize extensions +#define BLIS_EXTEND_3M_MR_C BLIS_EXTEND_MR_S +#define BLIS_EXTEND_3M_KR_C 0 +#define BLIS_EXTEND_3M_NR_C BLIS_EXTEND_NR_S + +#define BLIS_EXTEND_3M_MR_Z BLIS_EXTEND_MR_D +#define BLIS_EXTEND_3M_KR_Z 0 +#define BLIS_EXTEND_3M_NR_Z BLIS_EXTEND_NR_D + +// Define complex 3m cache blocksizes in terms of blocksizes used for +// real operations (if they have not yet already been defined). + +// 3m cache blocksizes +#ifndef BLIS_DEFAULT_3M_MC_C +#define BLIS_DEFAULT_3M_MC_C ((BLIS_DEFAULT_MC_S)/1) +#endif +#ifndef BLIS_DEFAULT_3M_KC_C +#define BLIS_DEFAULT_3M_KC_C ((BLIS_DEFAULT_KC_S)/2) +#endif +#ifndef BLIS_DEFAULT_3M_NC_C +#define BLIS_DEFAULT_3M_NC_C ((BLIS_DEFAULT_NC_S)/1) +#endif + +#ifndef BLIS_DEFAULT_3M_MC_Z +#define BLIS_DEFAULT_3M_MC_Z ((BLIS_DEFAULT_MC_D)/1) +#endif +#ifndef BLIS_DEFAULT_3M_KC_Z +#define BLIS_DEFAULT_3M_KC_Z ((BLIS_DEFAULT_KC_D)/2) +#endif +#ifndef BLIS_DEFAULT_3M_NC_Z +#define BLIS_DEFAULT_3M_NC_Z ((BLIS_DEFAULT_NC_D)/1) +#endif + +// 3m cache blocksize extensions +#ifndef BLIS_EXTEND_3M_MC_C +#define BLIS_EXTEND_3M_MC_C 0 +#endif +#ifndef BLIS_EXTEND_3M_KC_C +#define BLIS_EXTEND_3M_KC_C 0 +#endif +#ifndef BLIS_EXTEND_3M_NC_C +#define BLIS_EXTEND_3M_NC_C 0 +#endif + +#ifndef BLIS_EXTEND_3M_MC_Z +#define BLIS_EXTEND_3M_MC_Z 0 +#endif +#ifndef BLIS_EXTEND_3M_KC_Z +#define BLIS_EXTEND_3M_KC_Z 0 +#endif +#ifndef BLIS_EXTEND_3M_NC_Z +#define BLIS_EXTEND_3M_NC_Z 0 +#endif + + + +// -- Kernel blocksize checks -------------------------------------------------- + +// Verify that cache blocksizes are whole multiples of register blocksizes. 
+// Specifically, verify that: +// - MC is a whole multiple of MR *AND* NR. +// - NC is a whole multiple of NR *AND* MR. +// - KC is a whole multiple of KR *AND* both MR, NR. +// These constraints are enforced because it makes it easier to handle diagonals +// in the macro-kernel implementations. + +// +// MC must be a whole multiple of MR and NR. +// +#if ( \ + ( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \ + ( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \ + ) + #error "MC (3m) must be multiple of MR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_3M_MC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \ + ( BLIS_DEFAULT_3M_MC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \ + ) + #error "MC (3m) must be multiple of NR for all datatypes." +#endif + +// +// NC must be a whole multiple of NR and MR. +// +#if ( \ + ( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \ + ( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \ + ) + #error "NC (3m) must be multiple of NR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_3M_NC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \ + ( BLIS_DEFAULT_3M_NC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \ + ) + #error "NC (3m) must be multiple of MR for all datatypes." +#endif + +// +// KC must be a whole multiple of KR, MR, and NR. +// +#if ( \ + ( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_KR_C != 0 ) || \ + ( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_KR_Z != 0 ) \ + ) + #error "KC (3m) must be multiple of KR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_MR_C != 0 ) || \ + ( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_MR_Z != 0 ) \ + ) + #error "KC (3m) must be multiple of MR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_3M_KC_C % BLIS_DEFAULT_3M_NR_C != 0 ) || \ + ( BLIS_DEFAULT_3M_KC_Z % BLIS_DEFAULT_3M_NR_Z != 0 ) \ + ) + #error "KC (3m) must be multiple of NR for all datatypes." 
+#endif + + + +// -- Compute extended blocksizes ---------------------------------------------- + +// +// Compute maximum cache blocksizes. +// + +#define BLIS_MAXIMUM_3M_MC_C ( BLIS_DEFAULT_3M_MC_C + BLIS_EXTEND_3M_MC_C ) +#define BLIS_MAXIMUM_3M_KC_C ( BLIS_DEFAULT_3M_KC_C + BLIS_EXTEND_3M_KC_C ) +#define BLIS_MAXIMUM_3M_NC_C ( BLIS_DEFAULT_3M_NC_C + BLIS_EXTEND_3M_NC_C ) + +#define BLIS_MAXIMUM_3M_MC_Z ( BLIS_DEFAULT_3M_MC_Z + BLIS_EXTEND_3M_MC_Z ) +#define BLIS_MAXIMUM_3M_KC_Z ( BLIS_DEFAULT_3M_KC_Z + BLIS_EXTEND_3M_KC_Z ) +#define BLIS_MAXIMUM_3M_NC_Z ( BLIS_DEFAULT_3M_NC_Z + BLIS_EXTEND_3M_NC_Z ) + +// +// Compute leading dimension blocksizes used when packing micro-panels. +// + +#define BLIS_PACKDIM_3M_MR_C ( BLIS_DEFAULT_3M_MR_C + BLIS_EXTEND_3M_MR_C ) +#define BLIS_PACKDIM_3M_KR_C ( BLIS_DEFAULT_3M_KR_C + BLIS_EXTEND_3M_KR_C ) +#define BLIS_PACKDIM_3M_NR_C ( BLIS_DEFAULT_3M_NR_C + BLIS_EXTEND_3M_NR_C ) + +#define BLIS_PACKDIM_3M_MR_Z ( BLIS_DEFAULT_3M_MR_Z + BLIS_EXTEND_3M_MR_Z ) +#define BLIS_PACKDIM_3M_KR_Z ( BLIS_DEFAULT_3M_KR_Z + BLIS_EXTEND_3M_KR_Z ) +#define BLIS_PACKDIM_3M_NR_Z ( BLIS_DEFAULT_3M_NR_Z + BLIS_EXTEND_3M_NR_Z ) + + + +#endif diff --git a/frame/include/bli_kernel_4m_macro_defs.h b/frame/include/bli_kernel_4m_macro_defs.h new file mode 100644 index 000000000..bb0cbd379 --- /dev/null +++ b/frame/include/bli_kernel_4m_macro_defs.h @@ -0,0 +1,416 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_4M_MACRO_DEFS_H +#define BLIS_KERNEL_4M_MACRO_DEFS_H + + +// -- Construct 4m kernel function names --------------------------------------- + +// +// Level-3 4m +// + +// gemm4m micro-kernels + +#ifndef BLIS_CGEMM4M_UKERNEL +#define BLIS_CGEMM4M_UKERNEL BLIS_CGEMM4M_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMM4M_UKERNEL +#define BLIS_ZGEMM4M_UKERNEL BLIS_ZGEMM4M_UKERNEL_REF +#endif + +// gemmtrsm4m_l micro-kernels + +#ifndef BLIS_CGEMMTRSM4M_L_UKERNEL +#define BLIS_CGEMMTRSM4M_L_UKERNEL BLIS_CGEMMTRSM4M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM4M_L_UKERNEL +#define BLIS_ZGEMMTRSM4M_L_UKERNEL BLIS_ZGEMMTRSM4M_L_UKERNEL_REF +#endif + +// gemmtrsm4m_u micro-kernels + +#ifndef BLIS_CGEMMTRSM4M_U_UKERNEL +#define BLIS_CGEMMTRSM4M_U_UKERNEL BLIS_CGEMMTRSM4M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM4M_U_UKERNEL +#define BLIS_ZGEMMTRSM4M_U_UKERNEL BLIS_ZGEMMTRSM4M_U_UKERNEL_REF +#endif + +// trsm4m_l micro-kernels + +#ifndef BLIS_CTRSM4M_L_UKERNEL +#define BLIS_CTRSM4M_L_UKERNEL BLIS_CTRSM4M_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM4M_L_UKERNEL +#define BLIS_ZTRSM4M_L_UKERNEL BLIS_ZTRSM4M_L_UKERNEL_REF +#endif + +// trsm4m_u micro-kernels + +#ifndef BLIS_CTRSM4M_U_UKERNEL +#define BLIS_CTRSM4M_U_UKERNEL BLIS_CTRSM4M_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM4M_U_UKERNEL +#define BLIS_ZTRSM4M_U_UKERNEL BLIS_ZTRSM4M_U_UKERNEL_REF +#endif + +// +// Level-1m +// + +// packm_2xk_ri kernels + +#ifndef BLIS_SPACKM_2XK_RI_KERNEL +#define BLIS_SPACKM_2XK_RI_KERNEL BLIS_SPACKM_2XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_2XK_RI_KERNEL +#define BLIS_DPACKM_2XK_RI_KERNEL BLIS_DPACKM_2XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_2XK_RI_KERNEL +#define BLIS_CPACKM_2XK_RI_KERNEL BLIS_CPACKM_2XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_RI_KERNEL +#define BLIS_ZPACKM_2XK_RI_KERNEL BLIS_ZPACKM_2XK_RI_KERNEL_REF +#endif + +// packm_4xk_ri kernels + +#ifndef BLIS_SPACKM_4XK_RI_KERNEL +#define BLIS_SPACKM_4XK_RI_KERNEL 
BLIS_SPACKM_4XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_4XK_RI_KERNEL +#define BLIS_DPACKM_4XK_RI_KERNEL BLIS_DPACKM_4XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_4XK_RI_KERNEL +#define BLIS_CPACKM_4XK_RI_KERNEL BLIS_CPACKM_4XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_RI_KERNEL +#define BLIS_ZPACKM_4XK_RI_KERNEL BLIS_ZPACKM_4XK_RI_KERNEL_REF +#endif + +// packm_6xk_ri kernels + +#ifndef BLIS_SPACKM_6XK_RI_KERNEL +#define BLIS_SPACKM_6XK_RI_KERNEL BLIS_SPACKM_6XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_6XK_RI_KERNEL +#define BLIS_DPACKM_6XK_RI_KERNEL BLIS_DPACKM_6XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_6XK_RI_KERNEL +#define BLIS_CPACKM_6XK_RI_KERNEL BLIS_CPACKM_6XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_RI_KERNEL +#define BLIS_ZPACKM_6XK_RI_KERNEL BLIS_ZPACKM_6XK_RI_KERNEL_REF +#endif + +// packm_8xk_ri kernels + +#ifndef BLIS_SPACKM_8XK_RI_KERNEL +#define BLIS_SPACKM_8XK_RI_KERNEL BLIS_SPACKM_8XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_8XK_RI_KERNEL +#define BLIS_DPACKM_8XK_RI_KERNEL BLIS_DPACKM_8XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_8XK_RI_KERNEL +#define BLIS_CPACKM_8XK_RI_KERNEL BLIS_CPACKM_8XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_RI_KERNEL +#define BLIS_ZPACKM_8XK_RI_KERNEL BLIS_ZPACKM_8XK_RI_KERNEL_REF +#endif + +// packm_10xk_ri kernels + +#ifndef BLIS_SPACKM_10XK_RI_KERNEL +#define BLIS_SPACKM_10XK_RI_KERNEL BLIS_SPACKM_10XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_10XK_RI_KERNEL +#define BLIS_DPACKM_10XK_RI_KERNEL BLIS_DPACKM_10XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_10XK_RI_KERNEL +#define BLIS_CPACKM_10XK_RI_KERNEL BLIS_CPACKM_10XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_RI_KERNEL +#define BLIS_ZPACKM_10XK_RI_KERNEL BLIS_ZPACKM_10XK_RI_KERNEL_REF +#endif + +// packm_12xk_ri kernels + +#ifndef BLIS_SPACKM_12XK_RI_KERNEL +#define BLIS_SPACKM_12XK_RI_KERNEL BLIS_SPACKM_12XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_12XK_RI_KERNEL +#define BLIS_DPACKM_12XK_RI_KERNEL 
BLIS_DPACKM_12XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_12XK_RI_KERNEL +#define BLIS_CPACKM_12XK_RI_KERNEL BLIS_CPACKM_12XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_RI_KERNEL +#define BLIS_ZPACKM_12XK_RI_KERNEL BLIS_ZPACKM_12XK_RI_KERNEL_REF +#endif + +// packm_14xk_ri kernels + +#ifndef BLIS_SPACKM_14XK_RI_KERNEL +#define BLIS_SPACKM_14XK_RI_KERNEL BLIS_SPACKM_14XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_14XK_RI_KERNEL +#define BLIS_DPACKM_14XK_RI_KERNEL BLIS_DPACKM_14XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_14XK_RI_KERNEL +#define BLIS_CPACKM_14XK_RI_KERNEL BLIS_CPACKM_14XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_RI_KERNEL +#define BLIS_ZPACKM_14XK_RI_KERNEL BLIS_ZPACKM_14XK_RI_KERNEL_REF +#endif + +// packm_16xk_ri kernels + +#ifndef BLIS_SPACKM_16XK_RI_KERNEL +#define BLIS_SPACKM_16XK_RI_KERNEL BLIS_SPACKM_16XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_16XK_RI_KERNEL +#define BLIS_DPACKM_16XK_RI_KERNEL BLIS_DPACKM_16XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_16XK_RI_KERNEL +#define BLIS_CPACKM_16XK_RI_KERNEL BLIS_CPACKM_16XK_RI_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_RI_KERNEL +#define BLIS_ZPACKM_16XK_RI_KERNEL BLIS_ZPACKM_16XK_RI_KERNEL_REF +#endif + + + +// -- Define default 4m-specific blocksize macros ------------------------------ + +// Define complex 4m register blocksizes in terms of blocksizes used for +// real kernels. 
+ +// 4m register blocksizes +#define BLIS_DEFAULT_4M_MR_C BLIS_DEFAULT_MR_S +#define BLIS_DEFAULT_4M_KR_C BLIS_DEFAULT_KR_S +#define BLIS_DEFAULT_4M_NR_C BLIS_DEFAULT_NR_S + +#define BLIS_DEFAULT_4M_MR_Z BLIS_DEFAULT_MR_D +#define BLIS_DEFAULT_4M_KR_Z BLIS_DEFAULT_KR_D +#define BLIS_DEFAULT_4M_NR_Z BLIS_DEFAULT_NR_D + +// 4m register blocksize extensions +#define BLIS_EXTEND_4M_MR_C BLIS_EXTEND_MR_S +#define BLIS_EXTEND_4M_KR_C 0 +#define BLIS_EXTEND_4M_NR_C BLIS_EXTEND_NR_S + +#define BLIS_EXTEND_4M_MR_Z BLIS_EXTEND_MR_D +#define BLIS_EXTEND_4M_KR_Z 0 +#define BLIS_EXTEND_4M_NR_Z BLIS_EXTEND_NR_D + +// Define complex 4m cache blocksizes in terms of blocksizes used for +// real operations (if they have not yet already been defined). + +// 4m cache blocksizes +#ifndef BLIS_DEFAULT_4M_MC_C +#define BLIS_DEFAULT_4M_MC_C ((BLIS_DEFAULT_MC_S)/1) +#endif +#ifndef BLIS_DEFAULT_4M_KC_C +#define BLIS_DEFAULT_4M_KC_C ((BLIS_DEFAULT_KC_S)/2) +#endif +#ifndef BLIS_DEFAULT_4M_NC_C +#define BLIS_DEFAULT_4M_NC_C ((BLIS_DEFAULT_NC_S)/1) +#endif + +#ifndef BLIS_DEFAULT_4M_MC_Z +#define BLIS_DEFAULT_4M_MC_Z ((BLIS_DEFAULT_MC_D)/1) +#endif +#ifndef BLIS_DEFAULT_4M_KC_Z +#define BLIS_DEFAULT_4M_KC_Z ((BLIS_DEFAULT_KC_D)/2) +#endif +#ifndef BLIS_DEFAULT_4M_NC_Z +#define BLIS_DEFAULT_4M_NC_Z ((BLIS_DEFAULT_NC_D)/1) +#endif + +// 4m cache blocksize extensions +#ifndef BLIS_EXTEND_4M_MC_C +#define BLIS_EXTEND_4M_MC_C 0 +#endif +#ifndef BLIS_EXTEND_4M_KC_C +#define BLIS_EXTEND_4M_KC_C 0 +#endif +#ifndef BLIS_EXTEND_4M_NC_C +#define BLIS_EXTEND_4M_NC_C 0 +#endif + +#ifndef BLIS_EXTEND_4M_MC_Z +#define BLIS_EXTEND_4M_MC_Z 0 +#endif +#ifndef BLIS_EXTEND_4M_KC_Z +#define BLIS_EXTEND_4M_KC_Z 0 +#endif +#ifndef BLIS_EXTEND_4M_NC_Z +#define BLIS_EXTEND_4M_NC_Z 0 +#endif + + + +// -- Kernel blocksize checks -------------------------------------------------- + +// Verify that cache blocksizes are whole multiples of register blocksizes. 
+// Specifically, verify that: +// - MC is a whole multiple of MR *AND* NR. +// - NC is a whole multiple of NR *AND* MR. +// - KC is a whole multiple of KR *AND* both MR, NR. +// These constraints are enforced because it makes it easier to handle diagonals +// in the macro-kernel implementations. + +// +// MC must be a whole multiple of MR and NR. +// + +#if ( \ + ( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \ + ( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \ + ) + #error "MC (4m) must be multiple of MR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_4M_MC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \ + ( BLIS_DEFAULT_4M_MC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \ + ) + #error "MC (4m) must be multiple of NR for all datatypes." +#endif + +// +// NC must be a whole multiple of NR and MR. +// + +#if ( \ + ( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \ + ( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \ + ) + #error "NC (4m) must be multiple of NR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_4M_NC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \ + ( BLIS_DEFAULT_4M_NC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \ + ) + #error "NC (4m) must be multiple of MR for all datatypes." +#endif + +// +// KC must be a whole multiple of KR, MR, and NR. +// + +#if ( \ + ( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_KR_C != 0 ) || \ + ( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_KR_Z != 0 ) \ + ) + #error "KC (4m) must be multiple of KR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_MR_C != 0 ) || \ + ( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_MR_Z != 0 ) \ + ) + #error "KC (4m) must be multiple of MR for all datatypes." +#endif + +#if ( \ + ( BLIS_DEFAULT_4M_KC_C % BLIS_DEFAULT_4M_NR_C != 0 ) || \ + ( BLIS_DEFAULT_4M_KC_Z % BLIS_DEFAULT_4M_NR_Z != 0 ) \ + ) + #error "KC (4m) must be multiple of NR for all datatypes." 
+#endif + + + +// -- Compute extended blocksizes ---------------------------------------------- + +// +// Compute maximum cache blocksizes. +// + +#define BLIS_MAXIMUM_4M_MC_C ( BLIS_DEFAULT_4M_MC_C + BLIS_EXTEND_4M_MC_C ) +#define BLIS_MAXIMUM_4M_KC_C ( BLIS_DEFAULT_4M_KC_C + BLIS_EXTEND_4M_KC_C ) +#define BLIS_MAXIMUM_4M_NC_C ( BLIS_DEFAULT_4M_NC_C + BLIS_EXTEND_4M_NC_C ) + +#define BLIS_MAXIMUM_4M_MC_Z ( BLIS_DEFAULT_4M_MC_Z + BLIS_EXTEND_4M_MC_Z ) +#define BLIS_MAXIMUM_4M_KC_Z ( BLIS_DEFAULT_4M_KC_Z + BLIS_EXTEND_4M_KC_Z ) +#define BLIS_MAXIMUM_4M_NC_Z ( BLIS_DEFAULT_4M_NC_Z + BLIS_EXTEND_4M_NC_Z ) + +// +// Compute leading dimension blocksizes used when packing micro-panels. +// + +#define BLIS_PACKDIM_4M_MR_C ( BLIS_DEFAULT_4M_MR_C + BLIS_EXTEND_4M_MR_C ) +#define BLIS_PACKDIM_4M_KR_C ( BLIS_DEFAULT_4M_KR_C + BLIS_EXTEND_4M_KR_C ) +#define BLIS_PACKDIM_4M_NR_C ( BLIS_DEFAULT_4M_NR_C + BLIS_EXTEND_4M_NR_C ) + +#define BLIS_PACKDIM_4M_MR_Z ( BLIS_DEFAULT_4M_MR_Z + BLIS_EXTEND_4M_MR_Z ) +#define BLIS_PACKDIM_4M_KR_Z ( BLIS_DEFAULT_4M_KR_Z + BLIS_EXTEND_4M_KR_Z ) +#define BLIS_PACKDIM_4M_NR_Z ( BLIS_DEFAULT_4M_NR_Z + BLIS_EXTEND_4M_NR_Z ) + + + +#endif diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 68fd8a64e..1381babb0 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -36,114 +36,1127 @@ #define BLIS_KERNEL_MACRO_DEFS_H -// -- Construct base kernel names ---------------------------------------------- - -// If any datatype-specific base name for a given micro-kernel does not -// exist, default to using the datatype-agnostic base name. 
- -// gemm micro-kernel base names -#ifndef SGEMM_UKERNEL -#define SGEMM_UKERNEL GEMM_UKERNEL -#endif -#ifndef DGEMM_UKERNEL -#define DGEMM_UKERNEL GEMM_UKERNEL -#endif -#ifndef CGEMM_UKERNEL -#define CGEMM_UKERNEL GEMM_UKERNEL -#endif -#ifndef ZGEMM_UKERNEL -#define ZGEMM_UKERNEL GEMM_UKERNEL -#endif - -// trsm_l micro-kernel base names -#ifndef STRSM_L_UKERNEL -#define STRSM_L_UKERNEL TRSM_L_UKERNEL -#endif -#ifndef DTRSM_L_UKERNEL -#define DTRSM_L_UKERNEL TRSM_L_UKERNEL -#endif -#ifndef CTRSM_L_UKERNEL -#define CTRSM_L_UKERNEL TRSM_L_UKERNEL -#endif -#ifndef ZTRSM_L_UKERNEL -#define ZTRSM_L_UKERNEL TRSM_L_UKERNEL -#endif - -// trsm_u micro-kernel base names -#ifndef STRSM_U_UKERNEL -#define STRSM_U_UKERNEL TRSM_U_UKERNEL -#endif -#ifndef DTRSM_U_UKERNEL -#define DTRSM_U_UKERNEL TRSM_U_UKERNEL -#endif -#ifndef CTRSM_U_UKERNEL -#define CTRSM_U_UKERNEL TRSM_U_UKERNEL -#endif -#ifndef ZTRSM_U_UKERNEL -#define ZTRSM_U_UKERNEL TRSM_U_UKERNEL -#endif - -// gemmtrsm_l micro-kernel base names -#ifndef SGEMMTRSM_L_UKERNEL -#define SGEMMTRSM_L_UKERNEL GEMMTRSM_L_UKERNEL -#endif -#ifndef DGEMMTRSM_L_UKERNEL -#define DGEMMTRSM_L_UKERNEL GEMMTRSM_L_UKERNEL -#endif -#ifndef CGEMMTRSM_L_UKERNEL -#define CGEMMTRSM_L_UKERNEL GEMMTRSM_L_UKERNEL -#endif -#ifndef ZGEMMTRSM_L_UKERNEL -#define ZGEMMTRSM_L_UKERNEL GEMMTRSM_L_UKERNEL -#endif - -// gemmtrsm_u micro-kernel base names -#ifndef SGEMMTRSM_U_UKERNEL -#define SGEMMTRSM_U_UKERNEL GEMMTRSM_U_UKERNEL -#endif -#ifndef DGEMMTRSM_U_UKERNEL -#define DGEMMTRSM_U_UKERNEL GEMMTRSM_U_UKERNEL -#endif -#ifndef CGEMMTRSM_U_UKERNEL -#define CGEMMTRSM_U_UKERNEL GEMMTRSM_U_UKERNEL -#endif -#ifndef ZGEMMTRSM_U_UKERNEL -#define ZGEMMTRSM_U_UKERNEL GEMMTRSM_U_UKERNEL -#endif - - // -- Construct kernel function names ------------------------------------------ -// For each datatype-specific micro-kernel base name, construct the full -// function name of the corresponding micro-kernel. 
+// In this section we consider each datatype-specific micro-kernel macro; +// if it is undefined, we define it to be the corresponding reference kernel. +// In the case of complex gemm micro-kernels, we also define special _VIA_4M +// macros so that later on we can tell whether or not to employ the 4m +// implementations. Note that in order to properly determine whether 4m is a +// viable option, we need to be able to test the existence of the real gemm +// micro-kernels, which means we must consider the complex gemm micro-kernel +// cases *BEFORE* the real cases. -#define BLIS_SGEMM_UKERNEL PASTEMAC(s,SGEMM_UKERNEL) -#define BLIS_DGEMM_UKERNEL PASTEMAC(d,DGEMM_UKERNEL) -#define BLIS_CGEMM_UKERNEL PASTEMAC(c,CGEMM_UKERNEL) -#define BLIS_ZGEMM_UKERNEL PASTEMAC(z,ZGEMM_UKERNEL) +// +// Level-3 +// -#define BLIS_STRSM_L_UKERNEL PASTEMAC(s,STRSM_L_UKERNEL) -#define BLIS_DTRSM_L_UKERNEL PASTEMAC(d,DTRSM_L_UKERNEL) -#define BLIS_CTRSM_L_UKERNEL PASTEMAC(c,CTRSM_L_UKERNEL) -#define BLIS_ZTRSM_L_UKERNEL PASTEMAC(z,ZTRSM_L_UKERNEL) +// gemm micro-kernels -#define BLIS_STRSM_U_UKERNEL PASTEMAC(s,STRSM_U_UKERNEL) -#define BLIS_DTRSM_U_UKERNEL PASTEMAC(d,DTRSM_U_UKERNEL) -#define BLIS_CTRSM_U_UKERNEL PASTEMAC(c,CTRSM_U_UKERNEL) -#define BLIS_ZTRSM_U_UKERNEL PASTEMAC(z,ZTRSM_U_UKERNEL) +#ifndef BLIS_CGEMM_UKERNEL +#define BLIS_CGEMM_UKERNEL BLIS_CGEMM_UKERNEL_REF +#ifdef BLIS_SGEMM_UKERNEL +#define BLIS_ENABLE_SCOMPLEX_VIA_4M +#endif +#endif -#define BLIS_SGEMMTRSM_L_UKERNEL PASTEMAC(s,SGEMMTRSM_L_UKERNEL) -#define BLIS_DGEMMTRSM_L_UKERNEL PASTEMAC(d,DGEMMTRSM_L_UKERNEL) -#define BLIS_CGEMMTRSM_L_UKERNEL PASTEMAC(c,CGEMMTRSM_L_UKERNEL) -#define BLIS_ZGEMMTRSM_L_UKERNEL PASTEMAC(z,ZGEMMTRSM_L_UKERNEL) +#ifndef BLIS_ZGEMM_UKERNEL +#define BLIS_ZGEMM_UKERNEL BLIS_ZGEMM_UKERNEL_REF +#ifdef BLIS_DGEMM_UKERNEL +#define BLIS_ENABLE_DCOMPLEX_VIA_4M +#endif +#endif -#define BLIS_SGEMMTRSM_U_UKERNEL PASTEMAC(s,SGEMMTRSM_U_UKERNEL) -#define BLIS_DGEMMTRSM_U_UKERNEL 
PASTEMAC(d,DGEMMTRSM_U_UKERNEL) -#define BLIS_CGEMMTRSM_U_UKERNEL PASTEMAC(c,CGEMMTRSM_U_UKERNEL) -#define BLIS_ZGEMMTRSM_U_UKERNEL PASTEMAC(z,ZGEMMTRSM_U_UKERNEL) +#ifndef BLIS_SGEMM_UKERNEL +#define BLIS_SGEMM_UKERNEL BLIS_SGEMM_UKERNEL_REF +#endif + +#ifndef BLIS_DGEMM_UKERNEL +#define BLIS_DGEMM_UKERNEL BLIS_DGEMM_UKERNEL_REF +#endif + +// gemmtrsm_l micro-kernels + +#ifndef BLIS_SGEMMTRSM_L_UKERNEL +#define BLIS_SGEMMTRSM_L_UKERNEL BLIS_SGEMMTRSM_L_UKERNEL_REF +#endif + +#ifndef BLIS_DGEMMTRSM_L_UKERNEL +#define BLIS_DGEMMTRSM_L_UKERNEL BLIS_DGEMMTRSM_L_UKERNEL_REF +#endif + +#ifndef BLIS_CGEMMTRSM_L_UKERNEL +#define BLIS_CGEMMTRSM_L_UKERNEL BLIS_CGEMMTRSM_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM_L_UKERNEL +#define BLIS_ZGEMMTRSM_L_UKERNEL BLIS_ZGEMMTRSM_L_UKERNEL_REF +#endif + +// gemmtrsm_u micro-kernels + +#ifndef BLIS_SGEMMTRSM_U_UKERNEL +#define BLIS_SGEMMTRSM_U_UKERNEL BLIS_SGEMMTRSM_U_UKERNEL_REF +#endif + +#ifndef BLIS_DGEMMTRSM_U_UKERNEL +#define BLIS_DGEMMTRSM_U_UKERNEL BLIS_DGEMMTRSM_U_UKERNEL_REF +#endif + +#ifndef BLIS_CGEMMTRSM_U_UKERNEL +#define BLIS_CGEMMTRSM_U_UKERNEL BLIS_CGEMMTRSM_U_UKERNEL_REF +#endif + +#ifndef BLIS_ZGEMMTRSM_U_UKERNEL +#define BLIS_ZGEMMTRSM_U_UKERNEL BLIS_ZGEMMTRSM_U_UKERNEL_REF +#endif + +// trsm_l micro-kernels + +#ifndef BLIS_STRSM_L_UKERNEL +#define BLIS_STRSM_L_UKERNEL BLIS_STRSM_L_UKERNEL_REF +#endif + +#ifndef BLIS_DTRSM_L_UKERNEL +#define BLIS_DTRSM_L_UKERNEL BLIS_DTRSM_L_UKERNEL_REF +#endif + +#ifndef BLIS_CTRSM_L_UKERNEL +#define BLIS_CTRSM_L_UKERNEL BLIS_CTRSM_L_UKERNEL_REF +#endif + +#ifndef BLIS_ZTRSM_L_UKERNEL +#define BLIS_ZTRSM_L_UKERNEL BLIS_ZTRSM_L_UKERNEL_REF +#endif + +// trsm_u micro-kernels + +#ifndef BLIS_STRSM_U_UKERNEL +#define BLIS_STRSM_U_UKERNEL BLIS_STRSM_U_UKERNEL_REF +#endif + +#ifndef BLIS_DTRSM_U_UKERNEL +#define BLIS_DTRSM_U_UKERNEL BLIS_DTRSM_U_UKERNEL_REF +#endif + +#ifndef BLIS_CTRSM_U_UKERNEL +#define BLIS_CTRSM_U_UKERNEL BLIS_CTRSM_U_UKERNEL_REF +#endif + +#ifndef 
BLIS_ZTRSM_U_UKERNEL +#define BLIS_ZTRSM_U_UKERNEL BLIS_ZTRSM_U_UKERNEL_REF +#endif + +// +// Level-1m +// + +// packm_2xk kernels + +#ifndef BLIS_SPACKM_2XK_KERNEL +#define BLIS_SPACKM_2XK_KERNEL BLIS_SPACKM_2XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_2XK_KERNEL +#define BLIS_DPACKM_2XK_KERNEL BLIS_DPACKM_2XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_2XK_KERNEL +#define BLIS_CPACKM_2XK_KERNEL BLIS_CPACKM_2XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_2XK_KERNEL +#define BLIS_ZPACKM_2XK_KERNEL BLIS_ZPACKM_2XK_KERNEL_REF +#endif + +// packm_4xk kernels + +#ifndef BLIS_SPACKM_4XK_KERNEL +#define BLIS_SPACKM_4XK_KERNEL BLIS_SPACKM_4XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_4XK_KERNEL +#define BLIS_DPACKM_4XK_KERNEL BLIS_DPACKM_4XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_4XK_KERNEL +#define BLIS_CPACKM_4XK_KERNEL BLIS_CPACKM_4XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_4XK_KERNEL +#define BLIS_ZPACKM_4XK_KERNEL BLIS_ZPACKM_4XK_KERNEL_REF +#endif + +// packm_6xk kernels + +#ifndef BLIS_SPACKM_6XK_KERNEL +#define BLIS_SPACKM_6XK_KERNEL BLIS_SPACKM_6XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_6XK_KERNEL +#define BLIS_DPACKM_6XK_KERNEL BLIS_DPACKM_6XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_6XK_KERNEL +#define BLIS_CPACKM_6XK_KERNEL BLIS_CPACKM_6XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_6XK_KERNEL +#define BLIS_ZPACKM_6XK_KERNEL BLIS_ZPACKM_6XK_KERNEL_REF +#endif + +// packm_8xk kernels + +#ifndef BLIS_SPACKM_8XK_KERNEL +#define BLIS_SPACKM_8XK_KERNEL BLIS_SPACKM_8XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_8XK_KERNEL +#define BLIS_DPACKM_8XK_KERNEL BLIS_DPACKM_8XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_8XK_KERNEL +#define BLIS_CPACKM_8XK_KERNEL BLIS_CPACKM_8XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_8XK_KERNEL +#define BLIS_ZPACKM_8XK_KERNEL BLIS_ZPACKM_8XK_KERNEL_REF +#endif + +// packm_10xk kernels + +#ifndef BLIS_SPACKM_10XK_KERNEL +#define BLIS_SPACKM_10XK_KERNEL BLIS_SPACKM_10XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_10XK_KERNEL +#define 
BLIS_DPACKM_10XK_KERNEL BLIS_DPACKM_10XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_10XK_KERNEL +#define BLIS_CPACKM_10XK_KERNEL BLIS_CPACKM_10XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_10XK_KERNEL +#define BLIS_ZPACKM_10XK_KERNEL BLIS_ZPACKM_10XK_KERNEL_REF +#endif + +// packm_12xk kernels + +#ifndef BLIS_SPACKM_12XK_KERNEL +#define BLIS_SPACKM_12XK_KERNEL BLIS_SPACKM_12XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_12XK_KERNEL +#define BLIS_DPACKM_12XK_KERNEL BLIS_DPACKM_12XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_12XK_KERNEL +#define BLIS_CPACKM_12XK_KERNEL BLIS_CPACKM_12XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_12XK_KERNEL +#define BLIS_ZPACKM_12XK_KERNEL BLIS_ZPACKM_12XK_KERNEL_REF +#endif + +// packm_14xk kernels + +#ifndef BLIS_SPACKM_14XK_KERNEL +#define BLIS_SPACKM_14XK_KERNEL BLIS_SPACKM_14XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_14XK_KERNEL +#define BLIS_DPACKM_14XK_KERNEL BLIS_DPACKM_14XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_14XK_KERNEL +#define BLIS_CPACKM_14XK_KERNEL BLIS_CPACKM_14XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_14XK_KERNEL +#define BLIS_ZPACKM_14XK_KERNEL BLIS_ZPACKM_14XK_KERNEL_REF +#endif + +// packm_16xk kernels + +#ifndef BLIS_SPACKM_16XK_KERNEL +#define BLIS_SPACKM_16XK_KERNEL BLIS_SPACKM_16XK_KERNEL_REF +#endif + +#ifndef BLIS_DPACKM_16XK_KERNEL +#define BLIS_DPACKM_16XK_KERNEL BLIS_DPACKM_16XK_KERNEL_REF +#endif + +#ifndef BLIS_CPACKM_16XK_KERNEL +#define BLIS_CPACKM_16XK_KERNEL BLIS_CPACKM_16XK_KERNEL_REF +#endif + +#ifndef BLIS_ZPACKM_16XK_KERNEL +#define BLIS_ZPACKM_16XK_KERNEL BLIS_ZPACKM_16XK_KERNEL_REF +#endif + +// unpackm_2xk kernels + +#ifndef BLIS_SUNPACKM_2XK_KERNEL +#define BLIS_SUNPACKM_2XK_KERNEL BLIS_SUNPACKM_2XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_2XK_KERNEL +#define BLIS_DUNPACKM_2XK_KERNEL BLIS_DUNPACKM_2XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_2XK_KERNEL +#define BLIS_CUNPACKM_2XK_KERNEL BLIS_CUNPACKM_2XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_2XK_KERNEL +#define 
BLIS_ZUNPACKM_2XK_KERNEL BLIS_ZUNPACKM_2XK_KERNEL_REF +#endif + +// unpackm_4xk kernels + +#ifndef BLIS_SUNPACKM_4XK_KERNEL +#define BLIS_SUNPACKM_4XK_KERNEL BLIS_SUNPACKM_4XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_4XK_KERNEL +#define BLIS_DUNPACKM_4XK_KERNEL BLIS_DUNPACKM_4XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_4XK_KERNEL +#define BLIS_CUNPACKM_4XK_KERNEL BLIS_CUNPACKM_4XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_4XK_KERNEL +#define BLIS_ZUNPACKM_4XK_KERNEL BLIS_ZUNPACKM_4XK_KERNEL_REF +#endif + +// unpackm_6xk kernels + +#ifndef BLIS_SUNPACKM_6XK_KERNEL +#define BLIS_SUNPACKM_6XK_KERNEL BLIS_SUNPACKM_6XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_6XK_KERNEL +#define BLIS_DUNPACKM_6XK_KERNEL BLIS_DUNPACKM_6XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_6XK_KERNEL +#define BLIS_CUNPACKM_6XK_KERNEL BLIS_CUNPACKM_6XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_6XK_KERNEL +#define BLIS_ZUNPACKM_6XK_KERNEL BLIS_ZUNPACKM_6XK_KERNEL_REF +#endif + +// unpackm_8xk kernels + +#ifndef BLIS_SUNPACKM_8XK_KERNEL +#define BLIS_SUNPACKM_8XK_KERNEL BLIS_SUNPACKM_8XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_8XK_KERNEL +#define BLIS_DUNPACKM_8XK_KERNEL BLIS_DUNPACKM_8XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_8XK_KERNEL +#define BLIS_CUNPACKM_8XK_KERNEL BLIS_CUNPACKM_8XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_8XK_KERNEL +#define BLIS_ZUNPACKM_8XK_KERNEL BLIS_ZUNPACKM_8XK_KERNEL_REF +#endif + +// unpackm_10xk kernels + +#ifndef BLIS_SUNPACKM_10XK_KERNEL +#define BLIS_SUNPACKM_10XK_KERNEL BLIS_SUNPACKM_10XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_10XK_KERNEL +#define BLIS_DUNPACKM_10XK_KERNEL BLIS_DUNPACKM_10XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_10XK_KERNEL +#define BLIS_CUNPACKM_10XK_KERNEL BLIS_CUNPACKM_10XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_10XK_KERNEL +#define BLIS_ZUNPACKM_10XK_KERNEL BLIS_ZUNPACKM_10XK_KERNEL_REF +#endif + +// unpackm_12xk kernels + +#ifndef BLIS_SUNPACKM_12XK_KERNEL +#define BLIS_SUNPACKM_12XK_KERNEL 
BLIS_SUNPACKM_12XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_12XK_KERNEL +#define BLIS_DUNPACKM_12XK_KERNEL BLIS_DUNPACKM_12XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_12XK_KERNEL +#define BLIS_CUNPACKM_12XK_KERNEL BLIS_CUNPACKM_12XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_12XK_KERNEL +#define BLIS_ZUNPACKM_12XK_KERNEL BLIS_ZUNPACKM_12XK_KERNEL_REF +#endif + +// unpackm_14xk kernels + +#ifndef BLIS_SUNPACKM_14XK_KERNEL +#define BLIS_SUNPACKM_14XK_KERNEL BLIS_SUNPACKM_14XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_14XK_KERNEL +#define BLIS_DUNPACKM_14XK_KERNEL BLIS_DUNPACKM_14XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_14XK_KERNEL +#define BLIS_CUNPACKM_14XK_KERNEL BLIS_CUNPACKM_14XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_14XK_KERNEL +#define BLIS_ZUNPACKM_14XK_KERNEL BLIS_ZUNPACKM_14XK_KERNEL_REF +#endif + +// unpackm_16xk kernels + +#ifndef BLIS_SUNPACKM_16XK_KERNEL +#define BLIS_SUNPACKM_16XK_KERNEL BLIS_SUNPACKM_16XK_KERNEL_REF +#endif + +#ifndef BLIS_DUNPACKM_16XK_KERNEL +#define BLIS_DUNPACKM_16XK_KERNEL BLIS_DUNPACKM_16XK_KERNEL_REF +#endif + +#ifndef BLIS_CUNPACKM_16XK_KERNEL +#define BLIS_CUNPACKM_16XK_KERNEL BLIS_CUNPACKM_16XK_KERNEL_REF +#endif + +#ifndef BLIS_ZUNPACKM_16XK_KERNEL +#define BLIS_ZUNPACKM_16XK_KERNEL BLIS_ZUNPACKM_16XK_KERNEL_REF +#endif + +// +// Level-1f +// + +// axpy2v kernels + +//#ifndef AXPY2V_KERNEL +//#define AXPY2V_KERNEL AXPY2V_KERNEL_REF +//#endif + +#ifndef BLIS_SAXPY2V_KERNEL +#define BLIS_SAXPY2V_KERNEL BLIS_SAXPY2V_KERNEL_REF +#endif + +#ifndef BLIS_DAXPY2V_KERNEL +#define BLIS_DAXPY2V_KERNEL BLIS_DAXPY2V_KERNEL_REF +#endif + +#ifndef BLIS_CAXPY2V_KERNEL +#define BLIS_CAXPY2V_KERNEL BLIS_CAXPY2V_KERNEL_REF +#endif + +#ifndef BLIS_ZAXPY2V_KERNEL +#define BLIS_ZAXPY2V_KERNEL BLIS_ZAXPY2V_KERNEL_REF +#endif + +// dotaxpyv kernels + +//#ifndef DOTAXPYV_KERNEL +//#define DOTAXPYV_KERNEL DOTAXPYV_KERNEL_REF +//#endif + +#ifndef BLIS_SDOTAXPYV_KERNEL +#define BLIS_SDOTAXPYV_KERNEL BLIS_SDOTAXPYV_KERNEL_REF 
+#endif + +#ifndef BLIS_DDOTAXPYV_KERNEL +#define BLIS_DDOTAXPYV_KERNEL BLIS_DDOTAXPYV_KERNEL_REF +#endif + +#ifndef BLIS_CDOTAXPYV_KERNEL +#define BLIS_CDOTAXPYV_KERNEL BLIS_CDOTAXPYV_KERNEL_REF +#endif + +#ifndef BLIS_ZDOTAXPYV_KERNEL +#define BLIS_ZDOTAXPYV_KERNEL BLIS_ZDOTAXPYV_KERNEL_REF +#endif + +// axpyf kernels + +//#ifndef AXPYF_KERNEL +//#define AXPYF_KERNEL AXPYF_KERNEL_REF +//#endif + +#ifndef BLIS_SAXPYF_KERNEL +#define BLIS_SAXPYF_KERNEL BLIS_SAXPYF_KERNEL_REF +#endif + +#ifndef BLIS_DAXPYF_KERNEL +#define BLIS_DAXPYF_KERNEL BLIS_DAXPYF_KERNEL_REF +#endif + +#ifndef BLIS_CAXPYF_KERNEL +#define BLIS_CAXPYF_KERNEL BLIS_CAXPYF_KERNEL_REF +#endif + +#ifndef BLIS_ZAXPYF_KERNEL +#define BLIS_ZAXPYF_KERNEL BLIS_ZAXPYF_KERNEL_REF +#endif + +// dotxf kernels + +//#ifndef DOTXF_KERNEL +//#define DOTXF_KERNEL DOTXF_KERNEL_REF +//#endif + +#ifndef BLIS_SDOTXF_KERNEL +#define BLIS_SDOTXF_KERNEL BLIS_SDOTXF_KERNEL_REF +#endif + +#ifndef BLIS_DDOTXF_KERNEL +#define BLIS_DDOTXF_KERNEL BLIS_DDOTXF_KERNEL_REF +#endif + +#ifndef BLIS_CDOTXF_KERNEL +#define BLIS_CDOTXF_KERNEL BLIS_CDOTXF_KERNEL_REF +#endif + +#ifndef BLIS_ZDOTXF_KERNEL +#define BLIS_ZDOTXF_KERNEL BLIS_ZDOTXF_KERNEL_REF +#endif + +// dotxaxpyf kernels + +//#ifndef DOTXAXPYF_KERNEL +//#define DOTXAXPYF_KERNEL DOTXAXPYF_KERNEL_REF +//#endif + +#ifndef BLIS_SDOTXAXPYF_KERNEL +#define BLIS_SDOTXAXPYF_KERNEL BLIS_SDOTXAXPYF_KERNEL_REF +#endif + +#ifndef BLIS_DDOTXAXPYF_KERNEL +#define BLIS_DDOTXAXPYF_KERNEL BLIS_DDOTXAXPYF_KERNEL_REF +#endif + +#ifndef BLIS_CDOTXAXPYF_KERNEL +#define BLIS_CDOTXAXPYF_KERNEL BLIS_CDOTXAXPYF_KERNEL_REF +#endif + +#ifndef BLIS_ZDOTXAXPYF_KERNEL +#define BLIS_ZDOTXAXPYF_KERNEL BLIS_ZDOTXAXPYF_KERNEL_REF +#endif + +// +// Level-1v +// + +// addv kernels + +//#ifndef ADDV_KERNEL +//#define ADDV_KERNEL ADDV_KERNEL_REF +//#endif + +#ifndef BLIS_SADDV_KERNEL +#define BLIS_SADDV_KERNEL BLIS_SADDV_KERNEL_REF +#endif + +#ifndef BLIS_DADDV_KERNEL +#define BLIS_DADDV_KERNEL 
BLIS_DADDV_KERNEL_REF +#endif + +#ifndef BLIS_CADDV_KERNEL +#define BLIS_CADDV_KERNEL BLIS_CADDV_KERNEL_REF +#endif + +#ifndef BLIS_ZADDV_KERNEL +#define BLIS_ZADDV_KERNEL BLIS_ZADDV_KERNEL_REF +#endif + +// axpyv kernels + +//#ifndef AXPYV_KERNEL +//#define AXPYV_KERNEL AXPYV_KERNEL_REF +//#endif + +#ifndef BLIS_SAXPYV_KERNEL +#define BLIS_SAXPYV_KERNEL BLIS_SAXPYV_KERNEL_REF +#endif + +#ifndef BLIS_DAXPYV_KERNEL +#define BLIS_DAXPYV_KERNEL BLIS_DAXPYV_KERNEL_REF +#endif + +#ifndef BLIS_CAXPYV_KERNEL +#define BLIS_CAXPYV_KERNEL BLIS_CAXPYV_KERNEL_REF +#endif + +#ifndef BLIS_ZAXPYV_KERNEL +#define BLIS_ZAXPYV_KERNEL BLIS_ZAXPYV_KERNEL_REF +#endif + +// copyv kernels + +//#ifndef COPYV_KERNEL +//#define COPYV_KERNEL COPYV_KERNEL_REF +//#endif + +#ifndef BLIS_SCOPYV_KERNEL +#define BLIS_SCOPYV_KERNEL BLIS_SCOPYV_KERNEL_REF +#endif + +#ifndef BLIS_DCOPYV_KERNEL +#define BLIS_DCOPYV_KERNEL BLIS_DCOPYV_KERNEL_REF +#endif + +#ifndef BLIS_CCOPYV_KERNEL +#define BLIS_CCOPYV_KERNEL BLIS_CCOPYV_KERNEL_REF +#endif + +#ifndef BLIS_ZCOPYV_KERNEL +#define BLIS_ZCOPYV_KERNEL BLIS_ZCOPYV_KERNEL_REF +#endif + +// dotv kernels + +//#ifndef DOTV_KERNEL +//#define DOTV_KERNEL DOTV_KERNEL_REF +//#endif + +#ifndef BLIS_SDOTV_KERNEL +#define BLIS_SDOTV_KERNEL BLIS_SDOTV_KERNEL_REF +#endif + +#ifndef BLIS_DDOTV_KERNEL +#define BLIS_DDOTV_KERNEL BLIS_DDOTV_KERNEL_REF +#endif + +#ifndef BLIS_CDOTV_KERNEL +#define BLIS_CDOTV_KERNEL BLIS_CDOTV_KERNEL_REF +#endif + +#ifndef BLIS_ZDOTV_KERNEL +#define BLIS_ZDOTV_KERNEL BLIS_ZDOTV_KERNEL_REF +#endif + +// dotxv kernels + +//#ifndef DOTXV_KERNEL +//#define DOTXV_KERNEL DOTXV_KERNEL_REF +//#endif + +#ifndef BLIS_SDOTXV_KERNEL +#define BLIS_SDOTXV_KERNEL BLIS_SDOTXV_KERNEL_REF +#endif + +#ifndef BLIS_DDOTXV_KERNEL +#define BLIS_DDOTXV_KERNEL BLIS_DDOTXV_KERNEL_REF +#endif + +#ifndef BLIS_CDOTXV_KERNEL +#define BLIS_CDOTXV_KERNEL BLIS_CDOTXV_KERNEL_REF +#endif + +#ifndef BLIS_ZDOTXV_KERNEL +#define BLIS_ZDOTXV_KERNEL BLIS_ZDOTXV_KERNEL_REF +#endif + 
+// invertv kernels + +//#ifndef INVERTV_KERNEL +//#define INVERTV_KERNEL INVERTV_KERNEL_REF +//#endif + +#ifndef BLIS_SINVERTV_KERNEL +#define BLIS_SINVERTV_KERNEL BLIS_SINVERTV_KERNEL_REF +#endif + +#ifndef BLIS_DINVERTV_KERNEL +#define BLIS_DINVERTV_KERNEL BLIS_DINVERTV_KERNEL_REF +#endif + +#ifndef BLIS_CINVERTV_KERNEL +#define BLIS_CINVERTV_KERNEL BLIS_CINVERTV_KERNEL_REF +#endif + +#ifndef BLIS_ZINVERTV_KERNEL +#define BLIS_ZINVERTV_KERNEL BLIS_ZINVERTV_KERNEL_REF +#endif + +// scal2v kernels + +//#ifndef SCAL2V_KERNEL +//#define SCAL2V_KERNEL SCAL2V_KERNEL_REF +//#endif + +#ifndef BLIS_SSCAL2V_KERNEL +#define BLIS_SSCAL2V_KERNEL BLIS_SSCAL2V_KERNEL_REF +#endif + +#ifndef BLIS_DSCAL2V_KERNEL +#define BLIS_DSCAL2V_KERNEL BLIS_DSCAL2V_KERNEL_REF +#endif + +#ifndef BLIS_CSCAL2V_KERNEL +#define BLIS_CSCAL2V_KERNEL BLIS_CSCAL2V_KERNEL_REF +#endif + +#ifndef BLIS_ZSCAL2V_KERNEL +#define BLIS_ZSCAL2V_KERNEL BLIS_ZSCAL2V_KERNEL_REF +#endif + +// scalv kernels + +//#ifndef SCALV_KERNEL +//#define SCALV_KERNEL SCALV_KERNEL_REF +//#endif + +#ifndef BLIS_SSCALV_KERNEL +#define BLIS_SSCALV_KERNEL BLIS_SSCALV_KERNEL_REF +#endif + +#ifndef BLIS_DSCALV_KERNEL +#define BLIS_DSCALV_KERNEL BLIS_DSCALV_KERNEL_REF +#endif + +#ifndef BLIS_CSCALV_KERNEL +#define BLIS_CSCALV_KERNEL BLIS_CSCALV_KERNEL_REF +#endif + +#ifndef BLIS_ZSCALV_KERNEL +#define BLIS_ZSCALV_KERNEL BLIS_ZSCALV_KERNEL_REF +#endif + +// setv kernels + +//#ifndef SETV_KERNEL +//#define SETV_KERNEL SETV_KERNEL_REF +//#endif + +#ifndef BLIS_SSETV_KERNEL +#define BLIS_SSETV_KERNEL BLIS_SSETV_KERNEL_REF +#endif + +#ifndef BLIS_DSETV_KERNEL +#define BLIS_DSETV_KERNEL BLIS_DSETV_KERNEL_REF +#endif + +#ifndef BLIS_CSETV_KERNEL +#define BLIS_CSETV_KERNEL BLIS_CSETV_KERNEL_REF +#endif + +#ifndef BLIS_ZSETV_KERNEL +#define BLIS_ZSETV_KERNEL BLIS_ZSETV_KERNEL_REF +#endif + +// subv kernels + +//#ifndef SUBV_KERNEL +//#define SUBV_KERNEL SUBV_KERNEL_REF +//#endif + +#ifndef BLIS_SSUBV_KERNEL +#define BLIS_SSUBV_KERNEL 
BLIS_SSUBV_KERNEL_REF +#endif + +#ifndef BLIS_DSUBV_KERNEL +#define BLIS_DSUBV_KERNEL BLIS_DSUBV_KERNEL_REF +#endif + +#ifndef BLIS_CSUBV_KERNEL +#define BLIS_CSUBV_KERNEL BLIS_CSUBV_KERNEL_REF +#endif + +#ifndef BLIS_ZSUBV_KERNEL +#define BLIS_ZSUBV_KERNEL BLIS_ZSUBV_KERNEL_REF +#endif + +// swapv kernels + +//#ifndef SWAPV_KERNEL +//#define SWAPV_KERNEL SWAPV_KERNEL_REF +//#endif + +#ifndef BLIS_SSWAPV_KERNEL +#define BLIS_SSWAPV_KERNEL BLIS_SSWAPV_KERNEL_REF +#endif + +#ifndef BLIS_DSWAPV_KERNEL +#define BLIS_DSWAPV_KERNEL BLIS_DSWAPV_KERNEL_REF +#endif + +#ifndef BLIS_CSWAPV_KERNEL +#define BLIS_CSWAPV_KERNEL BLIS_CSWAPV_KERNEL_REF +#endif + +#ifndef BLIS_ZSWAPV_KERNEL +#define BLIS_ZSWAPV_KERNEL BLIS_ZSWAPV_KERNEL_REF +#endif -// -- Kernel macro checks ------------------------------------------------------ +// -- Define default blocksize macros ------------------------------------------ + +// +// Define level-3 cache blocksizes. +// + +// Define MC minimum + +#ifndef BLIS_DEFAULT_MC_S +#define BLIS_DEFAULT_MC_S 512 +#endif + +#ifndef BLIS_DEFAULT_MC_D +#define BLIS_DEFAULT_MC_D 256 +#endif + +#ifndef BLIS_DEFAULT_MC_C +#define BLIS_DEFAULT_MC_C 256 +#endif + +#ifndef BLIS_DEFAULT_MC_Z +#define BLIS_DEFAULT_MC_Z 128 +#endif + +// Define KC minimum + +#ifndef BLIS_DEFAULT_KC_S +#define BLIS_DEFAULT_KC_S 256 +#endif + +#ifndef BLIS_DEFAULT_KC_D +#define BLIS_DEFAULT_KC_D 256 +#endif + +#ifndef BLIS_DEFAULT_KC_C +#define BLIS_DEFAULT_KC_C 256 +#endif + +#ifndef BLIS_DEFAULT_KC_Z +#define BLIS_DEFAULT_KC_Z 256 +#endif + +// Define NC minimum + +#ifndef BLIS_DEFAULT_NC_S +#define BLIS_DEFAULT_NC_S 4096 +#endif + +#ifndef BLIS_DEFAULT_NC_D +#define BLIS_DEFAULT_NC_D 4096 +#endif + +#ifndef BLIS_DEFAULT_NC_C +#define BLIS_DEFAULT_NC_C 4096 +#endif + +#ifndef BLIS_DEFAULT_NC_Z +#define BLIS_DEFAULT_NC_Z 4096 +#endif + +// Define MC extension + +#ifndef BLIS_EXTEND_MC_S +#define BLIS_EXTEND_MC_S 0 +#endif + +#ifndef BLIS_EXTEND_MC_D +#define BLIS_EXTEND_MC_D 0 +#endif + 
+#ifndef BLIS_EXTEND_MC_C +#define BLIS_EXTEND_MC_C 0 +#endif + +#ifndef BLIS_EXTEND_MC_Z +#define BLIS_EXTEND_MC_Z 0 +#endif + +// Define KC extension + +#ifndef BLIS_EXTEND_KC_S +#define BLIS_EXTEND_KC_S 0 +#endif + +#ifndef BLIS_EXTEND_KC_D +#define BLIS_EXTEND_KC_D 0 +#endif + +#ifndef BLIS_EXTEND_KC_C +#define BLIS_EXTEND_KC_C 0 +#endif + +#ifndef BLIS_EXTEND_KC_Z +#define BLIS_EXTEND_KC_Z 0 +#endif + +// Define NC extension + +#ifndef BLIS_EXTEND_NC_S +#define BLIS_EXTEND_NC_S 0 +#endif + +#ifndef BLIS_EXTEND_NC_D +#define BLIS_EXTEND_NC_D 0 +#endif + +#ifndef BLIS_EXTEND_NC_C +#define BLIS_EXTEND_NC_C 0 +#endif + +#ifndef BLIS_EXTEND_NC_Z +#define BLIS_EXTEND_NC_Z 0 +#endif + +// +// Define level-3 register blocksizes. +// + +// Define MR + +#ifndef BLIS_DEFAULT_MR_S +#define BLIS_DEFAULT_MR_S 8 +#endif + +#ifndef BLIS_DEFAULT_MR_D +#define BLIS_DEFAULT_MR_D 4 +#endif + +#ifndef BLIS_DEFAULT_MR_C +#define BLIS_DEFAULT_MR_C 4 +#endif + +#ifndef BLIS_DEFAULT_MR_Z +#define BLIS_DEFAULT_MR_Z 2 +#endif + +// Define NR + +#ifndef BLIS_DEFAULT_NR_S +#define BLIS_DEFAULT_NR_S 4 +#endif + +#ifndef BLIS_DEFAULT_NR_D +#define BLIS_DEFAULT_NR_D 4 +#endif + +#ifndef BLIS_DEFAULT_NR_C +#define BLIS_DEFAULT_NR_C 2 +#endif + +#ifndef BLIS_DEFAULT_NR_Z +#define BLIS_DEFAULT_NR_Z 2 +#endif + +// Define KR + +#ifndef BLIS_DEFAULT_KR_S +#define BLIS_DEFAULT_KR_S 1 +#endif + +#ifndef BLIS_DEFAULT_KR_D +#define BLIS_DEFAULT_KR_D 1 +#endif + +#ifndef BLIS_DEFAULT_KR_C +#define BLIS_DEFAULT_KR_C 1 +#endif + +#ifndef BLIS_DEFAULT_KR_Z +#define BLIS_DEFAULT_KR_Z 1 +#endif + +// Define MR extension + +#ifndef BLIS_EXTEND_MR_S +#define BLIS_EXTEND_MR_S 0 +#endif + +#ifndef BLIS_EXTEND_MR_D +#define BLIS_EXTEND_MR_D 0 +#endif + +#ifndef BLIS_EXTEND_MR_C +#define BLIS_EXTEND_MR_C 0 +#endif + +#ifndef BLIS_EXTEND_MR_Z +#define BLIS_EXTEND_MR_Z 0 +#endif + +// Define NR extension + +#ifndef BLIS_EXTEND_NR_S +#define BLIS_EXTEND_NR_S 0 +#endif + +#ifndef BLIS_EXTEND_NR_D +#define 
BLIS_EXTEND_NR_D 0 +#endif + +#ifndef BLIS_EXTEND_NR_C +#define BLIS_EXTEND_NR_C 0 +#endif + +#ifndef BLIS_EXTEND_NR_Z +#define BLIS_EXTEND_NR_Z 0 +#endif + + + +// +// Define level-2 blocksizes. +// + +// NOTE: These values determine high-level cache blocking for level-2 +// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and +// MC = NC = 1000, then a total of four unblocked (or unblocked fused) +// gemv subproblems are called. The blocked algorithms are only useful in +// that they provide the opportunity for packing vectors. (Matrices can also +// be packed here, but this tends to be much too expensive in practice to +// actually employ.) + +#ifndef BLIS_DEFAULT_L2_MC_S +#define BLIS_DEFAULT_L2_MC_S 1000 +#endif + +#ifndef BLIS_DEFAULT_L2_NC_S +#define BLIS_DEFAULT_L2_NC_S 1000 +#endif + +#ifndef BLIS_DEFAULT_L2_MC_D +#define BLIS_DEFAULT_L2_MC_D 1000 +#endif + +#ifndef BLIS_DEFAULT_L2_NC_D +#define BLIS_DEFAULT_L2_NC_D 1000 +#endif + +#ifndef BLIS_DEFAULT_L2_MC_C +#define BLIS_DEFAULT_L2_MC_C 1000 +#endif + +#ifndef BLIS_DEFAULT_L2_NC_C +#define BLIS_DEFAULT_L2_NC_C 1000 +#endif + +#ifndef BLIS_DEFAULT_L2_MC_Z +#define BLIS_DEFAULT_L2_MC_Z 1000 +#endif + +#ifndef BLIS_DEFAULT_L2_NC_Z +#define BLIS_DEFAULT_L2_NC_Z 1000 +#endif + +// +// Define level-1f fusing factors. +// + +// Global level-1f fusing factors. 
+ +#ifndef BLIS_L1F_FUSE_FAC_S +#define BLIS_L1F_FUSE_FAC_S 8 +#endif + +#ifndef BLIS_L1F_FUSE_FAC_D +#define BLIS_L1F_FUSE_FAC_D 4 +#endif + +#ifndef BLIS_L1F_FUSE_FAC_C +#define BLIS_L1F_FUSE_FAC_C 4 +#endif + +#ifndef BLIS_L1F_FUSE_FAC_Z +#define BLIS_L1F_FUSE_FAC_Z 2 +#endif + +// axpyf + +#ifndef BLIS_AXPYF_FUSE_FAC_S +#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +#endif + +#ifndef BLIS_AXPYF_FUSE_FAC_D +#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +#endif + +#ifndef BLIS_AXPYF_FUSE_FAC_C +#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +#endif + +#ifndef BLIS_AXPYF_FUSE_FAC_Z +#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z +#endif + +// dotxf + +#ifndef BLIS_DOTXF_FUSE_FAC_S +#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +#endif + +#ifndef BLIS_DOTXF_FUSE_FAC_D +#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +#endif + +#ifndef BLIS_DOTXF_FUSE_FAC_C +#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +#endif + +#ifndef BLIS_DOTXF_FUSE_FAC_Z +#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z +#endif + +// dotxaxpyf + +#ifndef BLIS_DOTXAXPYF_FUSE_FAC_S +#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S +#endif + +#ifndef BLIS_DOTXAXPYF_FUSE_FAC_D +#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D +#endif + +#ifndef BLIS_DOTXAXPYF_FUSE_FAC_C +#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C +#endif + +#ifndef BLIS_DOTXAXPYF_FUSE_FAC_Z +#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z +#endif + +// +// Define level-1v blocksizes. +// + +// NOTE: Register blocksizes for vectors are used when packing +// non-contiguous vectors. Similar to that of KR, they can +// typically be set to 1. 
+ +#ifndef BLIS_DEFAULT_VR_S +#define BLIS_DEFAULT_VR_S 1 +#endif + +#ifndef BLIS_DEFAULT_VR_D +#define BLIS_DEFAULT_VR_D 1 +#endif + +#ifndef BLIS_DEFAULT_VR_C +#define BLIS_DEFAULT_VR_C 1 +#endif + +#ifndef BLIS_DEFAULT_VR_Z +#define BLIS_DEFAULT_VR_Z 1 +#endif + + + +// -- Kernel blocksize checks -------------------------------------------------- // Verify that cache blocksizes are whole multiples of register blocksizes. // Specifically, verify that: @@ -156,6 +1169,7 @@ // // MC must be a whole multiple of MR and NR. // + #if ( \ ( BLIS_DEFAULT_MC_S % BLIS_DEFAULT_MR_S != 0 ) || \ ( BLIS_DEFAULT_MC_D % BLIS_DEFAULT_MR_D != 0 ) || \ @@ -177,6 +1191,7 @@ // // NC must be a whole multiple of NR and MR. // + #if ( \ ( BLIS_DEFAULT_NC_S % BLIS_DEFAULT_NR_S != 0 ) || \ ( BLIS_DEFAULT_NC_D % BLIS_DEFAULT_NR_D != 0 ) || \ @@ -198,6 +1213,7 @@ // // KC must be a whole multiple of KR, MR, and NR. // + #if ( \ ( BLIS_DEFAULT_KC_S % BLIS_DEFAULT_KR_S != 0 ) || \ ( BLIS_DEFAULT_KC_D % BLIS_DEFAULT_KR_D != 0 ) || \ @@ -226,7 +1242,12 @@ #endif -// -- Compute maximum cache blocksizes ----------------------------------------- + +// -- Compute extended blocksizes ---------------------------------------------- + +// +// Compute maximum cache blocksizes. +// #define BLIS_MAXIMUM_MC_S ( BLIS_DEFAULT_MC_S + BLIS_EXTEND_MC_S ) #define BLIS_MAXIMUM_KC_S ( BLIS_DEFAULT_KC_S + BLIS_EXTEND_KC_S ) @@ -244,32 +1265,34 @@ #define BLIS_MAXIMUM_KC_Z ( BLIS_DEFAULT_KC_Z + BLIS_EXTEND_KC_Z ) #define BLIS_MAXIMUM_NC_Z ( BLIS_DEFAULT_NC_Z + BLIS_EXTEND_NC_Z ) - -// -- Compute leading dim blocksizes used for packing -------------------------- +// +// Compute leading dimension blocksizes used when packing micro-panels. 
+// #define BLIS_PACKDIM_MR_S ( BLIS_DEFAULT_MR_S + BLIS_EXTEND_MR_S ) -#define BLIS_PACKDIM_KR_S ( BLIS_DEFAULT_KR_S + BLIS_EXTEND_KR_S ) +#define BLIS_PACKDIM_KR_S ( BLIS_DEFAULT_KR_S ) #define BLIS_PACKDIM_NR_S ( BLIS_DEFAULT_NR_S + BLIS_EXTEND_NR_S ) #define BLIS_PACKDIM_MR_D ( BLIS_DEFAULT_MR_D + BLIS_EXTEND_MR_D ) -#define BLIS_PACKDIM_KR_D ( BLIS_DEFAULT_KR_D + BLIS_EXTEND_KR_D ) +#define BLIS_PACKDIM_KR_D ( BLIS_DEFAULT_KR_D ) #define BLIS_PACKDIM_NR_D ( BLIS_DEFAULT_NR_D + BLIS_EXTEND_NR_D ) #define BLIS_PACKDIM_MR_C ( BLIS_DEFAULT_MR_C + BLIS_EXTEND_MR_C ) -#define BLIS_PACKDIM_KR_C ( BLIS_DEFAULT_KR_C + BLIS_EXTEND_KR_C ) +#define BLIS_PACKDIM_KR_C ( BLIS_DEFAULT_KR_C ) #define BLIS_PACKDIM_NR_C ( BLIS_DEFAULT_NR_C + BLIS_EXTEND_NR_C ) #define BLIS_PACKDIM_MR_Z ( BLIS_DEFAULT_MR_Z + BLIS_EXTEND_MR_Z ) -#define BLIS_PACKDIM_KR_Z ( BLIS_DEFAULT_KR_Z + BLIS_EXTEND_KR_Z ) +#define BLIS_PACKDIM_KR_Z ( BLIS_DEFAULT_KR_Z ) #define BLIS_PACKDIM_NR_Z ( BLIS_DEFAULT_NR_Z + BLIS_EXTEND_NR_Z ) + // -- Abbreiviated kernel blocksize macros ------------------------------------- // Here, we shorten the blocksizes defined in bli_kernel.h so that they can // derived via the PASTEMAC macro. 
-// Default cache blocksizes +// Default (minimum) cache blocksizes #define bli_smc BLIS_DEFAULT_MC_S #define bli_skc BLIS_DEFAULT_KC_S @@ -287,24 +1310,6 @@ #define bli_zkc BLIS_DEFAULT_KC_Z #define bli_znc BLIS_DEFAULT_NC_Z -// Maximum cache blocksizes - -#define bli_smaxmc BLIS_MAXIMUM_MC_S -#define bli_smaxkc BLIS_MAXIMUM_KC_S -#define bli_smaxnc BLIS_MAXIMUM_NC_S - -#define bli_dmaxmc BLIS_MAXIMUM_MC_D -#define bli_dmaxkc BLIS_MAXIMUM_KC_D -#define bli_dmaxnc BLIS_MAXIMUM_NC_D - -#define bli_cmaxmc BLIS_MAXIMUM_MC_C -#define bli_cmaxkc BLIS_MAXIMUM_KC_C -#define bli_cmaxnc BLIS_MAXIMUM_NC_C - -#define bli_zmaxmc BLIS_MAXIMUM_MC_Z -#define bli_zmaxkc BLIS_MAXIMUM_KC_Z -#define bli_zmaxnc BLIS_MAXIMUM_NC_Z - // Register blocksizes #define bli_smr BLIS_DEFAULT_MR_S @@ -323,7 +1328,25 @@ #define bli_zkr BLIS_DEFAULT_KR_Z #define bli_znr BLIS_DEFAULT_NR_Z -// Micro-panel packing register blocksizes +// Extended (maximum) cache blocksizes + +#define bli_smaxmc BLIS_MAXIMUM_MC_S +#define bli_smaxkc BLIS_MAXIMUM_KC_S +#define bli_smaxnc BLIS_MAXIMUM_NC_S + +#define bli_dmaxmc BLIS_MAXIMUM_MC_D +#define bli_dmaxkc BLIS_MAXIMUM_KC_D +#define bli_dmaxnc BLIS_MAXIMUM_NC_D + +#define bli_cmaxmc BLIS_MAXIMUM_MC_C +#define bli_cmaxkc BLIS_MAXIMUM_KC_C +#define bli_cmaxnc BLIS_MAXIMUM_NC_C + +#define bli_zmaxmc BLIS_MAXIMUM_MC_Z +#define bli_zmaxkc BLIS_MAXIMUM_KC_Z +#define bli_zmaxnc BLIS_MAXIMUM_NC_Z + +// Extended (packing) register blocksizes #define bli_spackmr BLIS_PACKDIM_MR_S #define bli_spackkr BLIS_PACKDIM_KR_S @@ -341,7 +1364,7 @@ #define bli_zpackkr BLIS_PACKDIM_KR_Z #define bli_zpacknr BLIS_PACKDIM_NR_Z -// Default Level-1f fusing factors +// Level-1f fusing factors #define bli_saxpyf_fusefac BLIS_AXPYF_FUSE_FAC_S #define bli_daxpyf_fusefac BLIS_AXPYF_FUSE_FAC_D diff --git a/frame/include/bli_kernel_post_macro_defs.h b/frame/include/bli_kernel_post_macro_defs.h index b807359f8..6818c32b1 100644 --- a/frame/include/bli_kernel_post_macro_defs.h +++ 
b/frame/include/bli_kernel_post_macro_defs.h @@ -35,6 +35,238 @@ #ifndef BLIS_KERNEL_POST_MACRO_DEFS_H #define BLIS_KERNEL_POST_MACRO_DEFS_H +/* +// -- Define PASTEMAC-friendly kernel function name macros --------------------- + +// +// Level-3 +// + +// gemm micro-kernels + +#define bli_sGEMM_UKERNEL BLIS_SGEMM_UKERNEL +#define bli_dGEMM_UKERNEL BLIS_DGEMM_UKERNEL +#define bli_cGEMM_UKERNEL BLIS_CGEMM_UKERNEL +#define bli_zGEMM_UKERNEL BLIS_ZGEMM_UKERNEL + +// gemmtrsm_l micro-kernels + +#define bli_sGEMMTRSM_L_UKERNEL BLIS_SGEMMTRSM_L_UKERNEL +#define bli_dGEMMTRSM_L_UKERNEL BLIS_DGEMMTRSM_L_UKERNEL +#define bli_cGEMMTRSM_L_UKERNEL BLIS_CGEMMTRSM_L_UKERNEL +#define bli_zGEMMTRSM_L_UKERNEL BLIS_ZGEMMTRSM_L_UKERNEL + +// gemmtrsm_u micro-kernels + +#define bli_sGEMMTRSM_U_UKERNEL BLIS_SGEMMTRSM_U_UKERNEL +#define bli_dGEMMTRSM_U_UKERNEL BLIS_DGEMMTRSM_U_UKERNEL +#define bli_cGEMMTRSM_U_UKERNEL BLIS_CGEMMTRSM_U_UKERNEL +#define bli_zGEMMTRSM_U_UKERNEL BLIS_ZGEMMTRSM_U_UKERNEL + +// trsm_l micro-kernels + +#define bli_sTRSM_L_UKERNEL BLIS_STRSM_L_UKERNEL +#define bli_dTRSM_L_UKERNEL BLIS_DTRSM_L_UKERNEL +#define bli_cTRSM_L_UKERNEL BLIS_CTRSM_L_UKERNEL +#define bli_zTRSM_L_UKERNEL BLIS_ZTRSM_L_UKERNEL + +// trsm_u micro-kernels + +#define bli_sTRSM_U_UKERNEL BLIS_STRSM_U_UKERNEL +#define bli_dTRSM_U_UKERNEL BLIS_DTRSM_U_UKERNEL +#define bli_cTRSM_U_UKERNEL BLIS_CTRSM_U_UKERNEL +#define bli_zTRSM_U_UKERNEL BLIS_ZTRSM_U_UKERNEL + +// +// Level-3 4m +// + +// gemm4m micro-kernels + +#define bli_cGEMM4M_UKERNEL BLIS_CGEMM4M_UKERNEL +#define bli_zGEMM4M_UKERNEL BLIS_ZGEMM4M_UKERNEL + +// gemmtrsm4m_l micro-kernels + +#define bli_cGEMMTRSM4M_L_UKERNEL BLIS_CGEMMTRSM4M_L_UKERNEL +#define bli_zGEMMTRSM4M_L_UKERNEL BLIS_ZGEMMTRSM4M_L_UKERNEL + +// gemmtrsm4m_u micro-kernels + +#define bli_cGEMMTRSM4M_U_UKERNEL BLIS_CGEMMTRSM4M_U_UKERNEL +#define bli_zGEMMTRSM4M_U_UKERNEL BLIS_ZGEMMTRSM4M_U_UKERNEL + +// trsm4m_l micro-kernels + +#define bli_cTRSM4M_L_UKERNEL 
BLIS_CTRSM4M_L_UKERNEL +#define bli_zTRSM4M_L_UKERNEL BLIS_ZTRSM4M_L_UKERNEL + +// trsm4m_u micro-kernels + +#define bli_cTRSM4M_U_UKERNEL BLIS_CTRSM4M_U_UKERNEL +#define bli_zTRSM4M_U_UKERNEL BLIS_ZTRSM4M_U_UKERNEL + +// +// Level-3 3m +// + +// gemm3m micro-kernels + +#define bli_cGEMM3M_UKERNEL BLIS_CGEMM3M_UKERNEL +#define bli_zGEMM3M_UKERNEL BLIS_ZGEMM3M_UKERNEL + +// gemmtrsm3m_l micro-kernels + +#define bli_cGEMMTRSM3M_L_UKERNEL BLIS_CGEMMTRSM3M_L_UKERNEL +#define bli_zGEMMTRSM3M_L_UKERNEL BLIS_ZGEMMTRSM3M_L_UKERNEL + +// gemmtrsm3m_u micro-kernels + +#define bli_cGEMMTRSM3M_U_UKERNEL BLIS_CGEMMTRSM3M_U_UKERNEL +#define bli_zGEMMTRSM3M_U_UKERNEL BLIS_ZGEMMTRSM3M_U_UKERNEL + +// trsm3m_l micro-kernels + +#define bli_cTRSM3M_L_UKERNEL BLIS_CTRSM3M_L_UKERNEL +#define bli_zTRSM3M_L_UKERNEL BLIS_ZTRSM3M_L_UKERNEL + +// trsm3m_u micro-kernels + +#define bli_cTRSM3M_U_UKERNEL BLIS_CTRSM3M_U_UKERNEL +#define bli_zTRSM3M_U_UKERNEL BLIS_ZTRSM3M_U_UKERNEL + +// +// Level-1m +// + +// NOTE: We don't need any PASTEMAC-friendly aliases to packm kernel +// macros because they are used directly in the initialization of the +// function pointer array, rather than via a templatizing wrapper macro. 
+ + +// +// Level-1f +// + +// axpy2v kernels + +#define bli_sssAXPY2V_KERNEL BLIS_SAXPY2V_KERNEL +#define bli_dddAXPY2V_KERNEL BLIS_DAXPY2V_KERNEL +#define bli_cccAXPY2V_KERNEL BLIS_CAXPY2V_KERNEL +#define bli_zzzAXPY2V_KERNEL BLIS_ZAXPY2V_KERNEL + +// dotaxpyv kernels + +#define bli_sssDOTAXPYV_KERNEL BLIS_SDOTAXPYV_KERNEL +#define bli_dddDOTAXPYV_KERNEL BLIS_DDOTAXPYV_KERNEL +#define bli_cccDOTAXPYV_KERNEL BLIS_CDOTAXPYV_KERNEL +#define bli_zzzDOTAXPYV_KERNEL BLIS_ZDOTAXPYV_KERNEL + +// axpyf kernels + +#define bli_sssAXPYF_KERNEL BLIS_SAXPYF_KERNEL +#define bli_dddAXPYF_KERNEL BLIS_DAXPYF_KERNEL +#define bli_cccAXPYF_KERNEL BLIS_CAXPYF_KERNEL +#define bli_zzzAXPYF_KERNEL BLIS_ZAXPYF_KERNEL + +// dotxf kernels + +#define bli_sssDOTXF_KERNEL BLIS_SDOTXF_KERNEL +#define bli_dddDOTXF_KERNEL BLIS_DDOTXF_KERNEL +#define bli_cccDOTXF_KERNEL BLIS_CDOTXF_KERNEL +#define bli_zzzDOTXF_KERNEL BLIS_ZDOTXF_KERNEL + +// dotxaxpyf kernels + +#define bli_sssDOTXAXPYF_KERNEL BLIS_SDOTXAXPYF_KERNEL +#define bli_dddDOTXAXPYF_KERNEL BLIS_DDOTXAXPYF_KERNEL +#define bli_cccDOTXAXPYF_KERNEL BLIS_CDOTXAXPYF_KERNEL +#define bli_zzzDOTXAXPYF_KERNEL BLIS_ZDOTXAXPYF_KERNEL + + +// +// Level-1v +// + +// addv kernels + +#define bli_ssADDV_KERNEL BLIS_SADDV_KERNEL +#define bli_ddADDV_KERNEL BLIS_DADDV_KERNEL +#define bli_ccADDV_KERNEL BLIS_CADDV_KERNEL +#define bli_zzADDV_KERNEL BLIS_ZADDV_KERNEL + +// axpyv kernels + +#define bli_sssAXPYV_KERNEL BLIS_SAXPYV_KERNEL +#define bli_dddAXPYV_KERNEL BLIS_DAXPYV_KERNEL +#define bli_cccAXPYV_KERNEL BLIS_CAXPYV_KERNEL +#define bli_zzzAXPYV_KERNEL BLIS_ZAXPYV_KERNEL + +// copyv kernels + +#define bli_ssCOPYV_KERNEL BLIS_SCOPYV_KERNEL +#define bli_ddCOPYV_KERNEL BLIS_DCOPYV_KERNEL +#define bli_ccCOPYV_KERNEL BLIS_CCOPYV_KERNEL +#define bli_zzCOPYV_KERNEL BLIS_ZCOPYV_KERNEL + +// dotv kernels + +#define bli_sssDOTV_KERNEL BLIS_SDOTV_KERNEL +#define bli_dddDOTV_KERNEL BLIS_DDOTV_KERNEL +#define bli_cccDOTV_KERNEL BLIS_CDOTV_KERNEL +#define 
bli_zzzDOTV_KERNEL BLIS_ZDOTV_KERNEL + +// dotxv kernels + +#define bli_sssDOTXV_KERNEL BLIS_SDOTXV_KERNEL +#define bli_dddDOTXV_KERNEL BLIS_DDOTXV_KERNEL +#define bli_cccDOTXV_KERNEL BLIS_CDOTXV_KERNEL +#define bli_zzzDOTXV_KERNEL BLIS_ZDOTXV_KERNEL + +// invertv kernels + +#define bli_sINVERTV_KERNEL BLIS_SINVERTV_KERNEL +#define bli_dINVERTV_KERNEL BLIS_DINVERTV_KERNEL +#define bli_cINVERTV_KERNEL BLIS_CINVERTV_KERNEL +#define bli_zINVERTV_KERNEL BLIS_ZINVERTV_KERNEL + +// scal2v kernels + +#define bli_sssSCAL2V_KERNEL BLIS_SSCAL2V_KERNEL +#define bli_dddSCAL2V_KERNEL BLIS_DSCAL2V_KERNEL +#define bli_cccSCAL2V_KERNEL BLIS_CSCAL2V_KERNEL +#define bli_zzzSCAL2V_KERNEL BLIS_ZSCAL2V_KERNEL + +// scalv kernels + +#define bli_ssSCALV_KERNEL BLIS_SSCALV_KERNEL +#define bli_ddSCALV_KERNEL BLIS_DSCALV_KERNEL +#define bli_ccSCALV_KERNEL BLIS_CSCALV_KERNEL +#define bli_zzSCALV_KERNEL BLIS_ZSCALV_KERNEL + +// setv kernels + +#define bli_ssSETV_KERNEL BLIS_SSETV_KERNEL +#define bli_ddSETV_KERNEL BLIS_DSETV_KERNEL +#define bli_ccSETV_KERNEL BLIS_CSETV_KERNEL +#define bli_zzSETV_KERNEL BLIS_ZSETV_KERNEL + +// subv kernels + +#define bli_ssSUBV_KERNEL BLIS_SSUBV_KERNEL +#define bli_ddSUBV_KERNEL BLIS_DSUBV_KERNEL +#define bli_ccSUBV_KERNEL BLIS_CSUBV_KERNEL +#define bli_zzSUBV_KERNEL BLIS_ZSUBV_KERNEL + +// swapv kernels + +#define bli_ssSWAPV_KERNEL BLIS_SSWAPV_KERNEL +#define bli_ddSWAPV_KERNEL BLIS_DSWAPV_KERNEL +#define bli_ccSWAPV_KERNEL BLIS_CSWAPV_KERNEL +#define bli_zzSWAPV_KERNEL BLIS_ZSWAPV_KERNEL +*/ + + // -- Maximum register blocksize search ---------------------------------------- // @@ -43,13 +275,47 @@ #define BLIS_MAX_DEFAULT_MR_S BLIS_DEFAULT_MR_S #define BLIS_MAX_DEFAULT_MR_D BLIS_DEFAULT_MR_D + +// NOTE: 4m and 3m register blocksizes are assumed to be equal. Thus, +// we only inspect the 4m values. + +// c: Choose between the regular and 4m/3m blocksize. 
#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_MR_C +#if BLIS_DEFAULT_4M_MR_C > BLIS_MAX_DEFAULT_MR_C +#undef BLIS_MAX_DEFAULT_MR_C +#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_4M_MR_C +#endif + +// z: Choose between the regular and 4m/3m blocksize, keeping whichever is larger. #define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_MR_Z +#if BLIS_DEFAULT_4M_MR_Z > BLIS_MAX_DEFAULT_MR_Z +#undef BLIS_MAX_DEFAULT_MR_Z +#define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_4M_MR_Z +#endif + +// +// Find the largest register blocksize NR. +// #define BLIS_MAX_DEFAULT_NR_S BLIS_DEFAULT_NR_S #define BLIS_MAX_DEFAULT_NR_D BLIS_DEFAULT_NR_D + +// NOTE: 4m and 3m register blocksizes are assumed to be equal. Thus, +// we only inspect the 4m values. + +// c: Choose between the regular and 4m/3m blocksize, keeping whichever is larger. #define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_NR_C +#if BLIS_DEFAULT_4M_NR_C > BLIS_MAX_DEFAULT_NR_C +#undef BLIS_MAX_DEFAULT_NR_C +#define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_4M_NR_C +#endif + +// z: Choose between the regular and 4m/3m blocksize, keeping whichever is larger. #define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_NR_Z +#if BLIS_DEFAULT_4M_NR_Z > BLIS_MAX_DEFAULT_NR_Z +#undef BLIS_MAX_DEFAULT_NR_Z +#define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_4M_NR_Z +#endif // -- Abbreiviated macros ------------------------------------------------------ diff --git a/frame/include/bli_kernel_pre_macro_defs.h b/frame/include/bli_kernel_pre_macro_defs.h new file mode 100644 index 000000000..d9974cfde --- /dev/null +++ b/frame/include/bli_kernel_pre_macro_defs.h @@ -0,0 +1,492 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_PRE_MACRO_DEFS_H +#define BLIS_KERNEL_PRE_MACRO_DEFS_H + +// -- Reference kernel definitions --------------------------------------------- + +// +// Level-3 +// Names of the reference micro-kernels, one macro per [sdcz] datatype. + +// gemm micro-kernels + +#define BLIS_SGEMM_UKERNEL_REF bli_sgemm_ukr_ref +#define BLIS_DGEMM_UKERNEL_REF bli_dgemm_ukr_ref +#define BLIS_CGEMM_UKERNEL_REF bli_cgemm_ukr_ref +#define BLIS_ZGEMM_UKERNEL_REF bli_zgemm_ukr_ref + +// gemmtrsm_l micro-kernels + +#define BLIS_SGEMMTRSM_L_UKERNEL_REF bli_sgemmtrsm_l_ukr_ref +#define BLIS_DGEMMTRSM_L_UKERNEL_REF bli_dgemmtrsm_l_ukr_ref +#define BLIS_CGEMMTRSM_L_UKERNEL_REF bli_cgemmtrsm_l_ukr_ref +#define BLIS_ZGEMMTRSM_L_UKERNEL_REF bli_zgemmtrsm_l_ukr_ref + +// gemmtrsm_u micro-kernels + +#define BLIS_SGEMMTRSM_U_UKERNEL_REF bli_sgemmtrsm_u_ukr_ref +#define BLIS_DGEMMTRSM_U_UKERNEL_REF bli_dgemmtrsm_u_ukr_ref +#define BLIS_CGEMMTRSM_U_UKERNEL_REF bli_cgemmtrsm_u_ukr_ref +#define BLIS_ZGEMMTRSM_U_UKERNEL_REF bli_zgemmtrsm_u_ukr_ref + +// trsm_l micro-kernels + +#define BLIS_STRSM_L_UKERNEL_REF bli_strsm_l_ukr_ref +#define BLIS_DTRSM_L_UKERNEL_REF bli_dtrsm_l_ukr_ref +#define BLIS_CTRSM_L_UKERNEL_REF bli_ctrsm_l_ukr_ref +#define BLIS_ZTRSM_L_UKERNEL_REF bli_ztrsm_l_ukr_ref + +// trsm_u micro-kernels + +#define BLIS_STRSM_U_UKERNEL_REF bli_strsm_u_ukr_ref +#define BLIS_DTRSM_U_UKERNEL_REF bli_dtrsm_u_ukr_ref +#define BLIS_CTRSM_U_UKERNEL_REF bli_ctrsm_u_ukr_ref +#define BLIS_ZTRSM_U_UKERNEL_REF bli_ztrsm_u_ukr_ref + +// +// Level-3 4m +// Reference names for the 4m micro-kernels; only complex ([cz]) flavors are listed. + +// gemm4m micro-kernels + +#define BLIS_CGEMM4M_UKERNEL_REF bli_cgemm4m_ukr_ref +#define BLIS_ZGEMM4M_UKERNEL_REF bli_zgemm4m_ukr_ref + +// gemmtrsm4m_l micro-kernels + +#define BLIS_CGEMMTRSM4M_L_UKERNEL_REF bli_cgemmtrsm4m_l_ukr_ref +#define BLIS_ZGEMMTRSM4M_L_UKERNEL_REF bli_zgemmtrsm4m_l_ukr_ref + +// gemmtrsm4m_u micro-kernels + +#define BLIS_CGEMMTRSM4M_U_UKERNEL_REF bli_cgemmtrsm4m_u_ukr_ref +#define BLIS_ZGEMMTRSM4M_U_UKERNEL_REF bli_zgemmtrsm4m_u_ukr_ref + +// trsm4m_l micro-kernels +
+#define BLIS_CTRSM4M_L_UKERNEL_REF bli_ctrsm4m_l_ukr_ref +#define BLIS_ZTRSM4M_L_UKERNEL_REF bli_ztrsm4m_l_ukr_ref + +// trsm4m_u micro-kernels + +#define BLIS_CTRSM4M_U_UKERNEL_REF bli_ctrsm4m_u_ukr_ref +#define BLIS_ZTRSM4M_U_UKERNEL_REF bli_ztrsm4m_u_ukr_ref + +// +// Level-3 3m +// + +// gemm3m micro-kernels + +#define BLIS_CGEMM3M_UKERNEL_REF bli_cgemm3m_ukr_ref +#define BLIS_ZGEMM3M_UKERNEL_REF bli_zgemm3m_ukr_ref + +// gemmtrsm3m_l micro-kernels + +#define BLIS_CGEMMTRSM3M_L_UKERNEL_REF bli_cgemmtrsm3m_l_ukr_ref +#define BLIS_ZGEMMTRSM3M_L_UKERNEL_REF bli_zgemmtrsm3m_l_ukr_ref + +// gemmtrsm3m_u micro-kernels + +#define BLIS_CGEMMTRSM3M_U_UKERNEL_REF bli_cgemmtrsm3m_u_ukr_ref +#define BLIS_ZGEMMTRSM3M_U_UKERNEL_REF bli_zgemmtrsm3m_u_ukr_ref + +// trsm3m_l micro-kernels + +#define BLIS_CTRSM3M_L_UKERNEL_REF bli_ctrsm3m_l_ukr_ref +#define BLIS_ZTRSM3M_L_UKERNEL_REF bli_ztrsm3m_l_ukr_ref + +// trsm3m_u micro-kernels + +#define BLIS_CTRSM3M_U_UKERNEL_REF bli_ctrsm3m_u_ukr_ref +#define BLIS_ZTRSM3M_U_UKERNEL_REF bli_ztrsm3m_u_ukr_ref + +// +// Level-1m +// + +// packm_2xk kernels + +#define BLIS_SPACKM_2XK_KERNEL_REF bli_spackm_ref_2xk +#define BLIS_DPACKM_2XK_KERNEL_REF bli_dpackm_ref_2xk +#define BLIS_CPACKM_2XK_KERNEL_REF bli_cpackm_ref_2xk +#define BLIS_ZPACKM_2XK_KERNEL_REF bli_zpackm_ref_2xk + +// packm_4xk kernels + +#define BLIS_SPACKM_4XK_KERNEL_REF bli_spackm_ref_4xk +#define BLIS_DPACKM_4XK_KERNEL_REF bli_dpackm_ref_4xk +#define BLIS_CPACKM_4XK_KERNEL_REF bli_cpackm_ref_4xk +#define BLIS_ZPACKM_4XK_KERNEL_REF bli_zpackm_ref_4xk + +// packm_6xk kernels + +#define BLIS_SPACKM_6XK_KERNEL_REF bli_spackm_ref_6xk +#define BLIS_DPACKM_6XK_KERNEL_REF bli_dpackm_ref_6xk +#define BLIS_CPACKM_6XK_KERNEL_REF bli_cpackm_ref_6xk +#define BLIS_ZPACKM_6XK_KERNEL_REF bli_zpackm_ref_6xk + +// packm_8xk kernels + +#define BLIS_SPACKM_8XK_KERNEL_REF bli_spackm_ref_8xk +#define BLIS_DPACKM_8XK_KERNEL_REF bli_dpackm_ref_8xk +#define BLIS_CPACKM_8XK_KERNEL_REF 
bli_cpackm_ref_8xk +#define BLIS_ZPACKM_8XK_KERNEL_REF bli_zpackm_ref_8xk + +// packm_10xk kernels + +#define BLIS_SPACKM_10XK_KERNEL_REF bli_spackm_ref_10xk +#define BLIS_DPACKM_10XK_KERNEL_REF bli_dpackm_ref_10xk +#define BLIS_CPACKM_10XK_KERNEL_REF bli_cpackm_ref_10xk +#define BLIS_ZPACKM_10XK_KERNEL_REF bli_zpackm_ref_10xk + +// packm_12xk kernels + +#define BLIS_SPACKM_12XK_KERNEL_REF bli_spackm_ref_12xk +#define BLIS_DPACKM_12XK_KERNEL_REF bli_dpackm_ref_12xk +#define BLIS_CPACKM_12XK_KERNEL_REF bli_cpackm_ref_12xk +#define BLIS_ZPACKM_12XK_KERNEL_REF bli_zpackm_ref_12xk + +// packm_14xk kernels + +#define BLIS_SPACKM_14XK_KERNEL_REF bli_spackm_ref_14xk +#define BLIS_DPACKM_14XK_KERNEL_REF bli_dpackm_ref_14xk +#define BLIS_CPACKM_14XK_KERNEL_REF bli_cpackm_ref_14xk +#define BLIS_ZPACKM_14XK_KERNEL_REF bli_zpackm_ref_14xk + +// packm_16xk kernels + +#define BLIS_SPACKM_16XK_KERNEL_REF bli_spackm_ref_16xk +#define BLIS_DPACKM_16XK_KERNEL_REF bli_dpackm_ref_16xk +#define BLIS_CPACKM_16XK_KERNEL_REF bli_cpackm_ref_16xk +#define BLIS_ZPACKM_16XK_KERNEL_REF bli_zpackm_ref_16xk + +// packm_2xk_ri kernels + +#define BLIS_SPACKM_2XK_RI_KERNEL_REF bli_spackm_ref_2xk_ri +#define BLIS_DPACKM_2XK_RI_KERNEL_REF bli_dpackm_ref_2xk_ri +#define BLIS_CPACKM_2XK_RI_KERNEL_REF bli_cpackm_ref_2xk_ri +#define BLIS_ZPACKM_2XK_RI_KERNEL_REF bli_zpackm_ref_2xk_ri + +// packm_4xk_ri kernels + +#define BLIS_SPACKM_4XK_RI_KERNEL_REF bli_spackm_ref_4xk_ri +#define BLIS_DPACKM_4XK_RI_KERNEL_REF bli_dpackm_ref_4xk_ri +#define BLIS_CPACKM_4XK_RI_KERNEL_REF bli_cpackm_ref_4xk_ri +#define BLIS_ZPACKM_4XK_RI_KERNEL_REF bli_zpackm_ref_4xk_ri + +// packm_6xk_ri kernels + +#define BLIS_SPACKM_6XK_RI_KERNEL_REF bli_spackm_ref_6xk_ri +#define BLIS_DPACKM_6XK_RI_KERNEL_REF bli_dpackm_ref_6xk_ri +#define BLIS_CPACKM_6XK_RI_KERNEL_REF bli_cpackm_ref_6xk_ri +#define BLIS_ZPACKM_6XK_RI_KERNEL_REF bli_zpackm_ref_6xk_ri + +// packm_8xk_ri kernels + +#define BLIS_SPACKM_8XK_RI_KERNEL_REF 
bli_spackm_ref_8xk_ri +#define BLIS_DPACKM_8XK_RI_KERNEL_REF bli_dpackm_ref_8xk_ri +#define BLIS_CPACKM_8XK_RI_KERNEL_REF bli_cpackm_ref_8xk_ri +#define BLIS_ZPACKM_8XK_RI_KERNEL_REF bli_zpackm_ref_8xk_ri + +// packm_10xk_ri kernels + +#define BLIS_SPACKM_10XK_RI_KERNEL_REF bli_spackm_ref_10xk_ri +#define BLIS_DPACKM_10XK_RI_KERNEL_REF bli_dpackm_ref_10xk_ri +#define BLIS_CPACKM_10XK_RI_KERNEL_REF bli_cpackm_ref_10xk_ri +#define BLIS_ZPACKM_10XK_RI_KERNEL_REF bli_zpackm_ref_10xk_ri + +// packm_12xk_ri kernels + +#define BLIS_SPACKM_12XK_RI_KERNEL_REF bli_spackm_ref_12xk_ri +#define BLIS_DPACKM_12XK_RI_KERNEL_REF bli_dpackm_ref_12xk_ri +#define BLIS_CPACKM_12XK_RI_KERNEL_REF bli_cpackm_ref_12xk_ri +#define BLIS_ZPACKM_12XK_RI_KERNEL_REF bli_zpackm_ref_12xk_ri + +// packm_14xk_ri kernels + +#define BLIS_SPACKM_14XK_RI_KERNEL_REF bli_spackm_ref_14xk_ri +#define BLIS_DPACKM_14XK_RI_KERNEL_REF bli_dpackm_ref_14xk_ri +#define BLIS_CPACKM_14XK_RI_KERNEL_REF bli_cpackm_ref_14xk_ri +#define BLIS_ZPACKM_14XK_RI_KERNEL_REF bli_zpackm_ref_14xk_ri + +// packm_16xk_ri kernels + +#define BLIS_SPACKM_16XK_RI_KERNEL_REF bli_spackm_ref_16xk_ri +#define BLIS_DPACKM_16XK_RI_KERNEL_REF bli_dpackm_ref_16xk_ri +#define BLIS_CPACKM_16XK_RI_KERNEL_REF bli_cpackm_ref_16xk_ri +#define BLIS_ZPACKM_16XK_RI_KERNEL_REF bli_zpackm_ref_16xk_ri + +// packm_2xk_ri3 kernels + +#define BLIS_SPACKM_2XK_RI3_KERNEL_REF bli_spackm_ref_2xk_ri3 +#define BLIS_DPACKM_2XK_RI3_KERNEL_REF bli_dpackm_ref_2xk_ri3 +#define BLIS_CPACKM_2XK_RI3_KERNEL_REF bli_cpackm_ref_2xk_ri3 +#define BLIS_ZPACKM_2XK_RI3_KERNEL_REF bli_zpackm_ref_2xk_ri3 + +// packm_4xk_ri3 kernels + +#define BLIS_SPACKM_4XK_RI3_KERNEL_REF bli_spackm_ref_4xk_ri3 +#define BLIS_DPACKM_4XK_RI3_KERNEL_REF bli_dpackm_ref_4xk_ri3 +#define BLIS_CPACKM_4XK_RI3_KERNEL_REF bli_cpackm_ref_4xk_ri3 +#define BLIS_ZPACKM_4XK_RI3_KERNEL_REF bli_zpackm_ref_4xk_ri3 + +// packm_6xk_ri3 kernels + +#define BLIS_SPACKM_6XK_RI3_KERNEL_REF bli_spackm_ref_6xk_ri3 +#define 
BLIS_DPACKM_6XK_RI3_KERNEL_REF bli_dpackm_ref_6xk_ri3 +#define BLIS_CPACKM_6XK_RI3_KERNEL_REF bli_cpackm_ref_6xk_ri3 +#define BLIS_ZPACKM_6XK_RI3_KERNEL_REF bli_zpackm_ref_6xk_ri3 + +// packm_8xk_ri3 kernels + +#define BLIS_SPACKM_8XK_RI3_KERNEL_REF bli_spackm_ref_8xk_ri3 +#define BLIS_DPACKM_8XK_RI3_KERNEL_REF bli_dpackm_ref_8xk_ri3 +#define BLIS_CPACKM_8XK_RI3_KERNEL_REF bli_cpackm_ref_8xk_ri3 +#define BLIS_ZPACKM_8XK_RI3_KERNEL_REF bli_zpackm_ref_8xk_ri3 + +// packm_10xk_ri3 kernels + +#define BLIS_SPACKM_10XK_RI3_KERNEL_REF bli_spackm_ref_10xk_ri3 +#define BLIS_DPACKM_10XK_RI3_KERNEL_REF bli_dpackm_ref_10xk_ri3 +#define BLIS_CPACKM_10XK_RI3_KERNEL_REF bli_cpackm_ref_10xk_ri3 +#define BLIS_ZPACKM_10XK_RI3_KERNEL_REF bli_zpackm_ref_10xk_ri3 + +// packm_12xk_ri3 kernels + +#define BLIS_SPACKM_12XK_RI3_KERNEL_REF bli_spackm_ref_12xk_ri3 +#define BLIS_DPACKM_12XK_RI3_KERNEL_REF bli_dpackm_ref_12xk_ri3 +#define BLIS_CPACKM_12XK_RI3_KERNEL_REF bli_cpackm_ref_12xk_ri3 +#define BLIS_ZPACKM_12XK_RI3_KERNEL_REF bli_zpackm_ref_12xk_ri3 + +// packm_14xk_ri3 kernels + +#define BLIS_SPACKM_14XK_RI3_KERNEL_REF bli_spackm_ref_14xk_ri3 +#define BLIS_DPACKM_14XK_RI3_KERNEL_REF bli_dpackm_ref_14xk_ri3 +#define BLIS_CPACKM_14XK_RI3_KERNEL_REF bli_cpackm_ref_14xk_ri3 +#define BLIS_ZPACKM_14XK_RI3_KERNEL_REF bli_zpackm_ref_14xk_ri3 + +// packm_16xk_ri3 kernels + +#define BLIS_SPACKM_16XK_RI3_KERNEL_REF bli_spackm_ref_16xk_ri3 +#define BLIS_DPACKM_16XK_RI3_KERNEL_REF bli_dpackm_ref_16xk_ri3 +#define BLIS_CPACKM_16XK_RI3_KERNEL_REF bli_cpackm_ref_16xk_ri3 +#define BLIS_ZPACKM_16XK_RI3_KERNEL_REF bli_zpackm_ref_16xk_ri3 + +// unpackm_2xk kernels + +#define BLIS_SUNPACKM_2XK_KERNEL_REF bli_sunpackm_ref_2xk +#define BLIS_DUNPACKM_2XK_KERNEL_REF bli_dunpackm_ref_2xk +#define BLIS_CUNPACKM_2XK_KERNEL_REF bli_cunpackm_ref_2xk +#define BLIS_ZUNPACKM_2XK_KERNEL_REF bli_zunpackm_ref_2xk + +// unpackm_4xk kernels + +#define BLIS_SUNPACKM_4XK_KERNEL_REF bli_sunpackm_ref_4xk +#define
BLIS_DUNPACKM_4XK_KERNEL_REF bli_dunpackm_ref_4xk +#define BLIS_CUNPACKM_4XK_KERNEL_REF bli_cunpackm_ref_4xk +#define BLIS_ZUNPACKM_4XK_KERNEL_REF bli_zunpackm_ref_4xk + +// unpackm_6xk kernels + +#define BLIS_SUNPACKM_6XK_KERNEL_REF bli_sunpackm_ref_6xk +#define BLIS_DUNPACKM_6XK_KERNEL_REF bli_dunpackm_ref_6xk +#define BLIS_CUNPACKM_6XK_KERNEL_REF bli_cunpackm_ref_6xk +#define BLIS_ZUNPACKM_6XK_KERNEL_REF bli_zunpackm_ref_6xk + +// unpackm_8xk kernels + +#define BLIS_SUNPACKM_8XK_KERNEL_REF bli_sunpackm_ref_8xk +#define BLIS_DUNPACKM_8XK_KERNEL_REF bli_dunpackm_ref_8xk +#define BLIS_CUNPACKM_8XK_KERNEL_REF bli_cunpackm_ref_8xk +#define BLIS_ZUNPACKM_8XK_KERNEL_REF bli_zunpackm_ref_8xk + +// unpackm_10xk kernels + +#define BLIS_SUNPACKM_10XK_KERNEL_REF bli_sunpackm_ref_10xk +#define BLIS_DUNPACKM_10XK_KERNEL_REF bli_dunpackm_ref_10xk +#define BLIS_CUNPACKM_10XK_KERNEL_REF bli_cunpackm_ref_10xk +#define BLIS_ZUNPACKM_10XK_KERNEL_REF bli_zunpackm_ref_10xk + +// unpackm_12xk kernels + +#define BLIS_SUNPACKM_12XK_KERNEL_REF bli_sunpackm_ref_12xk +#define BLIS_DUNPACKM_12XK_KERNEL_REF bli_dunpackm_ref_12xk +#define BLIS_CUNPACKM_12XK_KERNEL_REF bli_cunpackm_ref_12xk +#define BLIS_ZUNPACKM_12XK_KERNEL_REF bli_zunpackm_ref_12xk + +// unpackm_14xk kernels + +#define BLIS_SUNPACKM_14XK_KERNEL_REF bli_sunpackm_ref_14xk +#define BLIS_DUNPACKM_14XK_KERNEL_REF bli_dunpackm_ref_14xk +#define BLIS_CUNPACKM_14XK_KERNEL_REF bli_cunpackm_ref_14xk +#define BLIS_ZUNPACKM_14XK_KERNEL_REF bli_zunpackm_ref_14xk + +// unpackm_16xk kernels + +#define BLIS_SUNPACKM_16XK_KERNEL_REF bli_sunpackm_ref_16xk +#define BLIS_DUNPACKM_16XK_KERNEL_REF bli_dunpackm_ref_16xk +#define BLIS_CUNPACKM_16XK_KERNEL_REF bli_cunpackm_ref_16xk +#define BLIS_ZUNPACKM_16XK_KERNEL_REF bli_zunpackm_ref_16xk + +// +// Level-1f +// + +// axpy2v kernels + +#define BLIS_SAXPY2V_KERNEL_REF bli_sssaxpy2v_ref +#define BLIS_DAXPY2V_KERNEL_REF bli_dddaxpy2v_ref +#define BLIS_CAXPY2V_KERNEL_REF bli_cccaxpy2v_ref +#define
BLIS_ZAXPY2V_KERNEL_REF bli_zzzaxpy2v_ref + +// dotaxpyv kernels + +#define BLIS_SDOTAXPYV_KERNEL_REF bli_sssdotaxpyv_ref +#define BLIS_DDOTAXPYV_KERNEL_REF bli_ddddotaxpyv_ref +#define BLIS_CDOTAXPYV_KERNEL_REF bli_cccdotaxpyv_ref +#define BLIS_ZDOTAXPYV_KERNEL_REF bli_zzzdotaxpyv_ref + +// axpyf kernels + +#define BLIS_SAXPYF_KERNEL_REF bli_sssaxpyf_ref +#define BLIS_DAXPYF_KERNEL_REF bli_dddaxpyf_ref +#define BLIS_CAXPYF_KERNEL_REF bli_cccaxpyf_ref +#define BLIS_ZAXPYF_KERNEL_REF bli_zzzaxpyf_ref + +// dotxf kernels + +#define BLIS_SDOTXF_KERNEL_REF bli_sssdotxf_ref +#define BLIS_DDOTXF_KERNEL_REF bli_ddddotxf_ref +#define BLIS_CDOTXF_KERNEL_REF bli_cccdotxf_ref +#define BLIS_ZDOTXF_KERNEL_REF bli_zzzdotxf_ref + +// dotxaxpyf kernels + +//#define BLIS_SDOTXAXPYF_KERNEL_REF bli_sssdotxaxpyf_ref_var1 +//#define BLIS_DDOTXAXPYF_KERNEL_REF bli_ddddotxaxpyf_ref_var1 +//#define BLIS_CDOTXAXPYF_KERNEL_REF bli_cccdotxaxpyf_ref_var1 +//#define BLIS_ZDOTXAXPYF_KERNEL_REF bli_zzzdotxaxpyf_ref_var1 +#define BLIS_SDOTXAXPYF_KERNEL_REF bli_sssdotxaxpyf_ref_var2 +#define BLIS_DDOTXAXPYF_KERNEL_REF bli_ddddotxaxpyf_ref_var2 +#define BLIS_CDOTXAXPYF_KERNEL_REF bli_cccdotxaxpyf_ref_var2 +#define BLIS_ZDOTXAXPYF_KERNEL_REF bli_zzzdotxaxpyf_ref_var2 + +// +// Level-1v +// + +// addv kernels + +#define BLIS_SADDV_KERNEL_REF bli_ssaddv_ref +#define BLIS_DADDV_KERNEL_REF bli_ddaddv_ref +#define BLIS_CADDV_KERNEL_REF bli_ccaddv_ref +#define BLIS_ZADDV_KERNEL_REF bli_zzaddv_ref + +// axpyv kernels + +#define BLIS_SAXPYV_KERNEL_REF bli_sssaxpyv_ref +#define BLIS_DAXPYV_KERNEL_REF bli_dddaxpyv_ref +#define BLIS_CAXPYV_KERNEL_REF bli_cccaxpyv_ref +#define BLIS_ZAXPYV_KERNEL_REF bli_zzzaxpyv_ref + +// copyv kernels + +#define BLIS_SCOPYV_KERNEL_REF bli_sscopyv_ref +#define BLIS_DCOPYV_KERNEL_REF bli_ddcopyv_ref +#define BLIS_CCOPYV_KERNEL_REF bli_cccopyv_ref +#define BLIS_ZCOPYV_KERNEL_REF bli_zzcopyv_ref + +// dotv kernels + +#define BLIS_SDOTV_KERNEL_REF bli_sssdotv_ref +#define 
BLIS_DDOTV_KERNEL_REF bli_ddddotv_ref +#define BLIS_CDOTV_KERNEL_REF bli_cccdotv_ref +#define BLIS_ZDOTV_KERNEL_REF bli_zzzdotv_ref + +// dotxv kernels + +#define BLIS_SDOTXV_KERNEL_REF bli_sssdotxv_ref +#define BLIS_DDOTXV_KERNEL_REF bli_ddddotxv_ref +#define BLIS_CDOTXV_KERNEL_REF bli_cccdotxv_ref +#define BLIS_ZDOTXV_KERNEL_REF bli_zzzdotxv_ref + +// invertv kernels + +#define BLIS_SINVERTV_KERNEL_REF bli_sinvertv_ref +#define BLIS_DINVERTV_KERNEL_REF bli_dinvertv_ref +#define BLIS_CINVERTV_KERNEL_REF bli_cinvertv_ref +#define BLIS_ZINVERTV_KERNEL_REF bli_zinvertv_ref + +// scal2v kernels + +#define BLIS_SSCAL2V_KERNEL_REF bli_sssscal2v_ref +#define BLIS_DSCAL2V_KERNEL_REF bli_dddscal2v_ref +#define BLIS_CSCAL2V_KERNEL_REF bli_cccscal2v_ref +#define BLIS_ZSCAL2V_KERNEL_REF bli_zzzscal2v_ref + +// scalv kernels + +#define BLIS_SSCALV_KERNEL_REF bli_ssscalv_ref +#define BLIS_DSCALV_KERNEL_REF bli_ddscalv_ref +#define BLIS_CSCALV_KERNEL_REF bli_ccscalv_ref +#define BLIS_ZSCALV_KERNEL_REF bli_zzscalv_ref + +// setv kernels + +#define BLIS_SSETV_KERNEL_REF bli_sssetv_ref +#define BLIS_DSETV_KERNEL_REF bli_ddsetv_ref +#define BLIS_CSETV_KERNEL_REF bli_ccsetv_ref +#define BLIS_ZSETV_KERNEL_REF bli_zzsetv_ref + +// subv kernels + +#define BLIS_SSUBV_KERNEL_REF bli_sssubv_ref +#define BLIS_DSUBV_KERNEL_REF bli_ddsubv_ref +#define BLIS_CSUBV_KERNEL_REF bli_ccsubv_ref +#define BLIS_ZSUBV_KERNEL_REF bli_zzsubv_ref + +// swapv kernels + +#define BLIS_SSWAPV_KERNEL_REF bli_ssswapv_ref +#define BLIS_DSWAPV_KERNEL_REF bli_ddswapv_ref +#define BLIS_CSWAPV_KERNEL_REF bli_ccswapv_ref +#define BLIS_ZSWAPV_KERNEL_REF bli_zzswapv_ref + + + +#endif + diff --git a/frame/include/bli_kernel_prototypes.h b/frame/include/bli_kernel_prototypes.h new file mode 100644 index 000000000..10849dc9b --- /dev/null +++ b/frame/include/bli_kernel_prototypes.h @@ -0,0 +1,741 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_KERNEL_PROTOTYPES_H +#define BLIS_KERNEL_PROTOTYPES_H + + +// -- Define PASTEMAC-friendly kernel function name macros --------------------- + +// +// Level-3 +// Each bli_<ch>..._UKERNEL alias maps to the matching BLIS_<CH>..._UKERNEL macro; presumably PASTEMAC(ch,kername) pastes to that alias so the prototype names the configured kernel (PASTEMAC is defined elsewhere - confirm). + +// gemm micro-kernels + +#define bli_sGEMM_UKERNEL BLIS_SGEMM_UKERNEL +#define bli_dGEMM_UKERNEL BLIS_DGEMM_UKERNEL +#define bli_cGEMM_UKERNEL BLIS_CGEMM_UKERNEL +#define bli_zGEMM_UKERNEL BLIS_ZGEMM_UKERNEL + +#undef GENTPROT +#define GENTPROT( ctype, ch, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( GEMM_UKERNEL ) + +// gemmtrsm_l micro-kernels + +#define bli_sGEMMTRSM_L_UKERNEL BLIS_SGEMMTRSM_L_UKERNEL +#define bli_dGEMMTRSM_L_UKERNEL BLIS_DGEMMTRSM_L_UKERNEL +#define bli_cGEMMTRSM_L_UKERNEL BLIS_CGEMMTRSM_L_UKERNEL +#define bli_zGEMMTRSM_L_UKERNEL BLIS_ZGEMMTRSM_L_UKERNEL + +#undef GENTPROT +#define GENTPROT( ctype, ch, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a10, \ + ctype* restrict a11, \ + ctype* restrict b01, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( GEMMTRSM_L_UKERNEL ) + +// gemmtrsm_u micro-kernels + +#define bli_sGEMMTRSM_U_UKERNEL BLIS_SGEMMTRSM_U_UKERNEL +#define bli_dGEMMTRSM_U_UKERNEL BLIS_DGEMMTRSM_U_UKERNEL +#define bli_cGEMMTRSM_U_UKERNEL BLIS_CGEMMTRSM_U_UKERNEL +#define bli_zGEMMTRSM_U_UKERNEL BLIS_ZGEMMTRSM_U_UKERNEL + +#undef GENTPROT +#define GENTPROT( ctype, ch, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a12, \ + ctype* restrict a11, \ + ctype* restrict b21, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( GEMMTRSM_U_UKERNEL ) + +// trsm_l micro-kernels +
+#define bli_sTRSM_L_UKERNEL BLIS_STRSM_L_UKERNEL +#define bli_dTRSM_L_UKERNEL BLIS_DTRSM_L_UKERNEL +#define bli_cTRSM_L_UKERNEL BLIS_CTRSM_L_UKERNEL +#define bli_zTRSM_L_UKERNEL BLIS_ZTRSM_L_UKERNEL + +#undef GENTPROT +#define GENTPROT( ctype, ch, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + ctype* restrict a11, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( TRSM_L_UKERNEL ) + +// trsm_u micro-kernels + +#define bli_sTRSM_U_UKERNEL BLIS_STRSM_U_UKERNEL +#define bli_dTRSM_U_UKERNEL BLIS_DTRSM_U_UKERNEL +#define bli_cTRSM_U_UKERNEL BLIS_CTRSM_U_UKERNEL +#define bli_zTRSM_U_UKERNEL BLIS_ZTRSM_U_UKERNEL + +#undef GENTPROT +#define GENTPROT( ctype, ch, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + ctype* restrict a11, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROT_BASIC( TRSM_U_UKERNEL ) + + +// +// Level-3 4m +// + +// gemm4m micro-kernels + +#define bli_cGEMM4M_UKERNEL BLIS_CGEMM4M_UKERNEL +#define bli_zGEMM4M_UKERNEL BLIS_ZGEMM4M_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( GEMM4M_UKERNEL ) + +// gemmtrsm4m_l micro-kernels + +#define bli_cGEMMTRSM4M_L_UKERNEL BLIS_CGEMMTRSM4M_L_UKERNEL +#define bli_zGEMMTRSM4M_L_UKERNEL BLIS_ZGEMMTRSM4M_L_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a10, \ + ctype* restrict a11, \ + ctype* restrict b01, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( 
GEMMTRSM4M_L_UKERNEL ) + +// gemmtrsm4m_u micro-kernels + +#define bli_cGEMMTRSM4M_U_UKERNEL BLIS_CGEMMTRSM4M_U_UKERNEL +#define bli_zGEMMTRSM4M_U_UKERNEL BLIS_ZGEMMTRSM4M_U_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a12, \ + ctype* restrict a11, \ + ctype* restrict b21, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( GEMMTRSM4M_U_UKERNEL ) + +// trsm4m_l micro-kernels + +#define bli_cTRSM4M_L_UKERNEL BLIS_CTRSM4M_L_UKERNEL +#define bli_zTRSM4M_L_UKERNEL BLIS_ZTRSM4M_L_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + ctype_r* restrict a11r, \ + ctype_r* restrict b11r, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( TRSM4M_L_UKERNEL ) + +// trsm4m_u micro-kernels + +#define bli_cTRSM4M_U_UKERNEL BLIS_CTRSM4M_U_UKERNEL +#define bli_zTRSM4M_U_UKERNEL BLIS_ZTRSM4M_U_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + ctype_r* restrict a11r, \ + ctype_r* restrict b11r, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( TRSM4M_U_UKERNEL ) + + +// +// Level-3 3m +// + +// gemm3m micro-kernels + +#define bli_cGEMM3M_UKERNEL BLIS_CGEMM3M_UKERNEL +#define bli_zGEMM3M_UKERNEL BLIS_ZGEMM3M_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( GEMM3M_UKERNEL ) + +// gemmtrsm3m_l micro-kernels + +#define 
bli_cGEMMTRSM3M_L_UKERNEL BLIS_CGEMMTRSM3M_L_UKERNEL +#define bli_zGEMMTRSM3M_L_UKERNEL BLIS_ZGEMMTRSM3M_L_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a10, \ + ctype* restrict a11, \ + ctype* restrict b01, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( GEMMTRSM3M_L_UKERNEL ) + +// gemmtrsm3m_u micro-kernels + +#define bli_cGEMMTRSM3M_U_UKERNEL BLIS_CGEMMTRSM3M_U_UKERNEL +#define bli_zGEMMTRSM3M_U_UKERNEL BLIS_ZGEMMTRSM3M_U_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a12, \ + ctype* restrict a11, \ + ctype* restrict b21, \ + ctype* restrict b11, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( GEMMTRSM3M_U_UKERNEL ) + +// trsm3m_l micro-kernels + +#define bli_cTRSM3M_L_UKERNEL BLIS_CTRSM3M_L_UKERNEL +#define bli_zTRSM3M_L_UKERNEL BLIS_ZTRSM3M_L_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + ctype_r* restrict a11r, \ + ctype_r* restrict b11r, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( TRSM3M_L_UKERNEL ) + +// trsm3m_u micro-kernels + +#define bli_cTRSM3M_U_UKERNEL BLIS_CTRSM3M_U_UKERNEL +#define bli_zTRSM3M_U_UKERNEL BLIS_ZTRSM3M_U_UKERNEL + +#undef GENTPROTCO +#define GENTPROTCO( ctype, ctype_r, ch, chr, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + ctype_r* restrict a11r, \ + ctype_r* restrict b11r, \ + ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* data \ + ); + +INSERT_GENTPROTCO_BASIC( TRSM3M_U_UKERNEL ) + + +// +// Level-1m +// + +// NOTE: We don't need any PASTEMAC-friendly aliases to packm 
kernel +// macros because they are used directly in the initialization of the +// function pointer array, rather than via a templatizing wrapper macro. + + +// +// Level-1f +// + +// axpy2v kernels + +#define bli_sssAXPY2V_KERNEL BLIS_SAXPY2V_KERNEL +#define bli_dddAXPY2V_KERNEL BLIS_DAXPY2V_KERNEL +#define bli_cccAXPY2V_KERNEL BLIS_CAXPY2V_KERNEL +#define bli_zzzAXPY2V_KERNEL BLIS_ZAXPY2V_KERNEL + +#undef GENTPROT3U12 +#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ +\ +void PASTEMAC3(chx,chy,chz,kername) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_xy* restrict alpha1, \ + ctype_xy* restrict alpha2, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_z* restrict z, inc_t incz \ + ); + +INSERT_GENTPROT3U12_BASIC( AXPY2V_KERNEL ) + +// dotaxpyv kernels + +#define bli_sssDOTAXPYV_KERNEL BLIS_SDOTAXPYV_KERNEL +#define bli_dddDOTAXPYV_KERNEL BLIS_DDOTAXPYV_KERNEL +#define bli_cccDOTAXPYV_KERNEL BLIS_CDOTAXPYV_KERNEL +#define bli_zzzDOTAXPYV_KERNEL BLIS_ZDOTAXPYV_KERNEL + +#undef GENTPROT3U12 +#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ +\ +void PASTEMAC3(chx,chy,chz,kername) \ + ( \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + ctype_x* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_xy* restrict rho, \ + ctype_z* restrict z, inc_t incz \ + ); + +INSERT_GENTPROT3U12_BASIC( DOTAXPYV_KERNEL ) + +// axpyf kernels + +#define bli_sssAXPYF_KERNEL BLIS_SAXPYF_KERNEL +#define bli_dddAXPYF_KERNEL BLIS_DAXPYF_KERNEL +#define bli_cccAXPYF_KERNEL BLIS_CAXPYF_KERNEL +#define bli_zzzAXPYF_KERNEL BLIS_ZAXPYF_KERNEL + +#undef GENTPROT3U12 +#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ +\ +void PASTEMAC3(cha,chx,chy,kername) \ + ( \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ax* restrict alpha, 
\ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT3U12_BASIC( AXPYF_KERNEL ) + +// dotxf kernels + +#define bli_sssDOTXF_KERNEL BLIS_SDOTXF_KERNEL +#define bli_dddDOTXF_KERNEL BLIS_DDOTXF_KERNEL +#define bli_cccDOTXF_KERNEL BLIS_CDOTXF_KERNEL +#define bli_zzzDOTXF_KERNEL BLIS_ZDOTXF_KERNEL + +#undef GENTPROT3U12 +#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ +\ +void PASTEMAC3(cha,chx,chy,kername) \ + ( \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ax* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict beta, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT3U12_BASIC( DOTXF_KERNEL ) + +// dotxaxpyf kernels + +#define bli_sssDOTXAXPYF_KERNEL BLIS_SDOTXAXPYF_KERNEL +#define bli_dddDOTXAXPYF_KERNEL BLIS_DDOTXAXPYF_KERNEL +#define bli_cccDOTXAXPYF_KERNEL BLIS_CDOTXAXPYF_KERNEL +#define bli_zzzDOTXAXPYF_KERNEL BLIS_ZDOTXAXPYF_KERNEL + +#undef GENTPROT3U12 +#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, kername ) \ +\ +void PASTEMAC3(cha,chb,chc,kername) \ + ( \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + ctype_ab* restrict alpha, \ + ctype_a* restrict a, inc_t inca, inc_t lda, \ + ctype_b* restrict w, inc_t incw, \ + ctype_b* restrict x, inc_t incx, \ + ctype_c* restrict beta, \ + ctype_c* restrict y, inc_t incy, \ + ctype_c* restrict z, inc_t incz \ + ); + +INSERT_GENTPROT3U12_BASIC( DOTXAXPYF_KERNEL ) + + +// +// Level-1v +// + +// addv kernels + +#define bli_ssADDV_KERNEL BLIS_SADDV_KERNEL +#define bli_ddADDV_KERNEL BLIS_DADDV_KERNEL +#define bli_ccADDV_KERNEL BLIS_CADDV_KERNEL +#define bli_zzADDV_KERNEL BLIS_ZADDV_KERNEL + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ +\ +void 
PASTEMAC2(chx,chy,kername) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( ADDV_KERNEL ) + +// axpyv kernels + +#define bli_sssAXPYV_KERNEL BLIS_SAXPYV_KERNEL +#define bli_dddAXPYV_KERNEL BLIS_DAXPYV_KERNEL +#define bli_cccAXPYV_KERNEL BLIS_CAXPYV_KERNEL +#define bli_zzzAXPYV_KERNEL BLIS_ZAXPYV_KERNEL + +#undef GENTPROT3 +#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, kername ) \ +\ +void PASTEMAC3(cha,chx,chy,kername) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_a* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT3_BASIC( AXPYV_KERNEL ) + +// copyv kernels + +#define bli_ssCOPYV_KERNEL BLIS_SCOPYV_KERNEL +#define bli_ddCOPYV_KERNEL BLIS_DCOPYV_KERNEL +#define bli_ccCOPYV_KERNEL BLIS_CCOPYV_KERNEL +#define bli_zzCOPYV_KERNEL BLIS_ZCOPYV_KERNEL + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ +\ +void PASTEMAC2(chx,chy,kername) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( COPYV_KERNEL ) + +// dotv kernels + +#define bli_sssDOTV_KERNEL BLIS_SDOTV_KERNEL +#define bli_dddDOTV_KERNEL BLIS_DDOTV_KERNEL +#define bli_cccDOTV_KERNEL BLIS_CDOTV_KERNEL +#define bli_zzzDOTV_KERNEL BLIS_ZDOTV_KERNEL + +#undef GENTPROT3 +#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, kername ) \ +\ +void PASTEMAC3(chx,chy,chr,kername) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_r* restrict rho \ + ); + +INSERT_GENTPROT3_BASIC( DOTV_KERNEL ) + +// dotxv kernels + +#define bli_sssDOTXV_KERNEL BLIS_SDOTXV_KERNEL +#define bli_dddDOTXV_KERNEL BLIS_DDOTXV_KERNEL +#define bli_cccDOTXV_KERNEL BLIS_CDOTXV_KERNEL +#define bli_zzzDOTXV_KERNEL BLIS_ZDOTXV_KERNEL + +#undef GENTPROT3U12 +#define 
GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, kername ) \ +\ +void PASTEMAC3(chx,chy,chr,kername) \ + ( \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + ctype_xy* restrict alpha, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy, \ + ctype_r* restrict beta, \ + ctype_r* restrict rho \ + ); + +INSERT_GENTPROT3U12_BASIC( DOTXV_KERNEL ) + +// invertv kernels + +#define bli_sINVERTV_KERNEL BLIS_SINVERTV_KERNEL +#define bli_dINVERTV_KERNEL BLIS_DINVERTV_KERNEL +#define bli_cINVERTV_KERNEL BLIS_CINVERTV_KERNEL +#define bli_zINVERTV_KERNEL BLIS_ZINVERTV_KERNEL + +#undef GENTPROT +#define GENTPROT( ctype, ch, kername ) \ +\ +void PASTEMAC(ch,kername) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx \ + ); + +INSERT_GENTPROT_BASIC( INVERTV_KERNEL ) + +// scal2v kernels + +#define bli_sssSCAL2V_KERNEL BLIS_SSCAL2V_KERNEL +#define bli_dddSCAL2V_KERNEL BLIS_DSCAL2V_KERNEL +#define bli_cccSCAL2V_KERNEL BLIS_CSCAL2V_KERNEL +#define bli_zzzSCAL2V_KERNEL BLIS_ZSCAL2V_KERNEL + +#undef GENTPROT3 +#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, kername ) \ +\ +void PASTEMAC3(chb,chx,chy,kername) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT3_BASIC( SCAL2V_KERNEL ) + +// scalv kernels + +#define bli_ssSCALV_KERNEL BLIS_SSCALV_KERNEL +#define bli_ddSCALV_KERNEL BLIS_DSCALV_KERNEL +#define bli_ccSCALV_KERNEL BLIS_CSCALV_KERNEL +#define bli_zzSCALV_KERNEL BLIS_ZSCALV_KERNEL + +#undef GENTPROT2 +#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ +\ +void PASTEMAC2(chb,chx,kername) \ + ( \ + conj_t conjbeta, \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx \ + ); + +INSERT_GENTPROT2_BASIC( SCALV_KERNEL ) + +// setv kernels + +#define bli_ssSETV_KERNEL BLIS_SSETV_KERNEL +#define bli_ddSETV_KERNEL BLIS_DSETV_KERNEL +#define bli_ccSETV_KERNEL BLIS_CSETV_KERNEL +#define 
bli_zzSETV_KERNEL BLIS_ZSETV_KERNEL + +#undef GENTPROT2 +#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ +\ +void PASTEMAC2(chb,chx,kername) \ + ( \ + dim_t n, \ + ctype_b* restrict beta, \ + ctype_x* restrict x, inc_t incx \ + ); + +INSERT_GENTPROT2_BASIC( SETV_KERNEL ) + +// subv kernels + +#define bli_ssSUBV_KERNEL BLIS_SSUBV_KERNEL +#define bli_ddSUBV_KERNEL BLIS_DSUBV_KERNEL +#define bli_ccSUBV_KERNEL BLIS_CSUBV_KERNEL +#define bli_zzSUBV_KERNEL BLIS_ZSUBV_KERNEL + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ +\ +void PASTEMAC2(chx,chy,kername) \ + ( \ + conj_t conjx, \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( SUBV_KERNEL ) + +// swapv kernels + +#define bli_ssSWAPV_KERNEL BLIS_SSWAPV_KERNEL +#define bli_ddSWAPV_KERNEL BLIS_DSWAPV_KERNEL +#define bli_ccSWAPV_KERNEL BLIS_CSWAPV_KERNEL +#define bli_zzSWAPV_KERNEL BLIS_ZSWAPV_KERNEL + +#undef GENTPROT2 +#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ +\ +void PASTEMAC2(chx,chy,kername) \ + ( \ + dim_t n, \ + ctype_x* restrict x, inc_t incx, \ + ctype_y* restrict y, inc_t incy \ + ); + +INSERT_GENTPROT2_BASIC( SWAPV_KERNEL ) + + + +#endif + diff --git a/frame/include/bli_mem_pool_macro_defs.h b/frame/include/bli_mem_pool_macro_defs.h index 7f5b5ef0b..b3fe7a3c7 100644 --- a/frame/include/bli_mem_pool_macro_defs.h +++ b/frame/include/bli_mem_pool_macro_defs.h @@ -114,6 +114,7 @@ #define BLIS_DEFAULT_MAXR_Z BLIS_DEFAULT_NR_Z #endif + // Next, we define the dimensions of the pool blocks for each datatype. 
// @@ -156,6 +157,47 @@ #define BLIS_POOL_KC_Z ( ( BLIS_MAXIMUM_KC_Z * BLIS_PACKDIM_KR_Z ) \ / BLIS_DEFAULT_KR_Z ) +// +// Compute pool dimensions for single complex (4m) +// +#define BLIS_POOL_4M_MC_C ( ( BLIS_MAXIMUM_4M_MC_C * BLIS_PACKDIM_MAXR_S ) \ + / BLIS_DEFAULT_MAXR_S ) +#define BLIS_POOL_4M_NC_C ( ( BLIS_MAXIMUM_4M_NC_C * BLIS_PACKDIM_MAXR_S ) \ + / BLIS_DEFAULT_MAXR_S ) +#define BLIS_POOL_4M_KC_C ( ( BLIS_MAXIMUM_4M_KC_C * BLIS_PACKDIM_KR_S ) \ + / BLIS_DEFAULT_KR_S ) + +// +// Compute pool dimensions for double complex (4m) +// +#define BLIS_POOL_4M_MC_Z ( ( BLIS_MAXIMUM_4M_MC_Z * BLIS_PACKDIM_MAXR_D ) \ + / BLIS_DEFAULT_MAXR_D ) +#define BLIS_POOL_4M_NC_Z ( ( BLIS_MAXIMUM_4M_NC_Z * BLIS_PACKDIM_MAXR_D ) \ + / BLIS_DEFAULT_MAXR_D ) +#define BLIS_POOL_4M_KC_Z ( ( BLIS_MAXIMUM_4M_KC_Z * BLIS_PACKDIM_KR_D ) \ + / BLIS_DEFAULT_KR_D ) + +// +// Compute pool dimensions for single complex (3m) +// +#define BLIS_POOL_3M_MC_C ( ( BLIS_MAXIMUM_3M_MC_C * BLIS_PACKDIM_MAXR_S ) \ + / BLIS_DEFAULT_MAXR_S ) +#define BLIS_POOL_3M_NC_C ( ( BLIS_MAXIMUM_3M_NC_C * BLIS_PACKDIM_MAXR_S ) \ + / BLIS_DEFAULT_MAXR_S ) +#define BLIS_POOL_3M_KC_C ( ( BLIS_MAXIMUM_3M_KC_C * BLIS_PACKDIM_KR_S ) \ + / BLIS_DEFAULT_KR_S ) + +// +// Compute pool dimensions for double complex (3m) +// +#define BLIS_POOL_3M_MC_Z ( ( BLIS_MAXIMUM_3M_MC_Z * BLIS_PACKDIM_MAXR_D ) \ + / BLIS_DEFAULT_MAXR_D ) +#define BLIS_POOL_3M_NC_Z ( ( BLIS_MAXIMUM_3M_NC_Z * BLIS_PACKDIM_MAXR_D ) \ + / BLIS_DEFAULT_MAXR_D ) +#define BLIS_POOL_3M_KC_Z ( ( BLIS_MAXIMUM_3M_KC_Z * BLIS_PACKDIM_KR_D ) \ + / BLIS_DEFAULT_KR_D ) + + // Now, we compute the size of each block/panel of A, B, and C for each // datatype. @@ -168,19 +210,12 @@ // // Compute memory pool block sizes for single real. 
// + #define BLIS_MK_BLOCK_SIZE_S ( BLIS_POOL_MC_S * \ - ( BLIS_POOL_KC_S + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_S \ - ) \ - ) * \ + BLIS_POOL_KC_S * \ BLIS_SIZEOF_S \ ) -#define BLIS_KN_BLOCK_SIZE_S ( ( BLIS_POOL_KC_S + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_S \ - ) \ - ) * \ +#define BLIS_KN_BLOCK_SIZE_S ( BLIS_POOL_KC_S * \ BLIS_POOL_NC_S * \ BLIS_SIZEOF_S \ ) @@ -192,19 +227,12 @@ // // Compute memory pool block sizes for double real. // + #define BLIS_MK_BLOCK_SIZE_D ( BLIS_POOL_MC_D * \ - ( BLIS_POOL_KC_D + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_D \ - ) \ - ) * \ + BLIS_POOL_KC_D * \ BLIS_SIZEOF_D \ ) -#define BLIS_KN_BLOCK_SIZE_D ( ( BLIS_POOL_KC_D + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_D \ - ) \ - ) * \ +#define BLIS_KN_BLOCK_SIZE_D ( BLIS_POOL_KC_D * \ BLIS_POOL_NC_D * \ BLIS_SIZEOF_D \ ) @@ -216,19 +244,12 @@ // // Compute memory pool block sizes for single complex. // + #define BLIS_MK_BLOCK_SIZE_C ( BLIS_POOL_MC_C * \ - ( BLIS_POOL_KC_C + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_C \ - ) \ - ) * \ + BLIS_POOL_KC_C * \ BLIS_SIZEOF_C \ ) -#define BLIS_KN_BLOCK_SIZE_C ( ( BLIS_POOL_KC_C + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_C \ - ) \ - ) * \ +#define BLIS_KN_BLOCK_SIZE_C ( BLIS_POOL_KC_C * \ BLIS_POOL_NC_C * \ BLIS_SIZEOF_C \ ) @@ -238,21 +259,14 @@ ) // -// Compute memory pool block sizes for single complex. +// Compute memory pool block sizes for double complex. // + #define BLIS_MK_BLOCK_SIZE_Z ( BLIS_POOL_MC_Z * \ - ( BLIS_POOL_KC_Z + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_Z \ - ) \ - ) * \ + BLIS_POOL_KC_Z * \ BLIS_SIZEOF_Z \ ) -#define BLIS_KN_BLOCK_SIZE_Z ( ( BLIS_POOL_KC_Z + \ - ( BLIS_CONTIG_STRIDE_ALIGN_SIZE / \ - BLIS_SIZEOF_Z \ - ) \ - ) * \ +#define BLIS_KN_BLOCK_SIZE_Z ( BLIS_POOL_KC_Z * \ BLIS_POOL_NC_Z * \ BLIS_SIZEOF_Z \ ) @@ -261,6 +275,90 @@ BLIS_SIZEOF_Z \ ) +// +// Compute memory pool block sizes for single complex (4m). 
+// + +#define BLIS_MK_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_C * \ + BLIS_POOL_4M_KC_C * \ + BLIS_SIZEOF_C \ + ) +#define BLIS_KN_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_KC_C * \ + BLIS_POOL_4M_NC_C * \ + BLIS_SIZEOF_C \ + ) +#define BLIS_MN_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_C * \ + BLIS_POOL_4M_NC_C * \ + BLIS_SIZEOF_C \ + ) + +// +// Compute memory pool block sizes for double complex (4m). +// + +#define BLIS_MK_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_Z * \ + BLIS_POOL_4M_KC_Z * \ + BLIS_SIZEOF_Z \ + ) +#define BLIS_KN_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_KC_Z * \ + BLIS_POOL_4M_NC_Z * \ + BLIS_SIZEOF_Z \ + ) +#define BLIS_MN_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_Z * \ + BLIS_POOL_4M_NC_Z * \ + BLIS_SIZEOF_Z \ + ) + +// +// Compute memory pool block sizes for single complex (3m). +// + +// NOTE: We scale by 3/2 because 3m requires 50% more space than 4m. + +#define BLIS_MK_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_C * \ + BLIS_POOL_3M_KC_C * \ + ( BLIS_SIZEOF_C * \ + 3 \ + ) / 2 \ + ) +#define BLIS_KN_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_KC_C * \ + BLIS_POOL_3M_NC_C * \ + ( BLIS_SIZEOF_C * \ + 3 \ + ) / 2 \ + ) +#define BLIS_MN_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_C * \ + BLIS_POOL_3M_NC_C * \ + ( BLIS_SIZEOF_C * \ + 3 \ + ) / 2 \ + ) + +// +// Compute memory pool block sizes for double complex (3m). +// + +// NOTE: We scale by 3/2 because 3m requires 50% more space than 4m. 
+ +#define BLIS_MK_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_Z * \ + BLIS_POOL_3M_KC_Z * \ + ( BLIS_SIZEOF_Z * \ + 3 \ + ) / 2 \ + ) +#define BLIS_KN_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_KC_Z * \ + BLIS_POOL_3M_NC_Z * \ + ( BLIS_SIZEOF_Z * \ + 3 \ + ) / 2 \ + ) +#define BLIS_MN_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_Z * \ + BLIS_POOL_3M_NC_Z * \ + ( BLIS_SIZEOF_Z * \ + 3 \ + ) / 2 \ + ) + // -- Maximum block size search ------------------------------------------------ @@ -283,6 +381,22 @@ #undef BLIS_MK_BLOCK_SIZE #define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_Z #endif +#if BLIS_MK_BLOCK_SIZE_4M_C > BLIS_MK_BLOCK_SIZE +#undef BLIS_MK_BLOCK_SIZE +#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_4M_C +#endif +#if BLIS_MK_BLOCK_SIZE_4M_Z > BLIS_MK_BLOCK_SIZE +#undef BLIS_MK_BLOCK_SIZE +#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_4M_Z +#endif +#if BLIS_MK_BLOCK_SIZE_3M_C > BLIS_MK_BLOCK_SIZE +#undef BLIS_MK_BLOCK_SIZE +#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_3M_C +#endif +#if BLIS_MK_BLOCK_SIZE_3M_Z > BLIS_MK_BLOCK_SIZE +#undef BLIS_MK_BLOCK_SIZE +#define BLIS_MK_BLOCK_SIZE BLIS_MK_BLOCK_SIZE_3M_Z +#endif // // Find the largest block size for panels of B. @@ -300,6 +414,22 @@ #undef BLIS_KN_BLOCK_SIZE #define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_Z #endif +#if BLIS_KN_BLOCK_SIZE_4M_C > BLIS_KN_BLOCK_SIZE +#undef BLIS_KN_BLOCK_SIZE +#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_4M_C +#endif +#if BLIS_KN_BLOCK_SIZE_4M_Z > BLIS_KN_BLOCK_SIZE +#undef BLIS_KN_BLOCK_SIZE +#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_4M_Z +#endif +#if BLIS_KN_BLOCK_SIZE_3M_C > BLIS_KN_BLOCK_SIZE +#undef BLIS_KN_BLOCK_SIZE +#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_3M_C +#endif +#if BLIS_KN_BLOCK_SIZE_3M_Z > BLIS_KN_BLOCK_SIZE +#undef BLIS_KN_BLOCK_SIZE +#define BLIS_KN_BLOCK_SIZE BLIS_KN_BLOCK_SIZE_3M_Z +#endif // // Find the largest block size for panels of C. 
@@ -317,6 +447,22 @@ #undef BLIS_MN_BLOCK_SIZE #define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_Z #endif +#if BLIS_MN_BLOCK_SIZE_4M_C > BLIS_MN_BLOCK_SIZE +#undef BLIS_MN_BLOCK_SIZE +#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_4M_C +#endif +#if BLIS_MN_BLOCK_SIZE_4M_Z > BLIS_MN_BLOCK_SIZE +#undef BLIS_MN_BLOCK_SIZE +#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_4M_Z +#endif +#if BLIS_MN_BLOCK_SIZE_3M_C > BLIS_MN_BLOCK_SIZE +#undef BLIS_MN_BLOCK_SIZE +#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_3M_C +#endif +#if BLIS_MN_BLOCK_SIZE_3M_Z > BLIS_MN_BLOCK_SIZE +#undef BLIS_MN_BLOCK_SIZE +#define BLIS_MN_BLOCK_SIZE BLIS_MN_BLOCK_SIZE_3M_Z +#endif // -- Compute pool sizes ------------------------------------------------------- diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 63b9230be..6931ea074 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -878,6 +878,19 @@ bli_obj_width_stored( obj ) bli_obj_pack_status( obj ) == BLIS_PACKED_COL_PANELS ) +// Check if an object is packed for 4m/3m + +#define bli_obj_is_panel_packed_4m( obj ) \ +\ + ( bli_obj_pack_status( obj ) == BLIS_PACKED_ROW_PANELS_4M || \ + bli_obj_pack_status( obj ) == BLIS_PACKED_COL_PANELS_4M ) + +#define bli_obj_is_panel_packed_3m( obj ) \ +\ + ( bli_obj_pack_status( obj ) == BLIS_PACKED_ROW_PANELS_3M || \ + bli_obj_pack_status( obj ) == BLIS_PACKED_COL_PANELS_3M ) + + // Release object's pack (and cast) memory entries back to memory manager #define bli_obj_release_pack( obj_p ) \ diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index ea8ac35dd..8e9bd82aa 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -83,6 +83,9 @@ #include "bli_addjris.h" #include "bli_addjs.h" +#include "bli_add3ris.h" +#include "bli_add3s.h" + #include "bli_axpyris.h" #include "bli_axpys.h" #include "bli_axpyjris.h" @@ -103,6 +106,9 @@ #include "bli_copycjris.h" 
#include "bli_copycjs.h" +#include "bli_copyri3s.h" +#include "bli_copyjri3s.h" + #include "bli_dots.h" #include "bli_dotjs.h" @@ -129,6 +135,16 @@ #include "bli_scal2jris.h" #include "bli_scal2js.h" +#include "bli_scal2ri3s.h" +#include "bli_scal2jri3s.h" + +#include "bli_set0ris.h" +#include "bli_set0s.h" + +#include "bli_set1s.h" + +#include "bli_seti0s.h" + #include "bli_sqrt2ris.h" #include "bli_sqrt2s.h" @@ -164,6 +180,8 @@ #include "bli_xpbys_mxn.h" #include "bli_xpbys_mxn_uplo.h" +#include "bli_scalris_mxn_uplo.h" + // -- Miscellaneous macros -- diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index e6b02fe57..67e9b0052 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -244,6 +244,10 @@ typedef struct #define BLIS_BITVAL_PACKED_COLUMNS 0x30000 #define BLIS_BITVAL_PACKED_ROW_PANELS 0x40000 #define BLIS_BITVAL_PACKED_COL_PANELS 0x50000 +#define BLIS_BITVAL_PACKED_ROW_PANELS_4M 0x60000 +#define BLIS_BITVAL_PACKED_COL_PANELS_4M 0x70000 +#define BLIS_BITVAL_PACKED_ROW_PANELS_3M 0x80000 +#define BLIS_BITVAL_PACKED_COL_PANELS_3M 0x90000 #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER 0x100000 #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 @@ -346,13 +350,17 @@ typedef enum typedef enum { - BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, - BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, - BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, - BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, - BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, - BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, + BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, + BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, + BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, + BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, + BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, + BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, + BLIS_PACKED_COL_PANELS = 
BLIS_BITVAL_PACKED_COL_PANELS, + BLIS_PACKED_ROW_PANELS_4M = BLIS_BITVAL_PACKED_ROW_PANELS_4M, + BLIS_PACKED_COL_PANELS_4M = BLIS_BITVAL_PACKED_COL_PANELS_4M, + BLIS_PACKED_ROW_PANELS_3M = BLIS_BITVAL_PACKED_ROW_PANELS_3M, + BLIS_PACKED_COL_PANELS_3M = BLIS_BITVAL_PACKED_COL_PANELS_3M, } pack_t; @@ -460,10 +468,10 @@ typedef struct func_s - 3 == packed by columns - 4 == packed by row panels - 5 == packed by column panels - - 6 == unused - - 7 == unused - - 8 == unused - - 9 == unused + - 6 == packed by row panels (4m) + - 7 == packed by column panels (4m) + - 8 == packed by row panels (3m) + - 9 == packed by column panels (3m) 20 Packed panel order if upper-stored - 0 == forward order if upper - 1 == reverse order if upper diff --git a/frame/include/blis.h b/frame/include/blis.h index 53112413d..e097729b2 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -78,8 +78,12 @@ extern "C" { #include "bli_kernel.h" #include "bli_kernel_type_defs.h" +#include "bli_kernel_pre_macro_defs.h" #include "bli_kernel_macro_defs.h" +#include "bli_kernel_4m_macro_defs.h" +#include "bli_kernel_3m_macro_defs.h" #include "bli_kernel_post_macro_defs.h" +#include "bli_kernel_prototypes.h" // -- BLIS memory pool definitions -- diff --git a/frame/include/level0/bli_add3s.h b/frame/include/level0/bli_add3s.h new file mode 100644 index 000000000..89a0e69d5 --- /dev/null +++ b/frame/include/level0/bli_add3s.h @@ -0,0 +1,192 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_ADD3S_H +#define BLIS_ADD3S_H + +// add3s + +// Notes: +// - The first char encodes the type of a. +// - The second char encodes the type of b. +// - The third char encodes the type of c. 
+ + +// -- (axy) = (??s) ------------------------------------------------------------ + +#define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) +#define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) +#define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) +#define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) + +#define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) + +#define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) + +#define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_czsadd3s( a, b, c ) bli_sadd3ris( 
bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) +#define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) + +// -- (axy) = (??d) ------------------------------------------------------------ + +#define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) + +#define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) + +#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) + +#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), 
bli_dreal(c), bli_dimag(c) ) +#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) +#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) + +#ifndef BLIS_ENABLE_C99_COMPLEX + +// -- (axy) = (??c) ------------------------------------------------------------ + +#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) +#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) +#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) +#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) + +#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) + +#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) 
) +#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) + +#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) +#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) + +// -- (axy) = (??z) ------------------------------------------------------------ + +#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) + +#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) + +#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_dczadd3s( a, b, c ) bli_zadd3ris( 
bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) + +#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) +#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) + +#else // ifdef BLIS_ENABLE_C99_COMPLEX + +// -- (axy) = (??c) ------------------------------------------------------------ + +#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); } + +#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } + +#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } + +#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } + +// -- (axy) = (??z) ------------------------------------------------------------ + +#define bli_sszadd3s( a, b, c ) { 
(c) = (a) + (b); } +#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } + +#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } + +#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } + +#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } +#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } + +#endif // BLIS_ENABLE_C99_COMPLEX + + +#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) +#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) +#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) +#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) + + +#endif + diff --git a/frame/include/level0/bli_constants.h b/frame/include/level0/bli_constants.h index 49a0c4d52..d75314dd6 100644 --- a/frame/include/level0/bli_constants.h +++ b/frame/include/level0/bli_constants.h @@ -148,28 +148,5 @@ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, BLIS_MINUS_TWO ) ) -// set to constant - -// set1s - -#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) -#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) -#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) -#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) - -// set0s - -#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) -#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) -#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) -#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) - -// seti0s - -#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) -#define bli_dseti0s( a ) bli_dsetis( 0.0 , 
(a) ) -#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) -#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) - #endif diff --git a/frame/include/level0/bli_set0s.h b/frame/include/level0/bli_set0s.h new file mode 100644 index 000000000..3064b8aca --- /dev/null +++ b/frame/include/level0/bli_set0s.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SET0S_H +#define BLIS_SET0S_H + +#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) +#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) +#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) +#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) + +#endif + diff --git a/frame/include/level0/bli_set1s.h b/frame/include/level0/bli_set1s.h new file mode 100644 index 000000000..b7e3cdf56 --- /dev/null +++ b/frame/include/level0/bli_set1s.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET1S_H +#define BLIS_SET1S_H + +#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) +#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) +#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) +#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) + +#endif + diff --git a/frame/include/level0/bli_seti0s.h b/frame/include/level0/bli_seti0s.h new file mode 100644 index 000000000..3ec365938 --- /dev/null +++ b/frame/include/level0/bli_seti0s.h @@ -0,0 +1,44 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SETI0S_H +#define BLIS_SETI0S_H + +#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) +#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) +#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) +#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) + +#endif + diff --git a/frame/include/level0/old/bli_set0ris_mxn.h b/frame/include/level0/old/bli_set0ris_mxn.h new file mode 100644 index 000000000..b3b89c870 --- /dev/null +++ b/frame/include/level0/old/bli_set0ris_mxn.h @@ -0,0 +1,81 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET0RIS_MXN_H +#define BLIS_SET0RIS_MXN_H + +// set0ris_mxn + +#define bli_sset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ +{ \ + dim_t i, j; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + bli_sset0ris( *(ar + i*rs_a + j*cs_a), \ + *(ai + i*rs_a + j*cs_a) ); \ +} + +#define bli_dset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ +{ \ + dim_t i, j; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + bli_dset0ris( *(ar + i*rs_a + j*cs_a), \ + *(ai + i*rs_a + j*cs_a) ); \ +} + +#define bli_cset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ +{ \ + dim_t i, j; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + bli_cset0ris( *(ar + i*rs_a + j*cs_a), \ + *(ai + i*rs_a + j*cs_a) ); \ +} + +#define bli_zset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ +{ \ + dim_t i, j; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + bli_zset0ris( *(ar + i*rs_a + j*cs_a), \ + *(ai + i*rs_a + j*cs_a) ); \ +} + + +#endif diff --git a/frame/include/level0/ri/bli_add3ris.h 
b/frame/include/level0/ri/bli_add3ris.h new file mode 100644 index 000000000..1033b8b7a --- /dev/null +++ b/frame/include/level0/ri/bli_add3ris.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_ADD3RIS_H +#define BLIS_ADD3RIS_H + +// add3ris + +#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ +{ \ + (cr) = (ar) + (br); \ +} + +#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \ +{ \ + (cr) = (ar) + (br); \ +} + +#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ +{ \ + (cr) = (ar) + (br); \ + (ci) = (ai) + (bi); \ +} + +#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ +{ \ + (cr) = (ar) + (br); \ + (ci) = (ai) + (bi); \ +} + +#endif + diff --git a/config/template/kernels/1f/bli_dotxf_opt_var1.h b/frame/include/level0/ri/bli_scalris_mxn_uplo.h similarity index 51% rename from config/template/kernels/1f/bli_dotxf_opt_var1.h rename to frame/include/level0/ri/bli_scalris_mxn_uplo.h index ee21f51f8..ceaa02896 100644 --- a/config/template/kernels/1f/bli_dotxf_opt_var1.h +++ b/frame/include/level0/ri/bli_scalris_mxn_uplo.h @@ -32,32 +32,79 @@ */ +#ifndef BLIS_SCALRIS_MXN_UPLO_H +#define BLIS_SCALRIS_MXN_UPLO_H -// -// Prototype kernel interfaces. -// -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ +// scalris_mxn_u + +#define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ +{ \ + dim_t i, j; \ \ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict beta, \ - ctype_y* restrict y, inc_t incy \ - ); + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + if ( (doff_t)j - (doff_t)i >= diagoff ) \ + { \ + bli_cscalris( *(ar), \ + *(ai), \ + *((xr) + i*rs_x + j*cs_x), \ + *((xi) + i*rs_x + j*cs_x) ); \ + } \ + } \ +} -INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 ) +#define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ +{ \ + dim_t i, j; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + if ( (doff_t)j - (doff_t)i >= diagoff ) \ + { \ + 
bli_zscalris( *(ar), \ + *(ai), \ + *((xr) + i*rs_x + j*cs_x), \ + *((xi) + i*rs_x + j*cs_x) ); \ + } \ + } \ +} + +// scalris_mxn_l + +#define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ +{ \ + dim_t i, j; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + if ( (doff_t)j - (doff_t)i <= diagoff ) \ + { \ + bli_cscalris( *(ar), \ + *(ai), \ + *((xr) + i*rs_x + j*cs_x), \ + *((xi) + i*rs_x + j*cs_x) ); \ + } \ + } \ +} + +#define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ +{ \ + dim_t i, j; \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + { \ + if ( (doff_t)j - (doff_t)i <= diagoff ) \ + { \ + bli_zscalris( *(ar), \ + *(ai), \ + *((xr) + i*rs_x + j*cs_x), \ + *((xi) + i*rs_x + j*cs_x) ); \ + } \ + } \ +} -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxf_opt_var1 ) #endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxf_opt_var1 ) -#endif - diff --git a/frame/include/level0/ri/bli_set0ris.h b/frame/include/level0/ri/bli_set0ris.h new file mode 100644 index 000000000..7a0e72912 --- /dev/null +++ b/frame/include/level0/ri/bli_set0ris.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET0RIS_H +#define BLIS_SET0RIS_H + +// set0ris + +#define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) +#define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) +#define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) +#define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) + +#endif + diff --git a/frame/include/level0/ri3/bli_copyjri3s.h b/frame/include/level0/ri3/bli_copyjri3s.h new file mode 100644 index 000000000..9aca17c50 --- /dev/null +++ b/frame/include/level0/ri3/bli_copyjri3s.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_COPYJRI3S_H +#define BLIS_COPYJRI3S_H + +// copyjri3s + +#define bli_scopyjri3s( ar, ai, br, bi, bri ) bli_scopyri3s( (ar), -(ai), (br), (bi), (bri) ) +#define bli_dcopyjri3s( ar, ai, br, bi, bri ) bli_dcopyri3s( (ar), -(ai), (br), (bi), (bri) ) +#define bli_ccopyjri3s( ar, ai, br, bi, bri ) bli_ccopyri3s( (ar), -(ai), (br), (bi), (bri) ) +#define bli_zcopyjri3s( ar, ai, br, bi, bri ) bli_zcopyri3s( (ar), -(ai), (br), (bi), (bri) ) + +#endif + diff --git a/frame/include/level0/ri3/bli_copyri3s.h b/frame/include/level0/ri3/bli_copyri3s.h new file mode 100644 index 000000000..9ad0425ef --- /dev/null +++ b/frame/include/level0/ri3/bli_copyri3s.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_COPYRI3S_H +#define BLIS_COPYRI3S_H + +// copyri3s + +#define bli_scopyri3s( ar, ai, br, bi, bri ) \ +{ \ + (br) = (ar); \ +} + +#define bli_dcopyri3s( ar, ai, br, bi, bri ) \ +{ \ + (br) = (ar); \ +} + +#define bli_ccopyri3s( ar, ai, br, bi, bri ) \ +{ \ + (br) = (ar); \ + (bi) = (ai); \ + (bri) = (ar) + (ai); \ +} + +#define bli_zcopyri3s( ar, ai, br, bi, bri ) \ +{ \ + (br) = (ar); \ + (bi) = (ai); \ + (bri) = (ar) + (ai); \ +} + +#endif + diff --git a/frame/include/level0/ri3/bli_scal2jri3s.h b/frame/include/level0/ri3/bli_scal2jri3s.h new file mode 100644 index 000000000..a00a38e6b --- /dev/null +++ b/frame/include/level0/ri3/bli_scal2jri3s.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SCAL2JRI3S_H +#define BLIS_SCAL2JRI3S_H + +// scal2jri3s + +#define bli_sscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ +} + +#define bli_dscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ +} + +#define bli_cscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr) + (ai) * (xi); \ + (yi) = (ai) * (xr) - (ar) * (xi); \ + (yri) = (yr) + (yi); \ +} + +#define bli_zscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr) + (ai) * (xi); \ + (yi) = (ai) * (xr) - (ar) * (xi); \ + (yri) = (yr) + (yi); \ +} + +#define bli_scscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ + (yi) = (ar) * -(xi); \ + (yri) = (yr) + (yi); \ +} + +#define bli_dzscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ + (yi) = (ar) * -(xi); \ + (yri) = (yr) + (yi); \ +} + +#endif + diff --git a/frame/include/level0/ri3/bli_scal2ri3s.h b/frame/include/level0/ri3/bli_scal2ri3s.h new file mode 100644 index 000000000..2776485a0 --- /dev/null +++ b/frame/include/level0/ri3/bli_scal2ri3s.h @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SCAL2RI3S_H +#define BLIS_SCAL2RI3S_H + +// scal2ri3s + +#define bli_sscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ +} + +#define bli_dscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ +} + +#define bli_cscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr) - (ai) * (xi); \ + (yi) = (ai) * (xr) + (ar) * (xi); \ + (yri) = (yr) + (yi); \ +} + +#define bli_zscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr) - (ai) * (xi); \ + (yi) = (ai) * (xr) + (ar) * (xi); \ + (yri) = (yr) + (yi); \ +} + +#define bli_scscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ + (yi) = (ar) * (xi); \ + (yri) = (yr) + (yi); \ +} + +#define bli_dzscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ +{ \ + (yr) = (ar) * (xr); \ + (yi) = (ar) * (xi); \ + (yri) = (yr) + (yi); \ +} + +#endif + diff --git a/kernels/arm/neon/3/bli_gemm_opt_4x4.c b/kernels/arm/neon/3/bli_gemm_opt_4x4.c index 96f92592d..54710b4ae 100644 --- a/kernels/arm/neon/3/bli_gemm_opt_4x4.c +++ b/kernels/arm/neon/3/bli_gemm_opt_4x4.c @@ -520,7 +520,7 @@ void bli_cgemm_opt_4x4( ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, @@ -540,7 +540,7 @@ void bli_zgemm_opt_4x4( ) { /* Just call the reference implementation. */ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, diff --git a/kernels/arm/neon/3/bli_gemm_opt_4x4.h b/kernels/arm/neon/3/bli_gemm_opt_4x4.h deleted file mode 100644 index 23f174240..000000000 --- a/kernels/arm/neon/3/bli_gemm_opt_4x4.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include "arm_neon.h" - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_opt_4x4 ) diff --git a/kernels/armv7a/3/bli_cgemm_kernel_2x2.S b/kernels/armv7a/3/bli_cgemm_kernel_2x2.S new file mode 100644 index 000000000..fd2be6fab --- /dev/null +++ b/kernels/armv7a/3/bli_cgemm_kernel_2x2.S @@ -0,0 +1,502 @@ + +#define REALNAME bli_cgemm_kernel_2x2 + +#define STACKSIZE 256 + +#define K r0 +#define PTR_ALPHA r1 +#define OLD_A r2 +#define OLD_B r3 +#define PTR_BETA [fp, #0 ] +#define OLD_C [fp, #4 ] +#define OLD_RSC [fp, #8 ] +#define OLD_CSC [fp, #12 ] +#define AUX [fp, #16 ] + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* register +*******************************************************/ + +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r7 +#define CO2 r8 + + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#define FMAC_BR fnmacs +#define FMAC_BI fmacs + +#define NN 1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define 
FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fnmacs + +#endif + + + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmuls s24 , s1, s9 + flds s3 , [ AO, #12 ] + fmuls s17 , s0, s9 + flds s10, [ BO, #8 ] + fmuls s25 , s1, s8 + + flds s11, [ BO, #12 ] + fmuls s18 , s2, s8 + add BO , BO, #16 + fmuls s26 , s3, s9 + add AO , AO, #16 + fmuls s19 , s2, s9 + pld [ BO , #B_PRE ] + fmuls s27 , s3, s8 + + pld [ AO , #A_PRE ] + fmuls s20 , s0, s10 + flds s4 , [ AO, #0 ] + fmuls s28 , s1, s11 + flds s5 , [ AO, #4 ] + fmuls s21 , s0, s11 + flds s12, [ BO ] + fmuls s29 , s1, s10 + + flds s13, [ BO, #4 ] + fmuls s22 , s2, s10 + flds s6 , [ AO, #8 ] + fmuls s30 , s3, s11 + flds s7 , [ AO, #12 ] + fmuls s23 , s2, s11 + flds s14, [ BO, #8 ] + fmuls s31 , s3, s10 + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacs s16 , s0, s8 + pld [ BO , #B_PRE ] + fmacs s24 , s1, s9 + flds s4 , [ AO, #0 ] + fmacs s17 , s0, s9 + flds s5 , [ AO, #4 ] + fmacs s25 , s1, s8 + + flds s12, [ BO ] + fmacs s18 , s2, s8 + flds s13, [ BO, #4 ] + fmacs s26 , s3, s9 + flds s6 , [ AO, #8 ] + fmacs s19 , s2, s9 + flds s7 , [ AO, #12 ] + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + flds s14, [ BO, #8 ] + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + flds s15, [ BO, #12 ] + fmacs 
s29 , s1, s10 + + fmacs s22 , s2, s10 + add BO , BO, #16 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + add AO , AO, #16 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + flds s0 , [ AO, #0 ] + fmacs s17 , s4, s13 + flds s1 , [ AO, #4 ] + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + flds s8 , [ BO ] + fmacs s26 , s7, s13 + flds s9 , [ BO, #4 ] + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s2 , [ AO, #8 ] + fmacs s20 , s4, s14 + flds s3 , [ AO, #12 ] + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + flds s10, [ BO, #8 ] + fmacs s29 , s5, s14 + + flds s11, [ BO, #12 ] + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + add BO , BO, #16 + fmacs s23 , s6, s15 + add AO , AO, #16 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s24 , s1, s9 + flds s3 , [ AO, #12 ] + fmacs s17 , s0, s9 + flds s10, [ BO, #8 ] + fmacs s25 , s1, s8 + + flds s11, [ BO, #12 ] + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + add BO , BO, #16 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + add AO , AO, #16 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #3 // multiply with size of complex float + + flds s0, [ PTR_ALPHA ] // load real part of alpha + flds s1, [ PTR_ALPHA, #4 ] // load imag part of alpha + ldr r4, 
PTR_BETA + flds s2, [ r4 ] // load real part of beta + flds s3, [ r4, #4 ] // load imag part of beta + + // Add/Sub the real and the imag parts + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + mov r4, CO1 // save pointer + fldmias CO1, { s4 - s5 } // read real and imag part from C + add CO1, CO1, r3 + + mov r2, CO2 // save pointer + fldmias CO2, { s8 - s9 } // read real and imag part from C + add CO2, CO2, r3 + + fmuls s24, s4, s2 // multiply Beta-real with C-real + fmuls s25, s5, s2 // multiply Beta-real with C-imag + fmuls s28, s8, s2 // multiply Beta-real with C-real + fmuls s29, s9, s2 // multiply Beta-real with C-imag + + FMAC_BR s24, s3, s5 // multiply beta-imag with C-imag and add + FMAC_BI s25, s3, s4 // multiply beta-imag with C-real and add + FMAC_BR s28, s3, s9 // multiply beta-imag with C-imag and add + FMAC_BI s29, s3, s8 // multiply beta-imag with C-real and add + + FMAC_R1 s24 , s0 , s16 + FMAC_I1 s25 , s0 , s17 + FMAC_R2 s24 , s1 , s17 + FMAC_I2 s25 , s1 , s16 + + FMAC_R1 s28 , s0 , s20 + FMAC_I1 s29 , s0 , s21 + FMAC_R2 s28 , s1 , s21 + FMAC_I2 s29 , s1 , s20 + + fldmias CO1, { s4 - s5 } // read real and imag part from C + fldmias CO2, { s8 - s9 } // read real and imag part from C + + fmuls s26, s4, s2 // multiply Beta-real with C-real + fmuls s27, s5, s2 // multiply Beta-real with C-imag + fmuls s30, s8, s2 // multiply Beta-real with C-real + fmuls s31, s9, s2 // multiply Beta-real with C-imag + + FMAC_BR s26, s3, s5 // multiply beta-imag with C-imag and add + FMAC_BI s27, s3, s4 // multiply beta-imag with C-real and add + FMAC_BR s30, s3, s9 // multiply beta-imag with C-imag and add + FMAC_BI s31, s3, s8 // multiply beta-imag with C-real and add + + FMAC_R1 s26 , s0 , s18 + FMAC_I1 s27 , s0 , s19 + FMAC_R2 s26 , s1 , s19 + FMAC_I2 s27 , s1 , s18 + + FMAC_R1 s30, s0 , s22 + FMAC_I1 s31, s0 , s23 + 
FMAC_R2 s30, s1 , s23 + FMAC_I2 s31, s1 , s22 + + mov CO1, r4 // restore pointer + mov CO2, r2 // restore pointer + fstmias CO1, { s24 - s25 } + fstmias CO2, { s28 - s29 } + add CO1, CO1, r3 + add CO2, CO2, r3 + fstmias CO1, { s26 - s27 } + fstmias CO2, { s30 - s31 } + + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + .arm + .global REALNAME + .func REALNAME + +REALNAME: + + push {r4 - r9, fp} // save register + add fp, sp, #28 // add number of saved register multiplied by size of int + sub sp, sp, #STACKSIZE // reserve stack + + mov AO, OLD_A // pointer matrix A + mov BO, OLD_B // pointer matrix B + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r2, OLD_C // pointer matrix C + ldr r3, OLD_CSC // Col stride size of C + lsl r3, r3, #3 // multiply with size of complex float + + mov CO1, r2 // first line of C + add CO2, CO1, r3 // second line of C + + pld [ CO1, #C_PRE ] // prefetch the lines of C + pld [ CO2, #C_PRE ] // prefetch the lines of C + +cgemm_kernel_L2_M2_20: + + asrs L , K, #3 // L = K / 8 + cmp L , #2 + blt cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #2 + ble cgemm_kernel_L2_M2_22a + .align 5 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + +cgemm_kernel_L2_M2_22a: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 
+ KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_40: + + INIT2x2 + +cgemm_kernel_L2_M2_44: + + ands L , K, #7 // L = K % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/kernels/armv7a/3/bli_dgemm_kernel_4x4.S b/kernels/armv7a/3/bli_dgemm_kernel_4x4.S new file mode 100644 index 000000000..fc0282846 --- /dev/null +++ b/kernels/armv7a/3/bli_dgemm_kernel_4x4.S @@ -0,0 +1,503 @@ + +#define REALNAME bli_dgemm_kernel_4x4 + +#define STACKSIZE 256 + +#define K r0 +#define PTR_ALPHA r1 +#define OLD_A r2 +#define OLD_B r3 +#define PTR_BETA [fp, #0 ] +#define OLD_C [fp, #4 ] +#define OLD_RSC [fp, #8 ] +#define OLD_CSC [fp, #12 ] +#define AUX [fp, #16 ] + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* register +*******************************************************/ + +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r7 +#define CO2 r8 +#define CO3 r9 +#define CO4 r12 + + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + fldd d8 , [ BO ] + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + + fldd d1 , 
[ AO, #8 ] + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmuld d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmuld d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmuld d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmuld d21 , d1, d9 + add BO , BO, #32 + add AO , AO, #32 + fmuld d22 , d2, d9 + + pld [ BO , #B_PRE ] + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + pld [ AO , #A_PRE ] + fldd d4 , [ AO, #0 ] + fmuld d24 , d0, d10 + fldd d5 , [ AO, #8 ] + fmuld d25 , d1, d10 + fldd d6 , [ AO, #16 ] + fmuld d26 , d2, d10 + fldd d7 , [ AO, #24 ] + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + +.endm + +.macro KERNEL4x4_M2 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE+32 ] + fmacd d17 , d5, d12 + fldd d0 , [ AO , #32 ] + fmacd d18 , d6, d12 + pld [ BO , #B_PRE+32 ] + fmacd d19 , d7, d12 + + fldd d8 , [ BO , #32 ] + fmacd d20 , d4, d13 + fldd d1 , [ AO, #40 ] + fmacd d21 , d5, d13 + fldd d2 , [ AO, #48 ] + fmacd d22 , d6, d13 + fldd d3 , [ AO, #56 ] + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fldd d9 , [ BO, #40 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #48 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #56 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + add AO , AO, #64 + fmacd d30 , d6, d15 + add BO , BO, #64 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M1 + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + fldd d4 , [ AO ] + fmacd d18 , d2, d8 + pld [ BO , #B_PRE ] + fmacd d19 , d3, d8 + + fldd d12, [ BO ] + fmacd d20 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d21 , d1, d9 + fldd d6 , [ AO, #16 ] + fmacd d22 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fldd d13, [ BO, #8 ] + fmacd d26 , d2, d10 + fldd d14, [ BO, #16 ] + fmacd d27 , d3, d10 + + fldd d15, [ BO, #24 ] + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + fmacd d30 , 
d2, d11 + fmacd d31 , d3, d11 + +.endm + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + fmacd d17 , d5, d12 + add BO , BO, #32 + fmacd d18 , d6, d12 + add AO , AO, #32 + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_SUB + + fldd d8 , [ BO ] + pld [ BO , #B_PRE ] + + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmacd d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmacd d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmacd d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #3 // multiply with size of double + + fldd d0, [ PTR_ALPHA ] // load alpha + ldr r4, PTR_BETA + fldd d1, [ r4 ] // load beta + +//----------------------------------------------------------- + mov r2, CO1 // save pointer + mov r4, CO2 // save pointer + fldd d8, [ CO1 ] // load value from C + fldd d12, [ CO2 ] // load value from C + fmuld d8, d8, d1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacd d8, d0, d16 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + fldd d9, [ CO1 ] // load value from C + fldd d13, [ CO2 ] // load value from C + fmuld d9, d9, d1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacd d9, d0, d17 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + 
+ fldd d10, [ CO1 ] // load value from C + fldd d14, [ CO2 ] // load value from C + fmuld d10, d10, d1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacd d10, d0, d18 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + fldd d11, [ CO1 ] // load value from C + fldd d15, [ CO2 ] // load value from C + fmuld d11, d11, d1 // multiply with beta + mov CO1, r2 // restore pointer + fmacd d11, d0, d19 // multiply sum with alpha and add to value of C + mov CO2, r4 // restore pointer + + fstd d8, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fstd d9, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fstd d10, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fstd d11, [ CO1 ] // store value in C + +//----------------------------------------------------------- + mov r2, CO3 // save pointer + fldd d8, [ CO3 ] // load value from C + fmuld d12, d12, d1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacd d12, d0, d20 // multiply sum with alpha and add to value of C + + fldd d9, [ CO3 ] // load value from C + fmuld d13, d13, d1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacd d13, d0, d21 // multiply sum with alpha and add to value of C + + fldd d10, [ CO3 ] // load value from C + fmuld d14, d14, d1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacd d14, d0, d22 // multiply sum with alpha and add to value of C + + fldd d11, [ CO3 ] // load value from C + fmuld d15, d15, d1 // multiply with beta + mov CO3, r2 // restore pointer + fmacd d15, d0, d23 // multiply sum with alpha and add to value of C + + fstd d12, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fstd d13, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fstd d14, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fstd d15, [ CO2 ] // store 
value in C + +//----------------------------------------------------------- + mov r4, CO4 // save pointer + fldd d12, [ CO4 ] // load value from C + fmuld d8, d8, d1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacd d8, d0, d24 // multiply sum with alpha and add to value of C + + fldd d13, [ CO4 ] // load value from C + fmuld d9, d9, d1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacd d9, d0, d25 // multiply sum with alpha and add to value of C + + fldd d14, [ CO4 ] // load value from C + fmuld d10, d10, d1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacd d10, d0, d26 // multiply sum with alpha and add to value of C + + fldd d15, [ CO4 ] // load value from C + fmuld d11, d11, d1 // multiply with beta + mov CO4, r4 // restore pointer + fmacd d11, d0, d27 // multiply sum with alpha and add to value of C + + +//----------------------------------------------------------- + fstd d8, [ CO3 ] // store value in C + fmuld d12, d12, d1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacd d12, d0, d28 // multiply sum with alpha and add to value of C + + fstd d9, [ CO3 ] // store value in C + fmuld d13, d13, d1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacd d13, d0, d29 // multiply sum with alpha and add to value of C + + fstd d10, [ CO3 ] // store value in C + fmuld d14, d14, d1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacd d14, d0, d30 // multiply sum with alpha and add to value of C + + fstd d11, [ CO3 ] // store value in C + fmuld d15, d15, d1 // multiply with beta + fstd d12, [ CO4 ] // store value in C + fmacd d15, d0, d31 // multiply sum with alpha and add to value of C + + add CO4 , CO4, r3 // compute next pointer + fstd d13, [ CO4 ] // store value in C + add CO4 , CO4, r3 // compute next pointer + fstd d14, [ CO4 ] // store value in C + add CO4 , CO4, r3 // compute next pointer + fstd d15, [ CO4 ] // store value in C + 
+.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + .arm + .global REALNAME + .func REALNAME + +REALNAME: + + push {r4 - r9, fp} // save register + add fp, sp, #28 // add number of saved register multiplied by size of int + sub sp, sp, #STACKSIZE // reserve stack + + mov AO, OLD_A // pointer matrix A + mov BO, OLD_B // pointer matrix B + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r2, OLD_C // pointer matrix C + ldr r3, OLD_CSC // Col stride size of C + lsl r3, r3, #3 // multiply with size of double + + mov CO1, r2 // first line of C + add CO2, CO1, r3 // second line of C + add CO3, CO2, r3 // third line of C + add CO4, CO3, r3 // fourth line of C + + pld [ CO1, #C_PRE ] // prefetch the first line of C + pld [ CO2, #C_PRE ] // prefetch the second line of C + pld [ CO3, #C_PRE ] // prefetch the third line of C + pld [ CO4, #C_PRE ] // prefetch the fourth line of C + +dgemm_kernel_L4_M4_20: + + asrs L , K, #3 // L = K / 8 + cmp L , #2 + blt dgemm_kernel_L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt dgemm_kernel_L4_M4_22 + +dgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + +dgemm_kernel_L4_M4_32: + + tst L, #1 + ble dgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + +dgemm_kernel_L4_M4_40: + + INIT4x4 + +dgemm_kernel_L4_M4_44: 
+ + ands L , K, #7 // L = K % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne dgemm_kernel_L4_M4_46 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/kernels/armv7a/3/bli_gemm_opt_4x4.c b/kernels/armv7a/3/bli_gemm_opt_4x4.c new file mode 100644 index 000000000..e1ea2d309 --- /dev/null +++ b/kernels/armv7a/3/bli_gemm_opt_4x4.c @@ -0,0 +1,134 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2012, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +extern void bli_sgemm_kernel_4x4(dim_t k, + float* alpha, + float* restrict a, + float* restrict b, + float* beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + + +void bli_sgemm_opt_4x4( + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) +{ + + bli_sgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data); + +} + +extern void bli_dgemm_kernel_4x4(dim_t k, + double* alpha, + double* restrict a, + double* restrict b, + double* beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + + +void bli_dgemm_opt_4x4( + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) +{ + bli_dgemm_kernel_4x4(k, alpha, a, b, beta, c, rs_c, cs_c, data); +} + +extern void bli_cgemm_kernel_2x2(dim_t k, + scomplex* alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + + + + +void bli_cgemm_opt_4x4( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) +{ + + bli_cgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data); +} + +extern void 
bli_zgemm_kernel_2x2(dim_t k, + dcomplex* alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ); + + +void bli_zgemm_opt_4x4( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) +{ + + bli_zgemm_kernel_2x2(k, alpha, a, b, beta, c, rs_c, cs_c, data); +} + diff --git a/kernels/armv7a/3/bli_sgemm_kernel_4x4.S b/kernels/armv7a/3/bli_sgemm_kernel_4x4.S new file mode 100644 index 000000000..0cbc30b83 --- /dev/null +++ b/kernels/armv7a/3/bli_sgemm_kernel_4x4.S @@ -0,0 +1,483 @@ + +#define REALNAME bli_sgemm_kernel_4x4 + +#define STACKSIZE 256 + +#define K r0 +#define PTR_ALPHA r1 +#define OLD_A r2 +#define OLD_B r3 +#define PTR_BETA [fp, #0 ] +#define OLD_C [fp, #4 ] +#define OLD_RSC [fp, #8 ] +#define OLD_CSC [fp, #12 ] +#define AUX [fp, #16 ] + +/****************************************************** +* [fp, #-128] - [fp, #-32] is reserved +* for store and restore of the floating point +* registers s8 - s31 (24 x 4 bytes) +*******************************************************/ + +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r7 +#define CO2 r8 +#define CO3 r9 +#define CO4 r12 + + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + pld [ AO , #A_PRE ] + fldmias AO!, { s0 - s1 } + pld [ BO , 
#B_PRE ] + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s3 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias BO!, { s8 - s11 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + //fldmias AO!, { s2 - s3 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + //fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s7 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias BO!, { s12 - s15 } + //fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + //fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + +.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , 
s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + + +.macro SAVE4x4 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #2 // multiply with size of float + + flds s0, [ PTR_ALPHA ] // load alpha + ldr r4, PTR_BETA + flds s1, [ r4 ] // load beta + +//----------------------------------------------------------- + mov r2, CO1 // save pointer + mov r4, CO2 // save pointer + flds s8, [ CO1 ] // load value from C + flds s12, [ CO2 ] // load value from C + fmuls s8, s8, s1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacs s8, s0, s16 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + flds s9, [ CO1 ] // load value from C + flds s13, [ CO2 ] // load value from C + fmuls s9, s9, s1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacs s9, s0, s17 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + flds s10, [ CO1 ] // load value from C + flds s14, [ CO2 ] // load value from C + fmuls s10, s10, s1 // multiply with beta + add CO1, CO1, r3 // compute next pointer + fmacs s10, s0, s18 // multiply sum with alpha and add to value of C + add CO2, CO2, r3 // compute next pointer + + flds s11, [ CO1 ] // load value from C + flds s15, [ CO2 ] // load value from C + fmuls s11, s11, s1 // multiply with beta + mov CO1, r2 // restore pointer + fmacs s11, s0, s19 // multiply sum with alpha 
and add to value of C + mov CO2, r4 // restore pointer + + fsts s8, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fsts s9, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fsts s10, [ CO1 ] // store value in C + add CO1 , CO1, r3 // compute next pointer + fsts s11, [ CO1 ] // store value in C + +//----------------------------------------------------------- + mov r2, CO3 // save pointer + flds s8, [ CO3 ] // load value from C + fmuls s12, s12, s1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacs s12, s0, s20 // multiply sum with alpha and add to value of C + + flds s9, [ CO3 ] // load value from C + fmuls s13, s13, s1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacs s13, s0, s21 // multiply sum with alpha and add to value of C + + flds s10, [ CO3 ] // load value from C + fmuls s14, s14, s1 // multiply with beta + add CO3, CO3, r3 // compute next pointer + fmacs s14, s0, s22 // multiply sum with alpha and add to value of C + + flds s11, [ CO3 ] // load value from C + fmuls s15, s15, s1 // multiply with beta + mov CO3, r2 // restore pointer + fmacs s15, s0, s23 // multiply sum with alpha and add to value of C + + fsts s12, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fsts s13, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fsts s14, [ CO2 ] // store value in C + add CO2 , CO2, r3 // compute next pointer + fsts s15, [ CO2 ] // store value in C + +//----------------------------------------------------------- + mov r4, CO4 // save pointer + flds s12, [ CO4 ] // load value from C + fmuls s8, s8, s1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacs s8, s0, s24 // multiply sum with alpha and add to value of C + + flds s13, [ CO4 ] // load value from C + fmuls s9, s9, s1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacs s9, s0, s25 // multiply sum with alpha and add to value 
of C + + flds s14, [ CO4 ] // load value from C + fmuls s10, s10, s1 // multiply with beta + add CO4, CO4, r3 // compute next pointer + fmacs s10, s0, s26 // multiply sum with alpha and add to value of C + + flds s15, [ CO4 ] // load value from C + fmuls s11, s11, s1 // multiply with beta + mov CO4, r4 // restore pointer + fmacs s11, s0, s27 // multiply sum with alpha and add to value of C + + +//----------------------------------------------------------- + fsts s8, [ CO3 ] // store value in C + fmuls s12, s12, s1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacs s12, s0, s28 // multiply sum with alpha and add to value of C + + fsts s9, [ CO3 ] // store value in C + fmuls s13, s13, s1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacs s13, s0, s29 // multiply sum with alpha and add to value of C + + fsts s10, [ CO3 ] // store value in C + fmuls s14, s14, s1 // multiply with beta + add CO3 , CO3, r3 // compute next pointer + fmacs s14, s0, s30 // multiply sum with alpha and add to value of C + + fsts s11, [ CO3 ] // store value in C + fmuls s15, s15, s1 // multiply with beta + fsts s12, [ CO4 ] // store value in C + fmacs s15, s0, s31 // multiply sum with alpha and add to value of C + + add CO4 , CO4, r3 // compute next pointer + fsts s13, [ CO4 ] // store value in C + add CO4 , CO4, r3 // compute next pointer + fsts s14, [ CO4 ] // store value in C + add CO4 , CO4, r3 // compute next pointer + fsts s15, [ CO4 ] // store value in C + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + .arm + .global REALNAME + .func REALNAME + +REALNAME: + + push {r4 - r9, fp} // save register + add fp, sp, #28 // add number of saved register multiplied by size of int + sub sp, sp, #STACKSIZE // reserve stack + + mov AO, OLD_A // pointer matrix A + mov BO, OLD_B // 
pointer matrix B + + sub r3, fp, #128 + vstm r3, { s8 - s31 } // store floating point registers + + ldr r2, OLD_C // pointer matrix C + ldr r3, OLD_CSC // Col stride size of C + lsl r3, r3, #2 // multiply with size of float + + mov CO1, r2 // first line of C + add CO2, CO1, r3 // second line of C + add CO3, CO2, r3 // third line of C + add CO4, CO3, r3 // fourth line of C + + pld [ CO1, #C_PRE ] // prefetch the first line of C + pld [ CO2, #C_PRE ] // prefetch the second line of C + pld [ CO3, #C_PRE ] // prefetch the third line of C + pld [ CO4, #C_PRE ] // prefetch the fourth line of C + +sgemm_kernel_L4_M4_20: + + asrs L , K, #3 // L = K / 8 + cmp L , #2 + blt sgemm_kernel_L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #2 + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: + + tst L, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_40: + + INIT4x4 + +sgemm_kernel_L4_M4_44: + + ands L , K, #7 // L = K % 8 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne sgemm_kernel_L4_M4_46 + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31 } // restore floating point registers + + sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/kernels/armv7a/3/bli_zgemm_kernel_2x2.S b/kernels/armv7a/3/bli_zgemm_kernel_2x2.S new file mode 100644 index 000000000..042827d0e --- 
/dev/null +++ b/kernels/armv7a/3/bli_zgemm_kernel_2x2.S @@ -0,0 +1,506 @@ + +#define REALNAME bli_zgemm_kernel_2x2 + +#define STACKSIZE 256 + +#define K r0 +#define PTR_ALPHA r1 +#define OLD_A r2 +#define OLD_B r3 +#define PTR_BETA [fp, #0 ] +#define OLD_C [fp, #4 ] +#define OLD_RSC [fp, #8 ] +#define OLD_CSC [fp, #12 ] +#define AUX [fp, #16 ] + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* register +*******************************************************/ + +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r7 +#define CO2 r8 + + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 0 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#define FMAC_BR fnmacd +#define FMAC_BI fmacd + +#define NN 1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fnmacd + +#endif + + + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, 
d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] 
+ fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3, OLD_RSC // Row stride size + lsl r3, r3, #4 // multiply with size of complex double + + fldd d0, [ PTR_ALPHA ] // load real part of alpha + fldd d1, [ PTR_ALPHA, #8 ] // load imag part of alpha + ldr r4, PTR_BETA + fldd d2, [ r4 ] // load real part of beta + fldd d3, [ r4, #8 ] // load imag part of beta + + // Add/Sub the real and the imag parts + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + mov r4, CO1 // save pointer + fldmiad CO1, { d4 - d5 } // read real and imag 
part from C + add CO1, CO1, r3 + + mov r2, CO2 // save pointer + fldmiad CO2, { d8 - d9 } // read real and imag part from C + add CO2, CO2, r3 + + fmuld d24, d4, d2 // multiply Beta-real with C-real + fmuld d25, d5, d2 // multiply Beta-real with C-imag + fmuld d28, d8, d2 // multiply Beta-real with C-real + fmuld d29, d9, d2 // multiply Beta-real with C-imag + + FMAC_BR d24, d3, d5 // multiply beta-imag with C-imag and add + FMAC_BI d25, d3, d4 // multiply beta-imag with C-real and add + FMAC_BR d28, d3, d9 // multiply beta-imag with C-imag and add + FMAC_BI d29, d3, d8 // multiply beta-imag with C-real and add + + FMAC_R1 d24 , d0 , d16 + FMAC_I1 d25 , d0 , d17 + FMAC_R2 d24 , d1 , d17 + FMAC_I2 d25 , d1 , d16 + + FMAC_R1 d28 , d0 , d20 + FMAC_I1 d29 , d0 , d21 + FMAC_R2 d28 , d1 , d21 + FMAC_I2 d29 , d1 , d20 + + fldmiad CO1, { d4 - d5 } // read real and imag part from C + fldmiad CO2, { d8 - d9 } // read real and imag part from C + + fmuld d26, d4, d2 // multiply Beta-real with C-real + fmuld d27, d5, d2 // multiply Beta-real with C-imag + fmuld d30, d8, d2 // multiply Beta-real with C-real + fmuld d31, d9, d2 // multiply Beta-real with C-imag + + FMAC_BR d26, d3, d5 // multiply beta-imag with C-imag and add + FMAC_BI d27, d3, d4 // multiply beta-imag with C-real and add + FMAC_BR d30, d3, d9 // multiply beta-imag with C-imag and add + FMAC_BI d31, d3, d8 // multiply beta-imag with C-real and add + + FMAC_R1 d26 , d0 , d18 + FMAC_I1 d27 , d0 , d19 + FMAC_R2 d26 , d1 , d19 + FMAC_I2 d27 , d1 , d18 + + FMAC_R1 d30, d0 , d22 + FMAC_I1 d31, d0 , d23 + FMAC_R2 d30, d1 , d23 + FMAC_I2 d31, d1 , d22 + + mov CO1, r4 // restore pointer + mov CO2, r2 // restore pointer + fstmiad CO1, { d24 - d25 } + fstmiad CO2, { d28 - d29 } + add CO1, CO1, r3 + add CO2, CO2, r3 + fstmiad CO1, { d26 - d27 } + fstmiad CO2, { d30 - d31 } + + +.endm + + + +/************************************************************************************** +* End of macro definitions 
+**************************************************************************************/ + + .arm + .global REALNAME + .func REALNAME + +REALNAME: + + push {r4 - r9, fp} // save register + add fp, sp, #28 // add number of saved register multiplied by size of int + sub sp, sp, #STACKSIZE // reserve stack + + mov AO, OLD_A // pointer matrix A + mov BO, OLD_B // pointer matrix B + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r2, OLD_C // pointer matrix C + ldr r3, OLD_CSC // Col stride size of C + lsl r3, r3, #4 // multiply with size of complex double + + mov CO1, r2 // first line of C + add CO2, CO1, r3 // second line of C + + pld [ CO1, #C_PRE ] // prefetch the lines of C + pld [ CO2, #C_PRE ] // prefetch the lines of C + +zgemm_kernel_L2_M2_20: + + asrs L , K, #3 // L = K / 8 + cmp L , #2 + blt zgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #2 + ble zgemm_kernel_L2_M2_22a + .align 5 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt zgemm_kernel_L2_M2_22 + +zgemm_kernel_L2_M2_22a: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_32: + + tst L, #1 + ble zgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_40: + + INIT2x2 + +zgemm_kernel_L2_M2_44: + + ands L , K, #7 // L = K % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne zgemm_kernel_L2_M2_46 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + 
sub sp, fp, #28 + pop {r4 - r9, fp} + bx lr + diff --git a/kernels/bgq/1/bli_axpyv_opt_var1.c b/kernels/bgq/1/bli_axpyv_opt_var1.c index 67c569033..a6fb43daa 100644 --- a/kernels/bgq/1/bli_axpyv_opt_var1.c +++ b/kernels/bgq/1/bli_axpyv_opt_var1.c @@ -34,136 +34,13 @@ #include "blis.h" -#define FUNCPTR_T axpyv_fp - -typedef void (*FUNCPTR_T)( - conj_t conjx, - dim_t n, - void* alpha, - void* x, inc_t incx, - void* y, inc_t incy - ); - -// If some mixed datatype functions will not be compiled, we initialize -// the corresponding elements of the function array to NULL. -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_opt_var1); -#else -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_opt_var1); -#else -static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_opt_var1); -#endif -#endif - - -void bli_axpyv_opt_var1( obj_t* alpha, - obj_t* x, - obj_t* y ) -{ - num_t dt_x = bli_obj_datatype( *x ); - num_t dt_y = bli_obj_datatype( *y ); - - conj_t conjx = bli_obj_conj_status( *x ); - dim_t n = bli_obj_vector_dim( *x ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - num_t dt_alpha; - void* buf_alpha; - - FUNCPTR_T f; - - // If alpha is a scalar constant, use dt_x to extract the address of the - // corresponding constant value; otherwise, use the datatype encoded - // within the alpha object and extract the buffer at the alpha offset. - bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_alpha][dt_x][dt_y]; - - // Invoke the function. 
- f( conjx, - n, - buf_alpha, - buf_x, inc_x, - buf_y, inc_y ); -} - - -#undef GENTFUNC3 -#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ -{ \ - ctype_a* alpha_cast = alpha; \ - ctype_x* x_cast = x; \ - ctype_y* y_cast = y; \ - ctype_x* chi1; \ - ctype_y* psi1; \ - dim_t i; \ -\ - if ( bli_zero_dim1( n ) ) return; \ -\ - chi1 = x_cast; \ - psi1 = y_cast; \ -\ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(cha,chx,chy,axpyjs)( *alpha_cast, *chi1, *psi1 ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ - else \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(cha,chx,chy,axpys)( *alpha_cast, *chi1, *psi1 ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ -} - -// Define the basic set of functions unconditionally, and then also some -// mixed datatype functions if requested. 
-//INSERT_GENTFUNC3_BASIC( axpyv, axpyv_opt_var1 ) -GENTFUNC3( float, float, float, s, s, s, axpyv, axpyv_opt_var1 ) -//GENTFUNC3( double, double, double, d, d, d, axpyv, axpyv_opt_var1 ) -GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, axpyv, axpyv_opt_var1 ) -GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, axpyv, axpyv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( axpyv, axpyv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( axpyv, axpyv_opt_var1 ) -#endif - -void bli_dddaxpyv_opt_var1( - conj_t conjx, - dim_t n, - void* alpha_in, - void* x_in, inc_t incx, - void* y_in, inc_t incy - ) +void bli_daxpyv_opt_var1( + conj_t conjx, + dim_t n, + double* restrict alpha_in, + double* restrict x_in, inc_t incx, + double* restrict y_in, inc_t incy + ) { double* restrict alpha = alpha_in; double* restrict x = x_in; @@ -180,7 +57,7 @@ void bli_dddaxpyv_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { printf("Defaulting to reference!"); - bli_dddaxpyv_unb_var1( conjx, n, alpha, x, incx, y, incy ); + BLIS_DAXPYV_KERNEL_REF( conjx, n, alpha, x, incx, y, incy ); return; } diff --git a/kernels/bgq/1/bli_axpyv_opt_var1.h b/kernels/bgq/1/bli_axpyv_opt_var1.h deleted file mode 100644 index 486d79159..000000000 --- a/kernels/bgq/1/bli_axpyv_opt_var1.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_axpyv_opt_var1( obj_t* alpha, - obj_t* x, - obj_t* y ); - - -#undef GENTPROT3 -#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( axpyv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 ) -#endif - diff --git a/kernels/bgq/1/bli_dotv_opt_var1.c b/kernels/bgq/1/bli_dotv_opt_var1.c index 3c4f0bdb3..7fceb2ec3 100644 --- a/kernels/bgq/1/bli_dotv_opt_var1.c +++ b/kernels/bgq/1/bli_dotv_opt_var1.c @@ -34,145 +34,14 @@ #include "blis.h" -#define FUNCPTR_T dotv_fp - -typedef void (*FUNCPTR_T)( - conj_t conjx, - conj_t conjy, - dim_t n, - void* x, inc_t incx, - void* y, inc_t incy, - void* rho - ); - -// If some mixed datatype functions will not be compiled, we initialize -// the corresponding elements of the function array to NULL. 
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_opt_var1); -#else -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_opt_var1); -#else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_opt_var1); -#endif -#endif - - -void bli_dotv_opt_var1( obj_t* x, - obj_t* y, - obj_t* rho ) -{ - num_t dt_x = bli_obj_datatype( *x ); - num_t dt_y = bli_obj_datatype( *y ); - num_t dt_rho = bli_obj_datatype( *rho ); - - conj_t conjx = bli_obj_conj_status( *x ); - conj_t conjy = bli_obj_conj_status( *y ); - dim_t n = bli_obj_vector_dim( *x ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - void* buf_rho = bli_obj_buffer_at_off( *rho ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_x][dt_y][dt_rho]; - - // Invoke the function. - f( conjx, - conjy, - n, - buf_x, inc_x, - buf_y, inc_y, - buf_rho ); -} - - -#undef GENTFUNC3 -#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \ -\ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho \ - ) \ -{ \ - ctype_x* x_cast = x; \ - ctype_y* y_cast = y; \ - ctype_r* rho_cast = rho; \ - ctype_x* chi1; \ - ctype_y* psi1; \ - ctype_r dotxy; \ - dim_t i; \ - conj_t conjx_use; \ -\ - if ( bli_zero_dim1( n ) ) \ - { \ - PASTEMAC(chr,set0s)( *rho_cast ); \ - return; \ - } \ -\ - PASTEMAC(chr,set0s)( dotxy ); \ -\ - chi1 = x_cast; \ - psi1 = y_cast; \ -\ - conjx_use = conjx; \ -\ - /* If y must be conjugated, we do so indirectly by first toggling the - effective conjugation of x and then conjugating the resulting dot - product. 
*/ \ - if ( bli_is_conj( conjy ) ) \ - bli_toggle_conj( conjx_use ); \ -\ - if ( bli_is_conj( conjx_use ) ) \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(chx,chy,chr,dotjs)( *chi1, *psi1, dotxy ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ - else \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(chx,chy,chr,dots)( *chi1, *psi1, dotxy ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ -\ - if ( bli_is_conj( conjy ) ) \ - PASTEMAC(chr,conjs)( dotxy ); \ -\ - PASTEMAC2(chr,chr,copys)( dotxy, *rho_cast ); \ -} - - - - - -void bli_ddddotv_opt_var1( - conj_t conjx, - conj_t conjy, - dim_t n, - void* x_in, inc_t incx, - void* y_in, inc_t incy, - void* rho_in - ) +void bli_ddotv_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict x_in, inc_t incx, + double* restrict y_in, inc_t incy, + double* restrict rho_in + ) { double* restrict x = x_in; double* restrict y = y_in; @@ -190,7 +59,7 @@ void bli_ddddotv_opt_var1( use_ref = TRUE; // Call the reference implementation if needed. if ( use_ref ) { - bli_ddddotv_unb_var1( conjx, conjy, n, x, incx, y, incy, rho ); + BLIS_DDOTV_KERNEL_REF( conjx, conjy, n, x, incx, y, incy, rho ); return; } @@ -227,19 +96,3 @@ void bli_ddddotv_opt_var1( *rho = rhos; } - -// Define the basic set of functions unconditionally, and then also some -// mixed datatype functions if requested. 
-//INSERT_GENTFUNC3_BASIC( dotv, dotv_opt_var1 ) -GENTFUNC3( float, float, float, s, s, s, dotv, dotv_opt_var1 ) -//GENTFUNC3( double, double, double, d, d, d, dotv, dotv_opt_var1 ) -GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, dotv, dotv_opt_var1 ) -GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, dotv, dotv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( dotv, dotv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( dotv, dotv_opt_var1 ) -#endif diff --git a/kernels/bgq/1/bli_dotv_opt_var1.h b/kernels/bgq/1/bli_dotv_opt_var1.h deleted file mode 100644 index 1174ef1e7..000000000 --- a/kernels/bgq/1/bli_dotv_opt_var1.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_dotv_opt_var1( obj_t* x, - obj_t* y, - obj_t* rho ); - - -#undef GENTPROT3 -#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \ -\ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho \ - ); - -INSERT_GENTPROT3_BASIC( dotv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( dotv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( dotv_opt_var1 ) -#endif - diff --git a/kernels/bgq/1f/bli_axpyf_opt_var1.c b/kernels/bgq/1f/bli_axpyf_opt_var1.c index fb3d42f5c..46b837689 100644 --- a/kernels/bgq/1f/bli_axpyf_opt_var1.c +++ b/kernels/bgq/1f/bli_axpyf_opt_var1.c @@ -36,19 +36,19 @@ -void bli_sssaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy - ) +void bli_saxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + float* restrict alpha, + float* restrict a, inc_t inca, inc_t lda, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy + ) { /* Just call the reference implementation. 
*/ - bli_sssaxpyf_unb_var1( conja, + BLIS_SAXPYF_KERNEL_REF( conja, conjx, m, b_n, @@ -60,16 +60,16 @@ void bli_sssaxpyf_opt_var1( -void bli_dddaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy - ) +void bli_daxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy + ) { if ( bli_zero_dim2( m, b_n ) ) return; @@ -84,7 +84,7 @@ void bli_dddaxpyf_opt_var1( { // printf("%d\t%d\t%d\t%d\t%d\t%d\n", PASTEMAC(d, axpyf_fusefac), inca, incx, incy, bli_is_unaligned_to(a, 32), bli_is_unaligned_to( y, 32)); // printf("DEFAULTING TO REFERENCE IMPLEMENTATION\n"); - PASTEMAC3(d,d,d,axpyf_unb_var1)( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy ); + BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy ); return; } @@ -172,19 +172,19 @@ void bli_dddaxpyf_opt_var1( -void bli_cccaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - scomplex* restrict alpha, - scomplex* restrict a, inc_t inca, inc_t lda, - scomplex* restrict x, inc_t incx, - scomplex* restrict y, inc_t incy - ) +void bli_caxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + scomplex* restrict alpha, + scomplex* restrict a, inc_t inca, inc_t lda, + scomplex* restrict x, inc_t incx, + scomplex* restrict y, inc_t incy + ) { /* Just call the reference implementation. 
*/ - bli_cccaxpyf_unb_var1( conja, + BLIS_CAXPYF_KERNEL_REF( conja, conjx, m, b_n, @@ -195,19 +195,19 @@ void bli_cccaxpyf_opt_var1( } -void bli_zzzaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - dcomplex* restrict alpha, - dcomplex* restrict a, inc_t inca, inc_t lda, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict y, inc_t incy - ) +void bli_zaxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + dcomplex* restrict alpha, + dcomplex* restrict a, inc_t inca, inc_t lda, + dcomplex* restrict x, inc_t incx, + dcomplex* restrict y, inc_t incy + ) { /* Just call the reference implementation. */ - bli_zzzaxpyf_unb_var1( conja, + BLIS_ZAXPYF_KERNEL_REF( conja, conjx, m, b_n, @@ -217,41 +217,3 @@ void bli_zzzaxpyf_opt_var1( y, incy ); } - - -// -// Define BLAS-like interfaces with heterogeneous-typed operands. -// -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy \ - ) \ -{ \ - /* Just call the reference implementation. */ \ - PASTEMAC3(cha,chx,chy,kername)( conja, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy ); \ -} - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 ) -#endif - diff --git a/kernels/bgq/1f/bli_axpyf_opt_var1.h b/kernels/bgq/1f/bli_axpyf_opt_var1.h deleted file mode 100644 index 539cd38ea..000000000 --- a/kernels/bgq/1f/bli_axpyf_opt_var1.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype axpyf kernel interfaces. 
-// -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( axpyf_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( axpyf_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( axpyf_opt_var1 ) -#endif - diff --git a/kernels/bgq/3/bli_gemm_8x8.c b/kernels/bgq/3/bli_gemm_8x8.c index bda491d93..2c1842f41 100644 --- a/kernels/bgq/3/bli_gemm_8x8.c +++ b/kernels/bgq/3/bli_gemm_8x8.c @@ -33,7 +33,6 @@ */ #include "blis.h" -#undef restrict void bli_sgemm_8x8( dim_t k, @@ -46,7 +45,7 @@ void bli_sgemm_8x8( ) { /* Just call the reference implementation. */ - bli_sgemm_ref_mxn( k, + BLIS_SGEMM_UKERNEL_REF( k, alpha, a, b, @@ -253,7 +252,7 @@ void bli_cgemm_8x8( ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, @@ -273,7 +272,7 @@ void bli_zgemm_8x8( ) { /* Just call the reference implementation. */ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, @@ -295,7 +294,7 @@ void bli_sgemm_8x8_mt( ) { /* Just call the reference implementation. */ - bli_sgemm_ref_mxn( k, + BLIS_SGEMM_UKERNEL_REF( k, alpha, a, b, @@ -316,7 +315,7 @@ void bli_cgemm_8x8_mt( ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, @@ -337,7 +336,7 @@ void bli_zgemm_8x8_mt( ) { /* Just call the reference implementation. 
*/ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, diff --git a/kernels/bgq/3/bli_gemm_8x8.h b/kernels/bgq/3/bli_gemm_8x8.h index be3a702c4..75401eecb 100644 --- a/kernels/bgq/3/bli_gemm_8x8.h +++ b/kernels/bgq/3/bli_gemm_8x8.h @@ -35,6 +35,7 @@ #include "blis.h" +/* #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ @@ -49,6 +50,7 @@ void PASTEMAC(ch,varname)( \ ); INSERT_GENTPROT_BASIC( gemm_8x8 ) +*/ #undef GENTPROT diff --git a/kernels/c99/3/bli_gemm_ref_4x4.h b/kernels/c99/3/bli_gemm_ref_4x4.h deleted file mode 100644 index fb39dcc92..000000000 --- a/kernels/c99/3/bli_gemm_ref_4x4.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_ref_4x4 ) - diff --git a/kernels/c99/3/bli_gemmtrsm_l_ref_4x4.h b/kernels/c99/3/bli_gemmtrsm_l_ref_4x4.h deleted file mode 100644 index 41840606b..000000000 --- a/kernels/c99/3/bli_gemmtrsm_l_ref_4x4.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict aL, \ - ctype* restrict a, \ - ctype* restrict bT, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemmtrsm_l_ref_4x4 ) - diff --git a/kernels/c99/3/bli_gemmtrsm_u_ref_4x4.h b/kernels/c99/3/bli_gemmtrsm_u_ref_4x4.h deleted file mode 100644 index 82de2cec4..000000000 --- a/kernels/c99/3/bli_gemmtrsm_u_ref_4x4.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict aR, \ - ctype* restrict a, \ - ctype* restrict bB, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemmtrsm_u_ref_4x4 ) - diff --git a/kernels/loongson3a/3/bli_gemm_opt_d4x4.c b/kernels/loongson3a/3/bli_gemm_opt_d4x4.c index e74de298c..b42249ac3 100644 --- a/kernels/loongson3a/3/bli_gemm_opt_d4x4.c +++ b/kernels/loongson3a/3/bli_gemm_opt_d4x4.c @@ -45,7 +45,7 @@ void bli_sgemm_opt_d4x4( ) { /* Just call the reference implementation. */ - bli_sgemm_ref_mxn( k, + BLIS_SGEMM_UKERNEL_REF( k, alpha, a, b, @@ -547,7 +547,7 @@ void bli_cgemm_opt_d4x4( ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, @@ -567,7 +567,7 @@ void bli_zgemm_opt_d4x4( ) { /* Just call the reference implementation. 
*/ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, diff --git a/kernels/loongson3a/3/bli_gemm_opt_d4x4.h b/kernels/loongson3a/3/bli_gemm_opt_d4x4.h deleted file mode 100644 index f9349a215..000000000 --- a/kernels/loongson3a/3/bli_gemm_opt_d4x4.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_opt_d4x4 ) - diff --git a/kernels/mic/3/bli_gemm_opt_30x8.c b/kernels/mic/3/bli_gemm_opt_30x8.c index c4d0ff11a..9a08c14b2 100644 --- a/kernels/mic/3/bli_gemm_opt_30x8.c +++ b/kernels/mic/3/bli_gemm_opt_30x8.c @@ -41,15 +41,22 @@ #define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed. void bli_sgemm_opt_30x8( - dim_t k, - float* alpha, - float* a, - float* b, - float* beta, - float* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data ) { + BLIS_SGEMM_UKERNEL_REF( k, + alpha, + a, + b, + beta, + c, rs_c, cs_c, + data ); } //Alternate code path uused if C is not row-major @@ -267,12 +274,13 @@ int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9 //#define MONITORS //#define LOOPMON void bli_dgemm_opt_30x8( - dim_t k, double* alpha, - double* a, - double* b, - double* beta, - double* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data ) { double * a_next = bli_auxinfo_next_a( data ); @@ -586,24 +594,40 @@ void bli_dgemm_opt_30x8( void bli_cgemm_opt_30x8( - dim_t k, scomplex* alpha, - scomplex* a, - scomplex* b, - scomplex* beta, - scomplex* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data ) { + 
BLIS_CGEMM_UKERNEL_REF( k, + alpha, + a, + b, + beta, + c, rs_c, cs_c, + data ); } void bli_zgemm_opt_30x8( - dim_t k, dcomplex* alpha, - dcomplex* a, - dcomplex* b, - dcomplex* beta, - dcomplex* c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data ) { + BLIS_ZGEMM_UKERNEL_REF( k, + alpha, + a, + b, + beta, + c, rs_c, cs_c, + data ); } diff --git a/kernels/mic/3/bli_gemm_opt_30x8.h b/kernels/mic/3/bli_gemm_opt_30x8.h deleted file mode 100644 index 76e0e2d8e..000000000 --- a/kernels/mic/3/bli_gemm_opt_30x8.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2012, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY - OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - dim_t k, \ - ctype* alpha, \ - ctype* a, \ - ctype* b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_opt_30x8 ) - diff --git a/kernels/power7/3/bli_gemm_opt_8x4.c b/kernels/power7/3/bli_gemm_opt_8x4.c index 89876a36e..ccfffcef1 100644 --- a/kernels/power7/3/bli_gemm_opt_8x4.c +++ b/kernels/power7/3/bli_gemm_opt_8x4.c @@ -73,7 +73,7 @@ void bli_sgemm_opt_8x4( } } #else - bli_sgemm_ref_mxn(k, alpha, a, b, beta, c, rs_c, cs_c, data); + BLIS_SGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); #endif } @@ -445,7 +445,7 @@ void bli_dgemm_opt_8x4( } } #else - bli_dgemm_ref_mxn(k, alpha, a, b, beta, c, rs_c, cs_c, data); + BLIS_DGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); #endif } } @@ -495,7 +495,7 @@ void bli_cgemm_opt_8x4( } } #else - bli_cgemm_ref_mxn(k, alpha, a, b, beta, c, rs_c, cs_c, data); + BLIS_CGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); #endif } @@ -544,7 +544,7 @@ void bli_zgemm_opt_8x4( } } #else - bli_zgemm_ref_mxn(k, alpha, a, b, beta, c, rs_c, cs_c, data); + BLIS_ZGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); #endif } diff --git a/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.c b/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.c index fdce15a53..8315c1e2a 100644 --- 
a/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.c +++ b/kernels/x86_64/avx/3/bli_gemm_opt_8x4_ref_u4_nodupl_avx1.c @@ -48,7 +48,7 @@ void bli_sgemm_opt_8x4_ref_u4_nodupl_avx1( ) { /* Just call the reference implementation. */ - bli_sgemm_ref_mxn( k, + BLIS_SGEMM_UKERNEL_REF( k, alpha, a, b, @@ -642,7 +642,7 @@ void bli_cgemm_opt_8x4_ref_u4_nodupl_avx1( ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, @@ -664,7 +664,7 @@ void bli_zgemm_opt_8x4_ref_u4_nodupl_avx1( ) { /* Just call the reference implementation. */ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, diff --git a/kernels/x86_64/core2-sse3/1/bli_axpyv_opt_var1.c b/kernels/x86_64/core2-sse3/1/bli_axpyv_opt_var1.c index b21d2d4c4..03d6bea14 100644 --- a/kernels/x86_64/core2-sse3/1/bli_axpyv_opt_var1.c +++ b/kernels/x86_64/core2-sse3/1/bli_axpyv_opt_var1.c @@ -34,129 +34,6 @@ #include "blis.h" -#define FUNCPTR_T axpyv_fp - -typedef void (*FUNCPTR_T)( - conj_t conjx, - dim_t n, - void* alpha, - void* x, inc_t incx, - void* y, inc_t incy - ); - -// If some mixed datatype functions will not be compiled, we initialize -// the corresponding elements of the function array to NULL. 
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_opt_var1); -#else -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_opt_var1); -#else -static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_opt_var1); -#endif -#endif - - -void bli_axpyv_opt_var1( obj_t* alpha, - obj_t* x, - obj_t* y ) -{ - num_t dt_x = bli_obj_datatype( *x ); - num_t dt_y = bli_obj_datatype( *y ); - - conj_t conjx = bli_obj_conj_status( *x ); - dim_t n = bli_obj_vector_dim( *x ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - num_t dt_alpha; - void* buf_alpha; - - FUNCPTR_T f; - - // If alpha is a scalar constant, use dt_x to extract the address of the - // corresponding constant value; otherwise, use the datatype encoded - // within the alpha object and extract the buffer at the alpha offset. - bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_alpha][dt_x][dt_y]; - - // Invoke the function. 
- f( conjx, - n, - buf_alpha, - buf_x, inc_x, - buf_y, inc_y ); -} - - -#undef GENTFUNC3 -#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ -{ \ - ctype_a* alpha_cast = alpha; \ - ctype_x* x_cast = x; \ - ctype_y* y_cast = y; \ - ctype_x* chi1; \ - ctype_y* psi1; \ - dim_t i; \ -\ - if ( bli_zero_dim1( n ) ) return; \ -\ - chi1 = x_cast; \ - psi1 = y_cast; \ -\ - if ( bli_is_conj( conjx ) ) \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(cha,chx,chy,axpyjs)( *alpha_cast, *chi1, *psi1 ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ - else \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(cha,chx,chy,axpys)( *alpha_cast, *chi1, *psi1 ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ -} - -// Define the basic set of functions unconditionally, and then also some -// mixed datatype functions if requested. 
-//INSERT_GENTFUNC3_BASIC( axpyv, axpyv_opt_var1 ) -GENTFUNC3( float, float, float, s, s, s, axpyv, axpyv_opt_var1 ) -//GENTFUNC3( double, double, double, d, d, d, axpyv, axpyv_opt_var1 ) -GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, axpyv, axpyv_opt_var1 ) -GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, axpyv, axpyv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( axpyv, axpyv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( axpyv, axpyv_opt_var1 ) -#endif - #include "pmmintrin.h" typedef union @@ -166,13 +43,13 @@ typedef union } v2df_t; -void bli_dddaxpyv_opt_var1( - conj_t conjx, - dim_t n, - void* alpha, - void* x, inc_t incx, - void* y, inc_t incy - ) +void bli_daxpyv_opt_var1( + conj_t conjx, + dim_t n, + double* restrict alpha, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy + ) { double* restrict alpha_cast = alpha; double* restrict x_cast = x; @@ -223,11 +100,11 @@ void bli_dddaxpyv_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_dddaxpyv_unb_var1( conjx, - n, - alpha, - x, incx, - y, incy ); + BLIS_DAXPYV_KERNEL_REF( conjx, + n, + alpha, + x, incx, + y, incy ); return; } diff --git a/kernels/x86_64/core2-sse3/1/bli_axpyv_opt_var1.h b/kernels/x86_64/core2-sse3/1/bli_axpyv_opt_var1.h deleted file mode 100644 index 486d79159..000000000 --- a/kernels/x86_64/core2-sse3/1/bli_axpyv_opt_var1.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_axpyv_opt_var1( obj_t* alpha, - obj_t* x, - obj_t* y ); - - -#undef GENTPROT3 -#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjx, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( axpyv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 ) -#endif - diff --git a/kernels/x86_64/core2-sse3/1/bli_dotv_opt_var1.c b/kernels/x86_64/core2-sse3/1/bli_dotv_opt_var1.c index ee70ee798..737df5d15 100644 --- a/kernels/x86_64/core2-sse3/1/bli_dotv_opt_var1.c +++ b/kernels/x86_64/core2-sse3/1/bli_dotv_opt_var1.c @@ -34,151 +34,6 @@ #include "blis.h" -#define FUNCPTR_T dotv_fp - -typedef void (*FUNCPTR_T)( - conj_t conjx, - conj_t conjy, - dim_t n, - void* x, inc_t incx, - void* y, inc_t incy, - void* rho - ); - -// If some mixed datatype functions will not be compiled, we initialize -// the corresponding elements of the function array to NULL. 
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_opt_var1); -#else -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_opt_var1); -#else -static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_opt_var1); -#endif -#endif - - -void bli_dotv_opt_var1( obj_t* x, - obj_t* y, - obj_t* rho ) -{ - num_t dt_x = bli_obj_datatype( *x ); - num_t dt_y = bli_obj_datatype( *y ); - num_t dt_rho = bli_obj_datatype( *rho ); - - conj_t conjx = bli_obj_conj_status( *x ); - conj_t conjy = bli_obj_conj_status( *y ); - dim_t n = bli_obj_vector_dim( *x ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - void* buf_rho = bli_obj_buffer_at_off( *rho ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_x][dt_y][dt_rho]; - - // Invoke the function. - f( conjx, - conjy, - n, - buf_x, inc_x, - buf_y, inc_y, - buf_rho ); -} - - -#undef GENTFUNC3 -#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \ -\ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho \ - ) \ -{ \ - ctype_x* x_cast = x; \ - ctype_y* y_cast = y; \ - ctype_r* rho_cast = rho; \ - ctype_x* chi1; \ - ctype_y* psi1; \ - ctype_r dotxy; \ - dim_t i; \ - conj_t conjx_use; \ -\ - if ( bli_zero_dim1( n ) ) \ - { \ - PASTEMAC(chr,set0s)( *rho_cast ); \ - return; \ - } \ -\ - PASTEMAC(chr,set0s)( dotxy ); \ -\ - chi1 = x_cast; \ - psi1 = y_cast; \ -\ - conjx_use = conjx; \ -\ - /* If y must be conjugated, we do so indirectly by first toggling the - effective conjugation of x and then conjugating the resulting dot - product. 
*/ \ - if ( bli_is_conj( conjy ) ) \ - bli_toggle_conj( conjx_use ); \ -\ - if ( bli_is_conj( conjx_use ) ) \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(chx,chy,chr,dotjs)( *chi1, *psi1, dotxy ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ - else \ - { \ - for ( i = 0; i < n; ++i ) \ - { \ - PASTEMAC3(chx,chy,chr,dots)( *chi1, *psi1, dotxy ); \ -\ - chi1 += incx; \ - psi1 += incy; \ - } \ - } \ -\ - if ( bli_is_conj( conjy ) ) \ - PASTEMAC(chr,conjs)( dotxy ); \ -\ - PASTEMAC2(chr,chr,copys)( dotxy, *rho_cast ); \ -} - -// Define the basic set of functions unconditionally, and then also some -// mixed datatype functions if requested. -//INSERT_GENTFUNC3_BASIC( dotv, dotv_opt_var1 ) -GENTFUNC3( float, float, float, s, s, s, dotv, dotv_opt_var1 ) -//GENTFUNC3( double, double, double, d, d, d, dotv, dotv_opt_var1 ) -GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, dotv, dotv_opt_var1 ) -GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, dotv, dotv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3_MIX_D( dotv, dotv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3_MIX_P( dotv, dotv_opt_var1 ) -#endif - - - #include "pmmintrin.h" typedef union @@ -188,14 +43,14 @@ typedef union } v2df_t; -void bli_ddddotv_opt_var1( - conj_t conjx, - conj_t conjy, - dim_t n, - void* x, inc_t incx, - void* y, inc_t incy, - void* rho - ) +void bli_ddotv_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + double* restrict rho + ) { double* restrict x_cast = x; double* restrict y_cast = y; @@ -247,12 +102,12 @@ void bli_ddddotv_opt_var1( // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - bli_ddddotv_unb_var1( conjx, - conjy, - n, - x, incx, - y, incy, - rho ); + BLIS_DDOTV_KERNEL_REF( conjx, + conjy, + n, + x, incx, + y, incy, + rho ); return; } diff --git a/kernels/x86_64/core2-sse3/1/bli_dotv_opt_var1.h b/kernels/x86_64/core2-sse3/1/bli_dotv_opt_var1.h deleted file mode 100644 index 1174ef1e7..000000000 --- a/kernels/x86_64/core2-sse3/1/bli_dotv_opt_var1.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_dotv_opt_var1( obj_t* x, - obj_t* y, - obj_t* rho ); - - -#undef GENTPROT3 -#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \ -\ -void PASTEMAC3(chx,chy,chr,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho \ - ); - -INSERT_GENTPROT3_BASIC( dotv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( dotv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( dotv_opt_var1 ) -#endif - diff --git a/kernels/x86_64/core2-sse3/1f/bli_axpy2v_opt_var1.c b/kernels/x86_64/core2-sse3/1f/bli_axpy2v_opt_var1.c index a0fbfdfe6..b2f3104c3 100644 --- a/kernels/x86_64/core2-sse3/1f/bli_axpy2v_opt_var1.c +++ b/kernels/x86_64/core2-sse3/1f/bli_axpy2v_opt_var1.c @@ -34,121 +34,6 @@ #include "blis.h" -/* -#define FUNCPTR_T axpy2v_fp - -typedef void (*FUNCPTR_T)( - conj_t conjx, - conj_t conjy, - dim_t n, - void* alpha, - void* x, inc_t incx, - void* y, inc_t incy - ); - -// If some mixed datatype functions will not be compiled, we initialize -// the corresponding elements of the function array to NULL. 
-#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -static FUNCPTR_T GENARRAY3_ALL(ftypes,axpy2v_unb_var1); -#else -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -static FUNCPTR_T GENARRAY3_EXT(ftypes,axpy2v_unb_var1); -#else -static FUNCPTR_T GENARRAY3_MIN(ftypes,axpy2v_unb_var1); -#endif -#endif - - -void bli_axpy2v_unb_var1( obj_t* alpha, - obj_t* x, - obj_t* y ) -{ - num_t dt_x = bli_obj_datatype( *x ); - num_t dt_y = bli_obj_datatype( *y ); - - conj_t conjx = bli_obj_conj_status( *x ); - conj_t conjy = bli_obj_conj_status( *y ); - dim_t n = bli_obj_vector_dim( *x ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - num_t dt_alpha; - void* buf_alpha; - - FUNCPTR_T f; - - // If alpha is a scalar constant, use dt_x to extract the address of the - // corresponding constant value; otherwise, use the datatype encoded - // within the alpha object and extract the buffer at the alpha offset. - bli_set_scalar_dt_buffer( alpha, dt_x, dt_alpha, buf_alpha ); - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt_alpha][dt_x][dt_y]; - - // Invoke the function. 
- f( conjx, - conjy, - n, - buf_alpha, - buf_x, inc_x, - buf_y, inc_y ); -} -*/ - - -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \ -\ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha1, \ - void* alpha2, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ) \ -{ \ - ctype_xy* alpha1_cast = alpha1; \ - ctype_xy* alpha2_cast = alpha2; \ - ctype_x* x_cast = x; \ - ctype_y* y_cast = y; \ - ctype_z* z_cast = z; \ -\ - PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \ - n, \ - alpha1_cast, \ - x_cast, incx, \ - z_cast, incz ); \ - PASTEMAC3(chxy,chy,chz,axpyv)( conjy, \ - n, \ - alpha2_cast, \ - y_cast, incy, \ - z_cast, incz ); \ -} - -// Define the basic set of functions unconditionally, and then also some -// mixed datatype functions if requested. -//INSERT_GENTFUNC3_BASIC( axpy2v, axpy2v_opt_var1 ) -GENTFUNC3U12( float, float, float, float, s, s, s, s, axpy2v, axpy2v_opt_var1 ) -//GENTFUNC3U12( double, double, double, double, d, d, d, d, axpy2v, axpy2v_opt_var1 ) -GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, axpy2v, axpy2v_opt_var1 ) -GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, axpy2v, axpy2v_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( axpy2v, axpy2v_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( axpy2v, axpy2v_opt_var1 ) -#endif - #include "pmmintrin.h" typedef union @@ -158,16 +43,16 @@ typedef union } v2df_t; -void bli_dddaxpy2v_opt_var1( - conj_t conjx, - conj_t conjy, - dim_t n, - void* alpha, - void* beta, - void* x, inc_t incx, - void* y, inc_t incy, - void* z, inc_t incz - ) +void bli_daxpy2v_opt_var1( + conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict alpha, + double* restrict beta, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + double* restrict z, inc_t incz 
+ ) { double* restrict alpha_cast = alpha; double* restrict beta_cast = beta; @@ -223,14 +108,14 @@ void bli_dddaxpy2v_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_dddaxpy2v_unb_var1( conjx, - conjy, - n, - alpha, - beta, - x, incx, - y, incy, - z, incz ); + BLIS_DAXPY2V_KERNEL_REF( conjx, + conjy, + n, + alpha, + beta, + x, incx, + y, incy, + z, incz ); return; } diff --git a/kernels/x86_64/core2-sse3/1f/bli_axpy2v_opt_var1.h b/kernels/x86_64/core2-sse3/1f/bli_axpy2v_opt_var1.h deleted file mode 100644 index 68c49d4f6..000000000 --- a/kernels/x86_64/core2-sse3/1f/bli_axpy2v_opt_var1.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* -void bli_axpy2v_opt_var1( obj_t* alpha, - obj_t* x, - obj_t* y ); -*/ - - -#undef GENTPROT3 -#define GENTPROT3( ctype_x, ctype_y, ctype_z, chx, chy, chz, varname ) \ -\ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha1, \ - void* alpha2, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ); - -INSERT_GENTPROT3_BASIC( axpy2v_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3_MIX_D( axpy2v_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3_MIX_P( axpy2v_opt_var1 ) -#endif - diff --git a/kernels/x86_64/core2-sse3/1f/bli_axpyf_opt_var1.c b/kernels/x86_64/core2-sse3/1f/bli_axpyf_opt_var1.c index bcd5c1e21..400c18236 100644 --- a/kernels/x86_64/core2-sse3/1f/bli_axpyf_opt_var1.c +++ b/kernels/x86_64/core2-sse3/1f/bli_axpyf_opt_var1.c @@ -35,30 +35,6 @@ #include "blis.h" -void bli_sssaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy - ) -{ - /* Just call the reference implementation. 
*/ - bli_sssaxpyf_unb_var1( conja, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - y, incy ); -} - - - #include "pmmintrin.h" typedef union { @@ -66,16 +42,16 @@ typedef union double d[2]; } v2df_t; -void bli_dddaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy - ) +void bli_daxpyf_opt_var1( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy + ) { double* restrict alpha_cast = alpha; double* restrict a_cast = a; @@ -135,14 +111,14 @@ void bli_dddaxpyf_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - PASTEMAC3(d,d,d,axpyf_unb_var1)( conja, - conjx, - m, - b_n, - alpha_cast, - a_cast, inca, lda, - x_cast, incx, - y_cast, incy ); + BLIS_DAXPYF_KERNEL_REF( conja, + conjx, + m, + b_n, + alpha_cast, + a_cast, inca, lda, + x_cast, incx, + y_cast, incy ); return; } @@ -256,50 +232,3 @@ void bli_dddaxpyf_opt_var1( } } - - -void bli_cccaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - scomplex* restrict alpha, - scomplex* restrict a, inc_t inca, inc_t lda, - scomplex* restrict x, inc_t incx, - scomplex* restrict y, inc_t incy - ) -{ - /* Just call the reference implementation. */ - bli_cccaxpyf_unb_var1( conja, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - y, incy ); -} - - - -void bli_zzzaxpyf_opt_var1( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - dcomplex* restrict alpha, - dcomplex* restrict a, inc_t inca, inc_t lda, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict y, inc_t incy - ) -{ - /* Just call the reference implementation. 
*/ - bli_zzzaxpyf_unb_var1( conja, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - y, incy ); -} diff --git a/kernels/x86_64/core2-sse3/1f/bli_axpyf_opt_var1.h b/kernels/x86_64/core2-sse3/1f/bli_axpyf_opt_var1.h deleted file mode 100644 index 539cd38ea..000000000 --- a/kernels/x86_64/core2-sse3/1f/bli_axpyf_opt_var1.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - - -// -// Prototype axpyf kernel interfaces. -// -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( axpyf_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( axpyf_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( axpyf_opt_var1 ) -#endif - diff --git a/kernels/x86_64/core2-sse3/1f/bli_dotaxpyv_opt_var1.c b/kernels/x86_64/core2-sse3/1f/bli_dotaxpyv_opt_var1.c index 74f16edb7..700e278e9 100644 --- a/kernels/x86_64/core2-sse3/1f/bli_dotaxpyv_opt_var1.c +++ b/kernels/x86_64/core2-sse3/1f/bli_dotaxpyv_opt_var1.c @@ -35,62 +35,6 @@ #include "blis.h" -#undef GENTFUNC3U12 -#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, opname, varname ) \ -\ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho, \ - void* z, inc_t incz \ - ) \ -{ \ - ctype_xy* one = PASTEMAC(chxy,1); \ - ctype_xy* zero = PASTEMAC(chxy,0); \ - ctype_xy* alpha_cast = alpha; \ - ctype_x* x_cast = x; \ - ctype_y* y_cast = y; \ - ctype_xy* rho_cast = rho; \ - ctype_z* z_cast = z; \ -\ - PASTEMAC3(chx,chy,chxy,dotxv)( conjxt, \ - conjy, \ - n, \ - one, \ - x_cast, incx, \ - y_cast, incy, \ - zero, \ - rho_cast ); \ - PASTEMAC3(chxy,chx,chz,axpyv)( conjx, \ - n, \ - alpha_cast, \ - x_cast, incx, \ - z_cast, incz ); \ -} - -// Define the basic set of functions unconditionally, and then also some -// mixed datatype functions if requested. 
-//INSERT_GENTFUNC3U12_BASIC( dotaxpyv, dotaxpyv_opt_var1 ) -GENTFUNC3U12( float, float, float, float, s, s, s, s, dotaxpyv, dotaxpyv_opt_var1 ) -//GENTFUNC3U12( double, double, double, double, d, d, d, d, dotaxpyv, dotaxpyv_opt_var1 ) -GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, dotaxpyv, dotaxpyv_opt_var1 ) -GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, dotaxpyv, dotaxpyv_opt_var1 ) - - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTFUNC3U12_MIX_D( dotaxpyv, dotaxpyv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTFUNC3U12_MIX_P( dotaxpyv, dotaxpyv_opt_var1 ) -#endif - - #include "pmmintrin.h" typedef union { @@ -99,17 +43,17 @@ typedef union } v2df_t; -void bli_ddddotaxpyv_opt_var1( - conj_t conjxt, - conj_t conjx, - conj_t conjy, - dim_t n, - void* alpha, - void* x, inc_t incx, - void* y, inc_t incy, - void* rho, - void* z, inc_t incz - ) +void bli_ddotaxpyv_opt_var1( + conj_t conjxt, + conj_t conjx, + conj_t conjy, + dim_t n, + double* restrict alpha, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + double* restrict rho, + double* restrict z, inc_t incz + ) { double* restrict alpha_cast = alpha; double* restrict x_cast = x; @@ -166,15 +110,15 @@ void bli_ddddotaxpyv_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - bli_ddddotaxpyv_unb_var1( conjxt, - conjx, - conjy, - n, - alpha, - x, incx, - y, incy, - rho, - z, incz ); + BLIS_DDOTAXPYV_KERNEL_REF( conjxt, + conjx, + conjy, + n, + alpha, + x, incx, + y, incy, + rho, + z, incz ); return; } diff --git a/kernels/x86_64/core2-sse3/1f/bli_dotaxpyv_opt_var1.h b/kernels/x86_64/core2-sse3/1f/bli_dotaxpyv_opt_var1.h deleted file mode 100644 index 3b35368cc..000000000 --- a/kernels/x86_64/core2-sse3/1f/bli_dotaxpyv_opt_var1.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname ) \ -\ -void PASTEMAC3(chx,chy,chz,varname)( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho, \ - void* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( dotaxpyv_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotaxpyv_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotaxpyv_opt_var1 ) -#endif - diff --git a/kernels/x86_64/core2-sse3/1f/bli_dotxaxpyf_opt_var1.c b/kernels/x86_64/core2-sse3/1f/bli_dotxaxpyf_opt_var1.c index 21a401d9d..62020b8f6 100644 --- a/kernels/x86_64/core2-sse3/1f/bli_dotxaxpyf_opt_var1.c +++ b/kernels/x86_64/core2-sse3/1f/bli_dotxaxpyf_opt_var1.c @@ -35,39 +35,6 @@ #include "blis.h" - -void bli_sssdotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict w, inc_t incw, - float* restrict x, inc_t incx, - float* restrict beta, - float* restrict y, inc_t incy, - float* restrict z, inc_t incz ) -{ - /* Just call the reference implementation. 
*/ - bli_sssdotxaxpyf_unb_var1( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha, - a, inca, lda, - w, incw, - x, incx, - beta, - y, incy, - z, incz ); -} - - - #include "pmmintrin.h" typedef union { @@ -76,19 +43,19 @@ typedef union } v2df_t; -void bli_ddddotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict w, inc_t incw, - double* restrict x, inc_t incx, - double* restrict beta, - double* restrict y, inc_t incy, - double* restrict z, inc_t incz ) +void bli_ddotxaxpyf_opt_var1( conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict w, inc_t incw, + double* restrict x, inc_t incx, + double* restrict beta, + double* restrict y, inc_t incy, + double* restrict z, inc_t incz ) { double* restrict alpha_cast = alpha; double* restrict beta_cast = beta; @@ -171,19 +138,19 @@ void bli_ddddotxaxpyf_opt_var1( conj_t conjat, if ( use_ref == TRUE ) { - PASTEMAC3(d,d,d,dotxaxpyf_unb_var1)( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha_cast, - a_cast, inca, lda, - w_cast, incw, - x_cast, incx, - beta_cast, - y_cast, incy, - z_cast, incz ); + BLIS_DDOTXAXPYF_KERNEL_REF( conjat, + conja, + conjw, + conjx, + m, + b_n, + alpha_cast, + a_cast, inca, lda, + w_cast, incw, + x_cast, incx, + beta_cast, + y_cast, incy, + z_cast, incz ); return; } @@ -374,66 +341,3 @@ void bli_ddddotxaxpyf_opt_var1( conj_t conjat, _mm_store_pd( ( double* )(y_cast + 1*n_elem_per_reg ), psi1v.v ); } - - -void bli_cccdotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - scomplex* restrict alpha, - scomplex* restrict a, inc_t inca, inc_t lda, - scomplex* restrict w, inc_t incw, - scomplex* restrict x, inc_t incx, - scomplex* restrict beta, - scomplex* restrict y, inc_t incy, - 
scomplex* restrict z, inc_t incz ) -{ - /* Just call the reference implementation. */ - bli_cccdotxaxpyf_unb_var1( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha, - a, inca, lda, - w, incw, - x, incx, - beta, - y, incy, - z, incz ); -} - - - -void bli_zzzdotxaxpyf_opt_var1( conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - dcomplex* restrict alpha, - dcomplex* restrict a, inc_t inca, inc_t lda, - dcomplex* restrict w, inc_t incw, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict beta, - dcomplex* restrict y, inc_t incy, - dcomplex* restrict z, inc_t incz ) -{ - /* Just call the reference implementation. */ - bli_zzzdotxaxpyf_unb_var1( conjat, - conja, - conjw, - conjx, - m, - b_n, - alpha, - a, inca, lda, - w, incw, - x, incx, - beta, - y, incy, - z, incz ); -} diff --git a/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.c b/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.c index 72398ef4c..c8dd85ee6 100644 --- a/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.c +++ b/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.c @@ -35,32 +35,6 @@ #include "blis.h" -void bli_sssdotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict x, inc_t incx, - float* restrict beta, - float* restrict y, inc_t incy - ) -{ - /* Just call the reference implementation. 
*/ - bli_sssdotxf_unb_var1( conjat, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - beta, - y, incy ); -} - - - #include "pmmintrin.h" typedef union { @@ -69,17 +43,17 @@ typedef union } v2df_t; -void bli_ddddotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict x, inc_t incx, - double* restrict beta, - double* restrict y, inc_t incy - ) +void bli_ddotxf_opt_var1( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict beta, + double* restrict y, inc_t incy + ) { double* restrict alpha_cast = alpha; double* restrict beta_cast = beta; @@ -151,15 +125,15 @@ void bli_ddddotxf_opt_var1( // Call the reference implementation if needed. if ( use_ref == TRUE ) { - PASTEMAC3(d,d,d,dotxf_unb_var1)( conjat, - conjx, - m, - b_n, - alpha_cast, - a_cast, inca, lda, - x_cast, incx, - beta_cast, - y_cast, incy ); + BLIS_DDOTXF_KERNEL_REF( conjat, + conjx, + m, + b_n, + alpha_cast, + a_cast, inca, lda, + x_cast, incx, + beta_cast, + y_cast, incy ); return; } @@ -318,57 +292,3 @@ void bli_ddddotxf_opt_var1( } - - -void bli_cccdotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - scomplex* restrict alpha, - scomplex* restrict a, inc_t inca, inc_t lda, - scomplex* restrict x, inc_t incx, - scomplex* restrict beta, - scomplex* restrict y, inc_t incy - ) -{ - /* Just call the reference implementation. */ - bli_cccdotxf_unb_var1( conjat, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - beta, - y, incy ); -} - - - -void bli_zzzdotxf_opt_var1( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - dcomplex* restrict alpha, - dcomplex* restrict a, inc_t inca, inc_t lda, - dcomplex* restrict x, inc_t incx, - dcomplex* restrict beta, - dcomplex* restrict y, inc_t incy - ) -{ - /* Just call the reference implementation. 
*/ - bli_zzzdotxf_unb_var1( conjat, - conjx, - m, - b_n, - alpha, - a, inca, lda, - x, incx, - beta, - y, incy ); -} - - - diff --git a/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.h b/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.h deleted file mode 100644 index 4e0445571..000000000 --- a/kernels/x86_64/core2-sse3/1f/bli_dotxf_opt_var1.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - - -// -// Prototype dotxf kernel interfaces. -// -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname ) \ -\ -void PASTEMAC3(cha,chx,chy,varname)( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* restrict alpha, \ - ctype_a* restrict a, inc_t inca, inc_t lda, \ - ctype_x* restrict x, inc_t incx, \ - ctype_y* restrict beta, \ - ctype_y* restrict y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( dotxf_opt_var1 ) - -#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT -INSERT_GENTPROT3U12_MIX_D( dotxf_opt_var1 ) -#endif - -#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT -INSERT_GENTPROT3U12_MIX_P( dotxf_opt_var1 ) -#endif - diff --git a/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c index bbf6d200a..c8c05958f 100644 --- a/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c +++ b/kernels/x86_64/core2-sse3/3/bli_gemm_opt_d4x4.c @@ -34,17 +34,17 @@ #include "blis.h" -void bli_sgemm_opt_d4x4( - dim_t k, - float* restrict alpha, - float* restrict a, - float* restrict b, - float* restrict beta, - float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_sgemm_opt_8x4( + dim_t k, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { - void* a_next = bli_auxinfo_next_a( data ); + //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); dim_t k_iter = k / 4; @@ -715,15 +715,15 @@ void bli_sgemm_opt_d4x4( ); } -void bli_dgemm_opt_d4x4( - dim_t k, - double* restrict alpha, - double* restrict a, - double* restrict b, - double* restrict beta, - double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_dgemm_opt_4x4( + dim_t k, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { 
void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); @@ -738,6 +738,7 @@ void bli_dgemm_opt_d4x4( "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. "movq %9, %%r9 \n\t" // load address of b_next. + "movq %10, %%r11 \n\t" // load address of a_next. " \n\t" "subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte "subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations. @@ -758,16 +759,16 @@ void bli_dgemm_opt_d4x4( "xorpd %%xmm5, %%xmm5 \n\t" "xorpd %%xmm6, %%xmm6 \n\t" " \n\t" - "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c + "prefetcht2 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c "xorpd %%xmm8, %%xmm8 \n\t" "movaps %%xmm8, %%xmm9 \n\t" - "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c + "prefetcht2 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c "movaps %%xmm8, %%xmm10 \n\t" "movaps %%xmm8, %%xmm11 \n\t" - "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c + "prefetcht2 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c "movaps %%xmm8, %%xmm12 \n\t" "movaps %%xmm8, %%xmm13 \n\t" - "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c + "prefetcht2 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c "movaps %%xmm8, %%xmm14 \n\t" "movaps %%xmm8, %%xmm15 \n\t" " \n\t" @@ -782,6 +783,9 @@ void bli_dgemm_opt_d4x4( ".DLOOPKITER: \n\t" // MAIN LOOP " \n\t" "prefetcht0 (4*35+1) * 8(%%rax) \n\t" + //"prefetcht0 (8*97+4) * 8(%%rax) \n\t" + " \n\t" + //"prefetcht0 67*4 * 8(%%r11) \n\t" // prefetch a_next[0] " \n\t" "addpd %%xmm3, %%xmm11 \n\t" // iteration 0 "movaps -7 * 16(%%rbx), %%xmm3 \n\t" @@ -814,6 +818,7 @@ void bli_dgemm_opt_d4x4( "movaps -5 * 16(%%rax), %%xmm1 \n\t" " \n\t" " \n\t" + " \n\t" "addpd %%xmm3, %%xmm11 \n\t" // iteration 1 "movaps -5 * 16(%%rbx), %%xmm3 \n\t" "addpd %%xmm4, %%xmm15 \n\t" @@ -844,7 +849,14 @@ void bli_dgemm_opt_d4x4( "mulpd %%xmm1, %%xmm6 \n\t" "movaps -3 * 16(%%rax), %%xmm1 \n\t" " \n\t" + " \n\t" "prefetcht0 (4*37+1) * 8(%%rax) \n\t" + 
//"prefetcht0 (8*97+12)* 8(%%rax) \n\t" + " \n\t" + //"prefetcht0 69*4 * 8(%%r11) \n\t" // prefetch a_next[8] + //"subq $-4 * 4 * 8, %%r11 \n\t" // a_next += 4*4 (unroll x mr) + " \n\t" + " \n\t" " \n\t" "addpd %%xmm3, %%xmm11 \n\t" // iteration 2 "movaps -3 * 16(%%rbx), %%xmm3 \n\t" @@ -868,6 +880,7 @@ void bli_dgemm_opt_d4x4( "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm1, %%xmm4 \n\t" " \n\t" + " \n\t" "addpd %%xmm7, %%xmm8 \n\t" "addpd %%xmm6, %%xmm12 \n\t" "movaps %%xmm5, %%xmm6 \n\t" @@ -877,6 +890,7 @@ void bli_dgemm_opt_d4x4( "movaps -1 * 16(%%rax), %%xmm1 \n\t" " \n\t" " \n\t" + " \n\t" "addpd %%xmm3, %%xmm11 \n\t" // iteration 3 "movaps -1 * 16(%%rbx), %%xmm3 \n\t" "addpd %%xmm4, %%xmm15 \n\t" @@ -916,12 +930,15 @@ void bli_dgemm_opt_d4x4( "prefetcht2 0 * 8(%%r9) \n\t" // prefetch b_next[0] "prefetcht2 8 * 8(%%r9) \n\t" // prefetch b_next[8] " \n\t" - " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" + //"prefetcht2 -8 * 8(%%r9) \n\t" // prefetch b_next[-8] + " \n\t" + " \n\t" + " \n\t" ".DCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; @@ -1310,18 +1327,19 @@ void bli_dgemm_opt_d4x4( : // output operands (none) : // input operands - "m" (k_iter), - "m" (k_left), - "m" (a), - "m" (b), - "m" (alpha), - "m" (beta), - "m" (c), - "m" (rs_c), - "m" (cs_c), - "m" (b_next) + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c), // 8 + "m" (b_next), // 9 + "m" (a_next) // 10 : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", @@ -1330,18 +1348,18 @@ void bli_dgemm_opt_d4x4( ); } -void bli_cgemm_opt_d4x4( - dim_t k, - scomplex* restrict alpha, - scomplex* restrict a, - scomplex* 
restrict b, - scomplex* restrict beta, - scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_cgemm_opt_4x2( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, @@ -1350,18 +1368,20 @@ void bli_cgemm_opt_d4x4( data ); } -void bli_zgemm_opt_d4x4( - dim_t k, - dcomplex* restrict alpha, - dcomplex* restrict a, - dcomplex* restrict b, - dcomplex* restrict beta, - dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) + + +void bli_zgemm_opt_2x2( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, diff --git a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.c index d0a6c361e..5f65fcde5 100644 --- a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.c +++ b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_l_opt_d4x4.c @@ -34,19 +34,19 @@ #include "blis.h" -void bli_sgemmtrsm_l_opt_d4x4( - dim_t k, - float* restrict alpha, - float* restrict a10, - float* restrict a11, - float* restrict b01, - float* restrict b11, - float* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_sgemmtrsm_l_opt_8x4( + dim_t k, + float* restrict alpha, + float* restrict a10, + float* restrict a11, + float* restrict b01, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. 
*/ - bli_sgemmtrsm_l_ref_mxn( k, + BLIS_SGEMMTRSM_L_UKERNEL_REF( k, alpha, a10, a11, @@ -56,16 +56,16 @@ void bli_sgemmtrsm_l_opt_d4x4( data ); } -void bli_dgemmtrsm_l_opt_d4x4( - dim_t k, - double* restrict alpha, - double* restrict a10, - double* restrict a11, - double* restrict b01, - double* restrict b11, - double* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_dgemmtrsm_l_opt_4x4( + dim_t k, + double* restrict alpha, + double* restrict a10, + double* restrict a11, + double* restrict b01, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { void* b_next = bli_auxinfo_next_b( data ); @@ -539,19 +539,19 @@ void bli_dgemmtrsm_l_opt_d4x4( } -void bli_cgemmtrsm_l_opt_d4x4( - dim_t k, - scomplex* restrict alpha, - scomplex* restrict a10, - scomplex* restrict a11, - scomplex* restrict b01, - scomplex* restrict b11, - scomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_cgemmtrsm_l_opt_4x2( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a10, + scomplex* restrict a11, + scomplex* restrict b01, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_cgemmtrsm_l_ref_mxn( k, + BLIS_CGEMMTRSM_L_UKERNEL_REF( k, alpha, a10, a11, @@ -561,19 +561,19 @@ void bli_cgemmtrsm_l_opt_d4x4( data ); } -void bli_zgemmtrsm_l_opt_d4x4( - dim_t k, - dcomplex* restrict alpha, - dcomplex* restrict a10, - dcomplex* restrict a11, - dcomplex* restrict b01, - dcomplex* restrict b11, - dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_zgemmtrsm_l_opt_2x2( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a10, + dcomplex* restrict a11, + dcomplex* restrict b01, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. 
*/ - bli_zgemmtrsm_l_ref_mxn( k, + BLIS_ZGEMMTRSM_L_UKERNEL_REF( k, alpha, a10, a11, diff --git a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.c index 322fc94da..5233b0321 100644 --- a/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.c +++ b/kernels/x86_64/core2-sse3/3/bli_gemmtrsm_u_opt_d4x4.c @@ -34,19 +34,19 @@ #include "blis.h" -void bli_sgemmtrsm_u_opt_d4x4( - dim_t k, - float* restrict alpha, - float* restrict a12, - float* restrict a11, - float* restrict b21, - float* restrict b11, - float* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_sgemmtrsm_u_opt_8x4( + dim_t k, + float* restrict alpha, + float* restrict a12, + float* restrict a11, + float* restrict b21, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_sgemmtrsm_u_ref_mxn( k, + BLIS_SGEMMTRSM_U_UKERNEL_REF( k, alpha, a12, a11, @@ -56,16 +56,16 @@ void bli_sgemmtrsm_u_opt_d4x4( data ); } -void bli_dgemmtrsm_u_opt_d4x4( - dim_t k, - double* restrict alpha, - double* restrict a12, - double* restrict a11, - double* restrict b21, - double* restrict b11, - double* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_dgemmtrsm_u_opt_4x4( + dim_t k, + double* restrict alpha, + double* restrict a12, + double* restrict a11, + double* restrict b21, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { void* b_next = bli_auxinfo_next_b( data ); @@ -526,19 +526,19 @@ void bli_dgemmtrsm_u_opt_d4x4( } -void bli_cgemmtrsm_u_opt_d4x4( - dim_t k, - scomplex* restrict alpha, - scomplex* restrict a12, - scomplex* restrict a11, - scomplex* restrict b21, - scomplex* restrict b11, - scomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_cgemmtrsm_u_opt_4x2( + dim_t k, + scomplex* restrict alpha, + scomplex* restrict a12, + scomplex* restrict a11, + scomplex* 
restrict b21, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_cgemmtrsm_u_ref_mxn( k, + BLIS_CGEMMTRSM_U_UKERNEL_REF( k, alpha, a12, a11, @@ -548,19 +548,19 @@ void bli_cgemmtrsm_u_opt_d4x4( data ); } -void bli_zgemmtrsm_u_opt_d4x4( - dim_t k, - dcomplex* restrict alpha, - dcomplex* restrict a12, - dcomplex* restrict a11, - dcomplex* restrict b21, - dcomplex* restrict b11, - dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_zgemmtrsm_u_opt_2x2( + dim_t k, + dcomplex* restrict alpha, + dcomplex* restrict a12, + dcomplex* restrict a11, + dcomplex* restrict b21, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_zgemmtrsm_u_ref_mxn( k, + BLIS_ZGEMMTRSM_U_UKERNEL_REF( k, alpha, a12, a11, diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.c index 302a6d7f6..62fe27f57 100644 --- a/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.c +++ b/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.c @@ -34,26 +34,26 @@ #include "blis.h" -void bli_strsm_l_opt_d4x4( - float* restrict a11, - float* restrict b11, - float* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_strsm_l_opt_8x4( + float* restrict a11, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. 
*/ - bli_strsm_l_ref_mxn( a11, + BLIS_STRSM_L_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); } -void bli_dtrsm_l_opt_d4x4( - double* restrict a11, - double* restrict b11, - double* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_dtrsm_l_opt_4x4( + double* restrict a11, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { __asm__ volatile ( @@ -208,29 +208,29 @@ void bli_dtrsm_l_opt_d4x4( } -void bli_ctrsm_l_opt_d4x4( - scomplex* restrict a11, - scomplex* restrict b11, - scomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_ctrsm_l_opt_4x2( + scomplex* restrict a11, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_ctrsm_l_ref_mxn( a11, + BLIS_CTRSM_L_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); } -void bli_ztrsm_l_opt_d4x4( - dcomplex* restrict a11, - dcomplex* restrict b11, - dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_ztrsm_l_opt_2x2( + dcomplex* restrict a11, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_ztrsm_l_ref_mxn( a11, + BLIS_ZTRSM_L_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.h b/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.h deleted file mode 100644 index adc3fd60f..000000000 --- a/kernels/x86_64/core2-sse3/3/bli_trsm_l_opt_d4x4.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - ctype* restrict a11, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( trsm_l_opt_d4x4 ) - diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.c b/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.c index f22acfbb1..ca6ee92ea 100644 --- a/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.c +++ b/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.c @@ -34,26 +34,26 @@ #include "blis.h" -void bli_strsm_u_opt_d4x4( - float* restrict a11, - float* restrict b11, - float* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_strsm_u_opt_8x4( + float* restrict a11, + float* restrict b11, + float* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_strsm_u_ref_mxn( a11, + BLIS_STRSM_U_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); } -void bli_dtrsm_u_opt_d4x4( - double* restrict a11, - double* restrict b11, - double* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_dtrsm_u_opt_4x4( + double* restrict a11, + double* restrict b11, + double* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { __asm__ volatile ( @@ -211,29 +211,29 @@ void bli_dtrsm_u_opt_d4x4( } -void bli_ctrsm_u_opt_d4x4( - scomplex* restrict a11, - scomplex* restrict b11, - scomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_ctrsm_u_opt_4x2( + scomplex* restrict a11, + scomplex* restrict b11, + scomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. 
*/ - bli_ctrsm_u_ref_mxn( a11, + BLIS_CTRSM_U_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); } -void bli_ztrsm_u_opt_d4x4( - dcomplex* restrict a11, - dcomplex* restrict b11, - dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* data - ) +void bli_ztrsm_u_opt_2x2( + dcomplex* restrict a11, + dcomplex* restrict b11, + dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, + auxinfo_t* data + ) { /* Just call the reference implementation. */ - bli_ztrsm_u_ref_mxn( a11, + BLIS_ZTRSM_U_UKERNEL_REF( a11, b11, c11, rs_c, cs_c, data ); diff --git a/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.h b/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.h deleted file mode 100644 index cfe67964a..000000000 --- a/kernels/x86_64/core2-sse3/3/bli_trsm_u_opt_d4x4.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - ctype* restrict a11, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( trsm_u_opt_d4x4 ) - diff --git a/kernels/x86_64/piledriver/3/bli_gemm_4x6.c b/kernels/x86_64/piledriver/3/bli_gemm_4x6.c index 71f5ba364..5d88073fe 100644 --- a/kernels/x86_64/piledriver/3/bli_gemm_4x6.c +++ b/kernels/x86_64/piledriver/3/bli_gemm_4x6.c @@ -47,7 +47,7 @@ void bli_sgemm_4x6( ) { /* Just call the reference implementation. */ - bli_sgemm_ref_mxn( k, + BLIS_SGEMM_UKERNEL_REF( k, alpha, a, b, @@ -682,7 +682,7 @@ void bli_cgemm_4x6( ) { /* Just call the reference implementation. */ - bli_cgemm_ref_mxn( k, + BLIS_CGEMM_UKERNEL_REF( k, alpha, a, b, @@ -702,7 +702,7 @@ void bli_zgemm_4x6( ) { /* Just call the reference implementation. 
*/ - bli_zgemm_ref_mxn( k, + BLIS_ZGEMM_UKERNEL_REF( k, alpha, a, b, diff --git a/test/test_gemm.c b/test/test_gemm.c index 34a791362..ad2a63a7b 100644 --- a/test/test_gemm.c +++ b/test/test_gemm.c @@ -68,22 +68,27 @@ int main( int argc, char** argv ) m_input = -1; n_input = -1; + //k_input = 256; k_input = -1; #else p_begin = 16; p_end = 16; p_inc = 1; - m_input = 8; - k_input = 16; - n_input = 16; + m_input = 5; + k_input = 6; + n_input = 4; #endif +#if 1 dt_a = BLIS_DOUBLE; dt_b = BLIS_DOUBLE; dt_c = BLIS_DOUBLE; dt_alpha = BLIS_DOUBLE; dt_beta = BLIS_DOUBLE; +#else + dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX; +#endif for ( p = p_begin; p <= p_end; p += p_inc ) { @@ -109,8 +114,8 @@ int main( int argc, char** argv ) bli_randm( &c ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( -(1.0/1.0), 0.0, &beta ); + bli_setsc( (0.9/1.0), 0.2, &alpha ); + bli_setsc( -(1.1/1.0), 0.3, &beta ); bli_copym( &c, &c_save ); @@ -135,13 +140,15 @@ int main( int argc, char** argv ) //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); bli_gemm( &alpha, + //bli_gemm4m( &alpha, &a, &b, &beta, &c ); #else - + if ( bli_is_real( dt_a ) ) + { f77_char transa = 'N'; f77_char transb = 'N'; f77_int mm = bli_obj_length( c ); @@ -166,6 +173,35 @@ int main( int argc, char** argv ) bp, &ldb, betap, cp, &ldc ); + } + else + { + f77_char transa = 'N'; + f77_char transb = 'N'; + f77_int mm = bli_obj_length( c ); + f77_int kk = bli_obj_width_after_trans( a ); + f77_int nn = bli_obj_width( c ); + f77_int lda = bli_obj_col_stride( a ); + f77_int ldb = bli_obj_col_stride( b ); + f77_int ldc = bli_obj_col_stride( c ); + dcomplex* alphap = bli_obj_buffer( alpha ); + dcomplex* ap = bli_obj_buffer( a ); + dcomplex* bp = bli_obj_buffer( b ); + dcomplex* betap = bli_obj_buffer( beta ); + dcomplex* cp = bli_obj_buffer( c ); + + zgemm_( &transa, + //zgemm3m_( &transa, + &transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } #endif #ifdef PRINT @@ 
-179,6 +215,8 @@ int main( int argc, char** argv ) gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); + if ( bli_is_complex( dt_a ) ) gflops *= 4.0; + #ifdef BLIS printf( "data_gemm_blis" ); #else diff --git a/test/test_hemm.c b/test/test_hemm.c index 98df6f683..c2d96dbc3 100644 --- a/test/test_hemm.c +++ b/test/test_hemm.c @@ -75,15 +75,19 @@ int main( int argc, char** argv ) p_end = 16; p_inc = 1; - m_input = 12; - n_input = 12; + m_input = 4; + n_input = 4; #endif +#if 1 dt_a = BLIS_DOUBLE; dt_b = BLIS_DOUBLE; dt_c = BLIS_DOUBLE; dt_alpha = BLIS_DOUBLE; dt_beta = BLIS_DOUBLE; +#else + dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX; +#endif side = BLIS_LEFT; //side = BLIS_RIGHT; @@ -124,9 +128,19 @@ int main( int argc, char** argv ) bli_randm( &a ); bli_mkherm( &a ); bli_mktrim( &a ); +/* + bli_obj_toggle_uplo( a ); + bli_obj_inc_diag_off( 1, a ); + bli_setm( &BLIS_ZERO, &a ); + bli_obj_inc_diag_off( -1, a ); + bli_obj_toggle_uplo( a ); + bli_obj_set_diag( BLIS_NONUNIT_DIAG, a ); + bli_scalm( &BLIS_TWO, &a ); + bli_scalm( &BLIS_TWO, &a ); +*/ - bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( (2.0/1.0), 1.0, &alpha ); bli_setsc( -(1.0/1.0), 0.0, &beta ); @@ -142,6 +156,16 @@ int main( int argc, char** argv ) dtime = bli_clock(); #ifdef PRINT +/* + obj_t ar, ai; + bli_obj_alias_to( a, ar ); + bli_obj_alias_to( a, ai ); + bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2; + bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1; + bli_printm( "ar", &ar, "%4.1f", "" ); + bli_printm( "ai", &ai, "%4.1f", "" ); +*/ + bli_printm( "a", &a, "%4.1f", "" ); bli_printm( "b", &b, "%4.1f", "" ); bli_printm( "c", &c, "%4.1f", "" ); @@ -152,6 +176,7 @@ int main( int argc, char** argv ) //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); bli_hemm( side, + //bli_hemm4m( side, &alpha, &a, &b, @@ -184,7 +209,7 @@ int main( int argc, char** argv ) #endif #ifdef PRINT - bli_printm( "c after", &c, "%4.1f", 
"" ); + bli_printm( "c after", &c, "%9.5f", "" ); exit(1); #endif @@ -196,6 +221,8 @@ int main( int argc, char** argv ) else gflops = ( 2.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + if ( bli_is_complex( dt_a ) ) gflops *= 4.0; + #ifdef BLIS printf( "data_hemm_blis" ); #else diff --git a/test/test_trmm.c b/test/test_trmm.c index fbfc524b9..d43e8ba38 100644 --- a/test/test_trmm.c +++ b/test/test_trmm.c @@ -75,20 +75,25 @@ int main( int argc, char** argv ) p_end = 16; p_inc = 1; - m_input = 8; - n_input = 4; + m_input = 4; + n_input = 6; #endif +#if 1 dt_a = BLIS_DOUBLE; dt_b = BLIS_DOUBLE; dt_c = BLIS_DOUBLE; dt_alpha = BLIS_DOUBLE; dt_beta = BLIS_DOUBLE; +#else + dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_DCOMPLEX; +#endif side = BLIS_LEFT; //side = BLIS_RIGHT; uplo = BLIS_LOWER; + //uplo = BLIS_UPPER; for ( p = p_begin; p <= p_end; p += p_inc ) { @@ -117,6 +122,18 @@ int main( int argc, char** argv ) bli_randm( &c ); bli_randm( &b ); +/* + bli_obj_toggle_uplo( a ); + bli_obj_inc_diag_off( -1, a ); + bli_setm( &BLIS_ZERO, &a ); + bli_obj_inc_diag_off( 1, a ); + bli_obj_toggle_uplo( a ); + bli_obj_set_diag( BLIS_NONUNIT_DIAG, a ); + bli_scalm( &BLIS_TWO, &a ); + //bli_scalm( &BLIS_TWO, &a ); +*/ + + bli_setsc( (2.0/1.0), 0.0, &alpha ); bli_setsc( -(1.0/1.0), 0.0, &beta ); @@ -134,14 +151,25 @@ int main( int argc, char** argv ) #ifdef PRINT - bli_printm( "a", &a, "%11.8f", "" ); - bli_printm( "c", &c, "%14.11f", "" ); + +/* + obj_t ar, ai; + bli_obj_alias_to( a, ar ); + bli_obj_alias_to( a, ai ); + bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2; + bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1; + bli_printm( "ar", &ar, "%4.1f", "" ); + bli_printm( "ai", &ai, "%4.1f", "" ); +*/ + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); #endif #ifdef BLIS - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); 
bli_trmm( side, + //bli_trmm4m( side, &alpha, &a, &c ); @@ -172,7 +200,7 @@ int main( int argc, char** argv ) #endif #ifdef PRINT - bli_printm( "c after", &c, "%14.11f", "" ); + bli_printm( "c after", &c, "%4.1f", "" ); exit(1); #endif @@ -186,6 +214,8 @@ int main( int argc, char** argv ) else gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + if ( bli_is_complex( dt_a ) ) gflops *= 4.0; + #ifdef BLIS printf( "data_trmm_blis" ); #else diff --git a/test/test_trsm.c b/test/test_trsm.c index 380165c7e..c0c9cd73c 100644 --- a/test/test_trsm.c +++ b/test/test_trsm.c @@ -64,8 +64,8 @@ int main( int argc, char** argv ) n_repeats = 3; #ifndef PRINT - p_begin = 40; - p_end = 2000; + p_begin = 1000; + p_end = 1000; p_inc = 40; m_input = -1; @@ -75,20 +75,26 @@ int main( int argc, char** argv ) p_end = 16; p_inc = 1; - m_input = 7 ; - n_input = 7 ; + m_input = 8 ; + n_input = 4 ; #endif +#if 0 dt_a = BLIS_DOUBLE; dt_b = BLIS_DOUBLE; dt_c = BLIS_DOUBLE; dt_alpha = BLIS_DOUBLE; dt_beta = BLIS_DOUBLE; +#else + dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_FLOAT; + //dt_a = dt_b = dt_c = dt_alpha = dt_beta = BLIS_SCOMPLEX; +#endif side = BLIS_LEFT; //side = BLIS_RIGHT; uplo = BLIS_LOWER; + //uplo = BLIS_UPPER; for ( p = p_begin; p <= p_end; p += p_inc ) { @@ -112,11 +118,27 @@ int main( int argc, char** argv ) bli_obj_set_struc( BLIS_TRIANGULAR, a ); bli_obj_set_uplo( uplo, a ); + //bli_obj_set_diag( BLIS_UNIT_DIAG, a ); bli_randm( &a ); bli_randm( &c ); bli_randm( &b ); +/* + { + obj_t a2; + + bli_obj_alias_to( a, a2 ); + bli_obj_toggle_uplo( a2 ); + bli_obj_inc_diag_off( 1, a2 ); + bli_setm( &BLIS_ZERO, &a2 ); + bli_obj_inc_diag_off( -2, a2 ); + bli_obj_toggle_uplo( a2 ); + bli_obj_set_diag( BLIS_NONUNIT_DIAG, a2 ); + bli_scalm( &BLIS_TWO, &a2 ); + //bli_scalm( &BLIS_TWO, &a ); + } +*/ bli_setsc( (2.0/1.0), 0.0, &alpha ); bli_setsc( -(1.0/1.0), 0.0, &beta ); @@ -134,6 +156,17 @@ int main( int argc, char** argv ) #ifdef PRINT +/* + obj_t ar, ai; + bli_obj_alias_to( a, ar 
); + bli_obj_alias_to( a, ai ); + bli_obj_set_datatype( BLIS_DOUBLE, ar ); ar.rs *= 2; ar.cs *= 2; + bli_obj_set_datatype( BLIS_DOUBLE, ai ); ai.rs *= 2; ai.cs *= 2; ai.buffer = ( double* )ai.buffer + 1; + + bli_printm( "ar", &ar, "%4.1f", "" ); + bli_printm( "ai", &ai, "%4.1f", "" ); +*/ + bli_invertd( &a ); bli_printm( "a", &a, "%4.1f", "" ); bli_invertd( &a ); @@ -144,11 +177,15 @@ int main( int argc, char** argv ) //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); bli_trsm( side, + //bli_trsm4m( side, + //bli_trsm3m( side, &alpha, &a, &c ); #else + if ( bli_is_real( dt_a ) ) + { f77_char side = 'L'; f77_char uplo = 'L'; f77_char transa = 'N'; @@ -157,11 +194,11 @@ int main( int argc, char** argv ) f77_int nn = bli_obj_width( c ); f77_int lda = bli_obj_col_stride( a ); f77_int ldc = bli_obj_col_stride( c ); - double* alphap = bli_obj_buffer( alpha ); - double* ap = bli_obj_buffer( a ); - double* cp = bli_obj_buffer( c ); + float * alphap = bli_obj_buffer( alpha ); + float * ap = bli_obj_buffer( a ); + float * cp = bli_obj_buffer( c ); - dtrsm_( &side, + strsm_( &side, &uplo, &transa, &diag, @@ -170,6 +207,33 @@ int main( int argc, char** argv ) alphap, ap, &lda, cp, &ldc ); + } + else // if ( bli_is_complex( dt_a ) ) + { + f77_char side = 'L'; + f77_char uplo = 'L'; + f77_char transa = 'N'; + f77_char diag = 'N'; + f77_int mm = bli_obj_length( c ); + f77_int nn = bli_obj_width( c ); + f77_int lda = bli_obj_col_stride( a ); + f77_int ldc = bli_obj_col_stride( c ); + scomplex* alphap = bli_obj_buffer( alpha ); + scomplex* ap = bli_obj_buffer( a ); + scomplex* cp = bli_obj_buffer( c ); + + ctrsm_( &side, + //ztrsm_( &side, + &uplo, + &transa, + &diag, + &mm, + &nn, + alphap, + ap, &lda, + cp, &ldc ); + } + #endif #ifdef PRINT @@ -186,6 +250,8 @@ int main( int argc, char** argv ) else gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 ); + if ( bli_is_complex( dt_a ) ) gflops *= 4.0; + #ifdef BLIS printf( "data_trsm_blis" ); #else diff --git 
a/testsuite/input.operations b/testsuite/input.operations index 6a508c814..094287382 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -290,37 +290,37 @@ 1 # gemm 1 # test sequential front-end --1 -1 -2 # dimensions: m n k +-1 -1 -1 # dimensions: m n k ?? # parameters: transa transb 1 # hemm 1 # test sequential front-end --1 -2 # dimensions: m n +-1 -1 # dimensions: m n ???? # parameters: side uploa conja transb 1 # herk 1 # test sequential front-end --1 -2 # dimensions: m k +-1 -1 # dimensions: m k ?? # parameters: uploc transa 1 # her2k 1 # test sequential front-end --1 -2 # dimensions: m k +-1 -1 # dimensions: m k ??? # parameters: uploc transa transb 1 # symm 1 # test sequential front-end --1 -2 # dimensions: m n +-1 -1 # dimensions: m n ???? # parameters: side uploa conja transb 1 # syrk 1 # test sequential front-end --1 -2 # dimensions: m k +-1 -1 # dimensions: m k ?? # parameters: uploc transa 1 # syr2k 1 # test sequential front-end --1 -2 # dimensions: m k +-1 -1 # dimensions: m k ??? # parameters: uploc transa transb 1 # trmm diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index e8340940e..255593ec7 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -221,7 +221,7 @@ void libblis_test_axpy2v_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_axpy2v_ker( alpha1, alpha2, x, y, z ); + bli_axpy2v_kernel( alpha1, alpha2, x, y, z ); break; default: @@ -293,93 +293,3 @@ void libblis_test_axpy2v_check( obj_t* alpha1, bli_obj_free( &z_temp ); } - - - -// -// Define object-wrapper to AXPY2V_KERNEL kernels. 
-// -#define FUNCPTR_T axpy2v_ker_fp - -typedef void (*FUNCPTR_T)( - conj_t conjx, - conj_t conjy, - dim_t n, - void* alpha1, - void* alpha2, - void* x, inc_t incx, - void* y, inc_t incy, - void* z, inc_t incz - ); - -static FUNCPTR_T GENARRAY(ftypes,axpy2v_ker); - -void bli_axpy2v_ker( obj_t* alpha1, - obj_t* alpha2, - obj_t* x, - obj_t* y, - obj_t* z ) -{ - num_t dt = bli_obj_datatype( *z ); - - conj_t conjx = bli_obj_conj_status( *x ); - conj_t conjy = bli_obj_conj_status( *y ); - dim_t n = bli_obj_vector_dim( *x ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - inc_t inc_z = bli_obj_vector_inc( *z ); - void* buf_z = bli_obj_buffer_at_off( *z ); - - void* buf_alpha1 = bli_obj_buffer_for_1x1( dt, *alpha1 ); - - void* buf_alpha2 = bli_obj_buffer_for_1x1( dt, *alpha2 ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt]; - - // Invoke the function. - f( conjx, - conjy, - n, - buf_alpha1, - buf_alpha2, - buf_x, inc_x, - buf_y, inc_y, - buf_z, inc_z ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha1, \ - void* alpha2, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ) \ -{ \ - PASTEMAC3(ch,ch,ch,kername)( conjx, \ - conjy, \ - n, \ - alpha1, \ - alpha2, \ - x, incx, \ - y, incy, \ - z, incz ); \ -} - -INSERT_GENTFUNC_BASIC( axpy2v_ker, AXPY2V_KERNEL ) - diff --git a/testsuite/src/test_axpy2v.h b/testsuite/src/test_axpy2v.h index 8045a70f0..0e01a597b 100644 --- a/testsuite/src/test_axpy2v.h +++ b/testsuite/src/test_axpy2v.h @@ -34,29 +34,3 @@ void libblis_test_axpy2v( test_params_t* params, test_op_t* op ); - -// -// Prototype wrapper interfaces to kernel. 
-// -void bli_axpy2v_ker( obj_t* alpha1, - obj_t* alpha2, - obj_t* x, - obj_t* y, - obj_t* z ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname)( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - void* alpha1, \ - void* alpha2, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ); - -INSERT_GENTPROT_BASIC( axpy2v_ker ) - diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 2336145ac..dd9948682 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -223,7 +223,7 @@ void libblis_test_axpyf_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_axpyf_ker( alpha, a, x, y ); + bli_axpyf_kernel( alpha, a, x, y ); break; default: @@ -300,93 +300,3 @@ void libblis_test_axpyf_check( obj_t* alpha, bli_obj_free( &v ); } - - - -// -// Define object-wrapper to AXPYF_KERNEL kernels. -// -#define FUNCPTR_T axpyf_ker_fp - -typedef void (*FUNCPTR_T)( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - void* alpha, - void* a, inc_t inca, inc_t lda, - void* x, inc_t incx, - void* y, inc_t incy - ); - -static FUNCPTR_T GENARRAY(ftypes,axpyf_ker); - -void bli_axpyf_ker( obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* y ) -{ - num_t dt = bli_obj_datatype( *a ); - - conj_t conja = bli_obj_conj_status( *a ); - conj_t conjx = bli_obj_conj_status( *x ); - - dim_t m = bli_obj_vector_dim( *y ); - dim_t b_n = bli_obj_vector_dim( *x ); - - void* buf_a = bli_obj_buffer_at_off( *a ); - inc_t rs_a = bli_obj_row_stride( *a ); - inc_t cs_a = bli_obj_col_stride( *a ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt]; - - // Invoke the function. 
- f( conja, - conjx, - m, - b_n, - buf_alpha, - buf_a, rs_a, cs_a, - buf_x, inc_x, - buf_y, inc_y ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ) \ -{ \ - PASTEMAC3(ch,ch,ch,kername)( conja, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy ); \ -} - -INSERT_GENTFUNC_BASIC( axpyf_ker, AXPYF_KERNEL ) - diff --git a/testsuite/src/test_axpyf.h b/testsuite/src/test_axpyf.h index 6577edb77..4c74caa17 100644 --- a/testsuite/src/test_axpyf.h +++ b/testsuite/src/test_axpyf.h @@ -34,28 +34,3 @@ void libblis_test_axpyf( test_params_t* params, test_op_t* op ); - -// -// Prototype wrapper interfaces to kernel. -// -void bli_axpyf_ker( obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* y ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname)( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ - ); - -INSERT_GENTPROT_BASIC( axpyf_ker ) - diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index 7df0ff4b0..e25bd1b6a 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -242,7 +242,7 @@ void libblis_test_dotaxpyv_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_dotaxpyv_ker( alpha, xt, x, y, rho, z ); + bli_dotaxpyv_kernel( alpha, xt, x, y, rho, z ); break; default: @@ -324,99 +324,3 @@ void libblis_test_dotaxpyv_check( obj_t* alpha, bli_obj_free( &z_temp ); } - - - -// -// Define object-wrapper to DOTAXPYV_KERNEL kernels. 
-// -#define FUNCPTR_T dotaxpyv_ker_fp - -typedef void (*FUNCPTR_T)( - conj_t conjxt, - conj_t conjx, - conj_t conjy, - dim_t n, - void* alpha, - void* x, inc_t incx, - void* y, inc_t incy, - void* rho, - void* z, inc_t incz - ); - -static FUNCPTR_T GENARRAY(ftypes,dotaxpyv_ker); - -void bli_dotaxpyv_ker( obj_t* alpha, - obj_t* xt, - obj_t* x, - obj_t* y, - obj_t* rho, - obj_t* z ) -{ - num_t dt = bli_obj_datatype( *z ); - - conj_t conjxt = bli_obj_conj_status( *xt ); - conj_t conjx = bli_obj_conj_status( *x ); - conj_t conjy = bli_obj_conj_status( *y ); - dim_t n = bli_obj_vector_dim( *x ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - inc_t inc_z = bli_obj_vector_inc( *z ); - void* buf_z = bli_obj_buffer_at_off( *z ); - - void* buf_rho = bli_obj_buffer_at_off( *rho ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt]; - - // Invoke the function. 
- f( conjxt, - conjx, - conjy, - n, - buf_alpha, - buf_x, inc_x, - buf_y, inc_y, - buf_rho, - buf_z, inc_z ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho, \ - void* z, inc_t incz \ - ) \ -{ \ - PASTEMAC3(ch,ch,ch,kername)( conjxt, \ - conjx, \ - conjy, \ - m, \ - alpha, \ - x, incx, \ - y, incy, \ - rho, \ - z, incz ); \ -} - -INSERT_GENTFUNC_BASIC( dotaxpyv_ker, DOTAXPYV_KERNEL ) - diff --git a/testsuite/src/test_dotaxpyv.h b/testsuite/src/test_dotaxpyv.h index 53039a9bc..0b039d7d7 100644 --- a/testsuite/src/test_dotaxpyv.h +++ b/testsuite/src/test_dotaxpyv.h @@ -34,31 +34,3 @@ void libblis_test_dotaxpyv( test_params_t* params, test_op_t* op ); - -// -// Prototype wrapper interfaces to kernel. -// -void bli_dotaxpyv_ker( obj_t* alpha, - obj_t* xt, - obj_t* x, - obj_t* y, - obj_t* rho, - obj_t* z ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname)( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - void* alpha, \ - void* x, inc_t incx, \ - void* y, inc_t incy, \ - void* rho, \ - void* z, inc_t incz \ - ); - -INSERT_GENTPROT_BASIC( dotaxpyv_ker ) - diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index 6b72e65d1..ff1ac239a 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -260,7 +260,7 @@ void libblis_test_dotxaxpyf_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_dotxaxpyf_ker( alpha, at, a, w, x, beta, y, z ); + bli_dotxaxpyf_kernel( alpha, at, a, w, x, beta, y, z ); break; default: @@ -372,128 +372,3 @@ void libblis_test_dotxaxpyf_check( obj_t* alpha, bli_obj_free( &q ); } - - - -// -// Define object-wrapper to DOTXAXPYF_KERNEL kernels. 
-// -#define FUNCPTR_T dotxaxpyf_ker_fp - -typedef void (*FUNCPTR_T)( - conj_t conjat, - conj_t conja, - conj_t conjw, - conj_t conjx, - dim_t m, - dim_t b_n, - void* alpha, - void* a, inc_t inca, inc_t lda, - void* w, inc_t incw, - void* x, inc_t incx, - void* beta, - void* y, inc_t incy, - void* z, inc_t incz - ); - -static FUNCPTR_T GENARRAY(ftypes,dotxaxpyf_ker); - -void bli_dotxaxpyf_ker( obj_t* alpha, - obj_t* at, - obj_t* a, - obj_t* w, - obj_t* x, - obj_t* beta, - obj_t* y, - obj_t* z ) -{ - num_t dt = bli_obj_datatype( *z ); - - conj_t conjat = bli_obj_conj_status( *at ); - conj_t conja = bli_obj_conj_status( *a ); - conj_t conjw = bli_obj_conj_status( *w ); - conj_t conjx = bli_obj_conj_status( *x ); - - dim_t m = bli_obj_vector_dim( *z ); - dim_t b_n = bli_obj_vector_dim( *y ); - - void* buf_a = bli_obj_buffer_at_off( *a ); - inc_t rs_a = bli_obj_row_stride( *a ); - inc_t cs_a = bli_obj_col_stride( *a ); - - inc_t inc_w = bli_obj_vector_inc( *w ); - void* buf_w = bli_obj_buffer_at_off( *w ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - inc_t inc_z = bli_obj_vector_inc( *z ); - void* buf_z = bli_obj_buffer_at_off( *z ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha );; - - void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta );; - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt]; - - // Invoke the function. 
- f( conjat, - conja, - conjw, - conjx, - m, - b_n, - buf_alpha, - buf_a, rs_a, cs_a, - buf_w, inc_w, - buf_x, inc_x, - buf_beta, - buf_y, inc_y, - buf_z, inc_z ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* w, inc_t incw, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ) \ -{ \ - PASTEMAC3(ch,ch,ch,kername)( conjat, \ - conja, \ - conjw, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - w, incw, \ - x, incx, \ - beta, \ - y, incy, \ - z, incz ); \ -} - -INSERT_GENTFUNC_BASIC( dotxaxpyf_ker, DOTXAXPYF_KERNEL ) - - diff --git a/testsuite/src/test_dotxaxpyf.h b/testsuite/src/test_dotxaxpyf.h index adc7cb8e2..e5726d27d 100644 --- a/testsuite/src/test_dotxaxpyf.h +++ b/testsuite/src/test_dotxaxpyf.h @@ -34,37 +34,3 @@ void libblis_test_dotxaxpyf( test_params_t* params, test_op_t* op ); - -// -// Prototype wrapper interfaces to kernel. 
-// -void bli_dotxaxpyf_ker( obj_t* alpha, - obj_t* at, - obj_t* a, - obj_t* w, - obj_t* x, - obj_t* beta, - obj_t* y, - obj_t* z ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname)( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* w, inc_t incw, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy, \ - void* z, inc_t incz \ - ); - -INSERT_GENTPROT_BASIC( dotxaxpyf_ker ) - diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 966febd9c..7c545e86f 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -229,7 +229,7 @@ void libblis_test_dotxf_impl( iface_t iface, switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: - bli_dotxf_ker( alpha, a, x, beta, y ); + bli_dotxf_kernel( alpha, a, x, beta, y ); break; default: @@ -301,100 +301,3 @@ void libblis_test_dotxf_check( obj_t* alpha, bli_obj_free( &v ); } - - - -// -// Define object-wrapper to DOTXF_KERNEL kernels. 
-// -#define FUNCPTR_T dotxf_ker_fp - -typedef void (*FUNCPTR_T)( - conj_t conjat, - conj_t conjx, - dim_t m, - dim_t b_n, - void* alpha, - void* a, inc_t inca, inc_t lda, - void* x, inc_t incx, - void* beta, - void* y, inc_t incy - ); - -static FUNCPTR_T GENARRAY(ftypes,dotxf_ker); - -void bli_dotxf_ker( obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y ) -{ - num_t dt = bli_obj_datatype( *y ); - - conj_t conjat = bli_obj_conj_status( *a ); - conj_t conjx = bli_obj_conj_status( *x ); - - dim_t m = bli_obj_vector_dim( *x ); - dim_t b_n = bli_obj_vector_dim( *y ); - - void* buf_a = bli_obj_buffer_at_off( *a ); - inc_t rs_a = bli_obj_row_stride( *a ); - inc_t cs_a = bli_obj_col_stride( *a ); - - inc_t inc_x = bli_obj_vector_inc( *x ); - void* buf_x = bli_obj_buffer_at_off( *x ); - - inc_t inc_y = bli_obj_vector_inc( *y ); - void* buf_y = bli_obj_buffer_at_off( *y ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); - - void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta ); - - FUNCPTR_T f; - - // Index into the type combination array to extract the correct - // function pointer. - f = ftypes[dt]; - - // Invoke the function. 
- f( conjat, - conjx, - m, - b_n, - buf_alpha, - buf_a, rs_a, cs_a, - buf_x, inc_x, - buf_beta, - buf_y, inc_y ); -} - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ -\ -void PASTEMAC(ch,varname)( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy \ - ) \ -{ \ - PASTEMAC3(ch,ch,ch,kername)( conjat, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - beta, \ - y, incy ); \ -} - -INSERT_GENTFUNC_BASIC( dotxf_ker, DOTXF_KERNEL ) - diff --git a/testsuite/src/test_dotxf.h b/testsuite/src/test_dotxf.h index dcd172029..0274c9ac2 100644 --- a/testsuite/src/test_dotxf.h +++ b/testsuite/src/test_dotxf.h @@ -34,30 +34,3 @@ void libblis_test_dotxf( test_params_t* params, test_op_t* op ); - -// -// Prototype wrapper interfaces to kernel. -// -void bli_dotxf_ker( obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y ); - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname)( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - void* alpha, \ - void* a, inc_t inca, inc_t lda, \ - void* x, inc_t incx, \ - void* beta, \ - void* y, inc_t incy \ - ); - -INSERT_GENTPROT_BASIC( dotxf_ker ) - diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index 125644adb..f8c75d717 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -239,6 +239,8 @@ void libblis_test_gemm_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_gemm( alpha, a, b, beta, c ); + //bli_gemm4m( alpha, a, b, beta, c ); + //bli_gemm3m( alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c index 3f330c158..1685fc2b2 100644 --- a/testsuite/src/test_hemm.c +++ b/testsuite/src/test_hemm.c @@ -256,6 +256,8 @@ void libblis_test_hemm_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_hemm( side, alpha, 
a, b, beta, c ); + //bli_hemm4m( side, alpha, a, b, beta, c ); + //bli_hemm3m( side, alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c index 717c226d6..217dd24b8 100644 --- a/testsuite/src/test_her2k.c +++ b/testsuite/src/test_her2k.c @@ -255,6 +255,8 @@ void libblis_test_her2k_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_her2k( alpha, a, b, beta, c ); + //bli_her2k4m( alpha, a, b, beta, c ); + //bli_her2k3m( alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c index 53e05909e..1f849a36d 100644 --- a/testsuite/src/test_herk.c +++ b/testsuite/src/test_herk.c @@ -245,6 +245,8 @@ void libblis_test_herk_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_herk( alpha, a, beta, c ); + //bli_herk4m( alpha, a, beta, c ); + //bli_herk3m( alpha, a, beta, c ); break; default: diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index f7bee0f97..4d7575682 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -568,7 +568,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, " # of kc x nc blocks %u\n", BLIS_NUM_KC_X_NC_BLOCKS ); libblis_test_fprintf_c( os, " # of mc x nc blocks %u\n", BLIS_NUM_MC_X_NC_BLOCKS ); libblis_test_fprintf_c( os, " block address alignment %u\n", BLIS_CONTIG_ADDR_ALIGN_SIZE ); - libblis_test_fprintf_c( os, " panel stride alignment %u\n", BLIS_CONTIG_STRIDE_ALIGN_SIZE ); libblis_test_fprintf_c( os, " max preload byte offset %u\n", BLIS_MAX_PRELOAD_BYTE_OFFSET ); libblis_test_fprintf_c( os, " actual pool sizes (bytes) \n" ); libblis_test_fprintf_c( os, " for mc x kc blocks of A %u\n", BLIS_MK_POOL_SIZE ); @@ -617,6 +616,40 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) BLIS_EXTEND_NC_C, BLIS_EXTEND_NC_Z ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "4m/3m cache blocksizes 4c 4z 
3c 3z \n" ); + libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n", + BLIS_DEFAULT_4M_MC_C, + BLIS_DEFAULT_4M_MC_Z, + BLIS_DEFAULT_3M_MC_C, + BLIS_DEFAULT_3M_MC_Z ); + libblis_test_fprintf_c( os, " k dimension %5u %5u %5u %5u\n", + BLIS_DEFAULT_4M_KC_C, + BLIS_DEFAULT_4M_KC_Z, + BLIS_DEFAULT_3M_KC_C, + BLIS_DEFAULT_3M_KC_Z ); + libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n", + BLIS_DEFAULT_4M_NC_C, + BLIS_DEFAULT_4M_NC_Z, + BLIS_DEFAULT_3M_NC_C, + BLIS_DEFAULT_3M_NC_Z ); + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "4m/3m cache blksz exts 4c 4z 3c 3z \n" ); + libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_4M_MC_C, + BLIS_EXTEND_4M_MC_Z, + BLIS_EXTEND_3M_MC_C, + BLIS_EXTEND_3M_MC_Z ); + libblis_test_fprintf_c( os, " k dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_4M_KC_C, + BLIS_EXTEND_4M_KC_Z, + BLIS_EXTEND_3M_KC_C, + BLIS_EXTEND_3M_KC_Z ); + libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n", + BLIS_EXTEND_4M_NC_C, + BLIS_EXTEND_4M_NC_Z, + BLIS_EXTEND_3M_NC_C, + BLIS_EXTEND_3M_NC_Z ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "level-3 register blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n", BLIS_DEFAULT_MR_S, @@ -664,10 +697,10 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "level-1f fusing factors s d c z \n" ); libblis_test_fprintf_c( os, " default %5u %5u %5u %5u\n", - BLIS_DEFAULT_FUSE_FAC_S, - BLIS_DEFAULT_FUSE_FAC_D, - BLIS_DEFAULT_FUSE_FAC_C, - BLIS_DEFAULT_FUSE_FAC_Z ); + BLIS_L1F_FUSE_FAC_S, + BLIS_L1F_FUSE_FAC_D, + BLIS_L1F_FUSE_FAC_C, + BLIS_L1F_FUSE_FAC_Z ); libblis_test_fprintf_c( os, " axpyf %5u %5u %5u %5u\n", BLIS_AXPYF_FUSE_FAC_S, BLIS_AXPYF_FUSE_FAC_D, diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c index af4c20cc9..c26fa2dd5 100644 --- a/testsuite/src/test_symm.c +++ 
b/testsuite/src/test_symm.c @@ -256,6 +256,8 @@ void libblis_test_symm_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_symm( side, alpha, a, b, beta, c ); + //bli_symm4m( side, alpha, a, b, beta, c ); + //bli_symm3m( side, alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c index 4b5619fc3..fb74237d6 100644 --- a/testsuite/src/test_syr2k.c +++ b/testsuite/src/test_syr2k.c @@ -255,6 +255,8 @@ void libblis_test_syr2k_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_syr2k( alpha, a, b, beta, c ); + //bli_syr2k4m( alpha, a, b, beta, c ); + //bli_syr2k3m( alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c index 30ffe110c..f83c28c95 100644 --- a/testsuite/src/test_syrk.c +++ b/testsuite/src/test_syrk.c @@ -245,6 +245,8 @@ void libblis_test_syrk_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_syrk( alpha, a, beta, c ); + //bli_syrk4m( alpha, a, beta, c ); + //bli_syrk3m( alpha, a, beta, c ); break; default: diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index a5d420f6a..06edcc35e 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -243,6 +243,8 @@ void libblis_test_trmm_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_trmm( side, alpha, a, b ); + //bli_trmm4m( side, alpha, a, b ); + //bli_trmm3m( side, alpha, a, b ); break; default: diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c index d524ab259..c1a1938b8 100644 --- a/testsuite/src/test_trmm3.c +++ b/testsuite/src/test_trmm3.c @@ -257,6 +257,8 @@ void libblis_test_trmm3_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_trmm3( side, alpha, a, b, beta, c ); + //bli_trmm34m( side, alpha, a, b, beta, c ); + //bli_trmm33m( side, alpha, a, b, beta, c ); break; default: diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c index 2e6778221..534a26960 100644 --- a/testsuite/src/test_trsm.c +++ 
b/testsuite/src/test_trsm.c @@ -178,7 +178,6 @@ void libblis_test_trsm_experiment( test_params_t* params, } else { - //bli_setsc( 1.0, 0.5, &alpha ); bli_setsc( 2.0, 0.0, &alpha ); } @@ -189,8 +188,6 @@ void libblis_test_trsm_experiment( test_params_t* params, // Randomize A, make it densely triangular. bli_randm( &a ); bli_mktrim( &a ); - //bli_setsc( 0.5, 0.0, &kappa ); - //bli_scalm( &kappa, &a ); // Randomize B and save B. bli_randm( &b ); @@ -244,6 +241,8 @@ void libblis_test_trsm_impl( iface_t iface, { case BLIS_TEST_SEQ_FRONT_END: bli_trsm( side, alpha, a, b ); + //bli_trsm4m( side, alpha, a, b ); + //bli_trsm3m( side, alpha, a, b ); break; default: diff --git a/windows/build/bli_config.h b/windows/build/bli_config.h index 855e04865..4191767be 100644 --- a/windows/build/bli_config.h +++ b/windows/build/bli_config.h @@ -100,10 +100,6 @@ // from the contiguous memory allocator. #define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE -// Alignment size used when sizing strides (eg: of packed micro-panels) -// within a block of contiguous memory. -#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16 - // -- MIXED DATATYPE SUPPORT --------------------------------------------------- diff --git a/windows/build/bli_kernel.h b/windows/build/bli_kernel.h index 5cd716731..b7459b434 100644 --- a/windows/build/bli_kernel.h +++ b/windows/build/bli_kernel.h @@ -36,310 +36,6 @@ #define BLIS_KERNEL_H -// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- - -// -- Default cache blocksizes -- - -// -// Constraints: -// -// (1) MC must be a multiple of: -// (a) MR (for zero-padding purposes) -// (b) NR (for zero-padding purposes when MR and NR are "swapped") -// (2) NC must be a multiple of -// (a) NR (for zero-padding purposes) -// (b) MR (for zero-padding purposes when MR and NR are "swapped") -// (3) KC must be a multiple of -// (a) MR and -// (b) NR (for triangular operations such as trmm and trsm). 
-// - -#define BLIS_DEFAULT_MC_S 64 -#define BLIS_DEFAULT_KC_S 128 -#define BLIS_DEFAULT_NC_S 4096 - -#define BLIS_DEFAULT_MC_D 64 -#define BLIS_DEFAULT_KC_D 128 -#define BLIS_DEFAULT_NC_D 4096 - -#define BLIS_DEFAULT_MC_C 64 -#define BLIS_DEFAULT_KC_C 128 -#define BLIS_DEFAULT_NC_C 4096 - -#define BLIS_DEFAULT_MC_Z 64 -#define BLIS_DEFAULT_KC_Z 128 -#define BLIS_DEFAULT_NC_Z 4096 - -// -- Cache blocksize extensions (for optimizing edge cases) -- - -// NOTE: These cache blocksize "extensions" have the same constraints as -// the corresponding default blocksizes above. When these values are -// non-zero, blocksizes used at edge cases are extended (enlarged) if -// such an extension would encompass the remaining portion of the -// matrix dimension. - -#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4) -#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4) -#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4) - -#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4) -#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4) -#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4) - -#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4) -#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4) -#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4) - -#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4) -#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4) -#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4) - -// -- Default register blocksizes for micro-kernel -- - -// NOTE: When using the reference configuration, these register blocksizes -// in the m and n dimensions should all be equal to the size expected by -// the reference micro-kernel(s). 
- -#define BLIS_DEFAULT_MR_S 4 -#define BLIS_DEFAULT_NR_S 4 - -#define BLIS_DEFAULT_MR_D 4 -#define BLIS_DEFAULT_NR_D 4 - -#define BLIS_DEFAULT_MR_C 4 -#define BLIS_DEFAULT_NR_C 4 - -#define BLIS_DEFAULT_MR_Z 4 -#define BLIS_DEFAULT_NR_Z 4 - -// NOTE: If the micro-kernel, which is typically unrolled to a factor -// of f, handles leftover edge cases (ie: when k % f > 0) then these -// register blocksizes in the k dimension can be defined to 1. - -#define BLIS_DEFAULT_KR_S 1 -#define BLIS_DEFAULT_KR_D 1 -#define BLIS_DEFAULT_KR_C 1 -#define BLIS_DEFAULT_KR_Z 1 - -// -- Register blocksize extensions (for packed micro-panels) -- - -// NOTE: These register blocksize "extensions" determine whether the -// leading dimensions used within the packed micro-panels are equal to -// or greater than their corresponding register blocksizes above. - -#define BLIS_EXTEND_MR_S 0 -#define BLIS_EXTEND_NR_S 0 - -#define BLIS_EXTEND_MR_D 0 -#define BLIS_EXTEND_NR_D 0 - -#define BLIS_EXTEND_MR_C 0 -#define BLIS_EXTEND_NR_C 0 - -#define BLIS_EXTEND_MR_Z 0 -#define BLIS_EXTEND_NR_Z 0 - -// Register blocksize extensions in the k dimension are not used. - -#define BLIS_EXTEND_KR_S 0 -#define BLIS_EXTEND_KR_D 0 -#define BLIS_EXTEND_KR_C 0 -#define BLIS_EXTEND_KR_Z 0 - -// -- Default incremental packing blocksizes (n dimension) -- - -// NOTE: These incremental packing blocksizes (for the n dimension) are only -// used by certain blocked variants. But when the *are* used, they MUST be -// be an integer multiple of NR! 
- -#define BLIS_DEFAULT_NI_FAC 16 -#define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S) -#define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D) -#define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C) -#define BLIS_DEFAULT_NI_Z (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z) - - - -// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- - -// NOTE: These values determine high-level cache blocking for level-2 -// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and -// MC = NC = 1000, then a total of four unblocked (or unblocked fused) -// gemv subproblems are called. The blocked algorithms are only useful in -// that they provide the opportunity for packing vectors. (Matrices can also -// be packed here, but this tends to be much too expensive in practice to -// actually employ.) - -#define BLIS_DEFAULT_L2_MC_S 1000 -#define BLIS_DEFAULT_L2_NC_S 1000 - -#define BLIS_DEFAULT_L2_MC_D 1000 -#define BLIS_DEFAULT_L2_NC_D 1000 - -#define BLIS_DEFAULT_L2_MC_C 1000 -#define BLIS_DEFAULT_L2_NC_C 1000 - -#define BLIS_DEFAULT_L2_MC_Z 1000 -#define BLIS_DEFAULT_L2_NC_Z 1000 - - - -// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ - -// -- Default fusing factors for level-1f operations -- - -// NOTE: Default fusing factors are not used by the reference implementations -// of level-1f operations. They are here only for use when these operations -// are optimized. 
- -#define BLIS_DEFAULT_FUSE_FAC_S 8 -#define BLIS_DEFAULT_FUSE_FAC_D 4 -#define BLIS_DEFAULT_FUSE_FAC_C 4 -#define BLIS_DEFAULT_FUSE_FAC_Z 2 - -#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - -#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S -#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D -#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C -#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z - - - -// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------ - -// -- Default register blocksizes for vectors -- - -// NOTE: Register blocksizes for vectors are used when packing -// non-contiguous vectors. Similar to that of KR, they can -// typically be set to 1. 
- -#define BLIS_DEFAULT_VR_S 1 -#define BLIS_DEFAULT_VR_D 1 -#define BLIS_DEFAULT_VR_C 1 -#define BLIS_DEFAULT_VR_Z 1 - - - -// -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- - -// -- gemm -- - -#define GEMM_UKERNEL gemm_ref_mxn - -// -- trsm-related -- - -#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn -#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn - -#define TRSM_L_UKERNEL trsm_l_ref_mxn -#define TRSM_U_UKERNEL trsm_u_ref_mxn - - - -// -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- - -// -- packm -- - -#define PACKM_2XK_KERNEL packm_ref_2xk -#define PACKM_4XK_KERNEL packm_ref_4xk -#define PACKM_6XK_KERNEL packm_ref_6xk -#define PACKM_8XK_KERNEL packm_ref_8xk -#define PACKM_10XK_KERNEL packm_ref_10xk -#define PACKM_12XK_KERNEL packm_ref_12xk -#define PACKM_14XK_KERNEL packm_ref_14xk -#define PACKM_16XK_KERNEL packm_ref_16xk - -// -- unpackm -- - -#define UNPACKM_2XK_KERNEL unpackm_ref_2xk -#define UNPACKM_4XK_KERNEL unpackm_ref_4xk -#define UNPACKM_6XK_KERNEL unpackm_ref_6xk -#define UNPACKM_8XK_KERNEL unpackm_ref_8xk -#define UNPACKM_10XK_KERNEL unpackm_ref_10xk -#define UNPACKM_12XK_KERNEL unpackm_ref_12xk -#define UNPACKM_14XK_KERNEL unpackm_ref_14xk -#define UNPACKM_16XK_KERNEL unpackm_ref_16xk - - - -// -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- - -// -- axpy2v -- - -#define AXPY2V_KERNEL axpy2v_unb_var1 - -// -- dotaxpyv -- - -#define DOTAXPYV_KERNEL dotaxpyv_unb_var1 - -// -- axpyf -- - -#define AXPYF_KERNEL axpyf_unb_var1 - -// -- dotxf -- - -#define DOTXF_KERNEL dotxf_unb_var1 - -// -- dotxaxpyf -- - -#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1 - - - -// -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- - -// -- addv -- - -#define ADDV_KERNEL addv_unb_var1 - -// -- axpyv -- - -#define AXPYV_KERNEL axpyv_unb_var1 - -// -- copyv -- - -#define COPYV_KERNEL copyv_unb_var1 - -// -- dotv -- - -#define DOTV_KERNEL dotv_unb_var1 - -// -- 
dotxv -- - -#define DOTXV_KERNEL dotxv_unb_var1 - -// -- invertv -- - -#define INVERTV_KERNEL invertv_unb_var1 - -// -- scal2v -- - -#define SCAL2V_KERNEL scal2v_unb_var1 - -// -- scalv -- - -#define SCALV_KERNEL scalv_unb_var1 - -// -- setv -- - -#define SETV_KERNEL setv_unb_var1 - -// -- subv -- - -#define SUBV_KERNEL subv_unb_var1 - -// -- swapv -- - -#define SWAPV_KERNEL swapv_unb_var1