mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Conflicts: frame/1m/packm/bli_packm_blk_var1.c
This commit is contained in:
916
CHANGELOG
916
CHANGELOG
@@ -1,4 +1,918 @@
|
||||
commit 089048d5895a30221b6b1976c9be93ad6443420d (HEAD, tag: 0.1.0, origin/master, master)
|
||||
commit fde5f1fdece19881f50b142e8611b772a647e6d2 (HEAD, tag: 0.1.1, origin/master, origin/HEAD, master)
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Feb 25 13:34:56 2014 -0600
|
||||
|
||||
Added extensive support for configuration defaults.
|
||||
|
||||
Details:
|
||||
- Standard names for reference kernels (levels-1v, -1f and 3) are now
|
||||
macro constants. Examples:
|
||||
BLIS_SAXPYV_KERNEL_REF
|
||||
BLIS_DDOTXF_KERNEL_REF
|
||||
BLIS_ZGEMM_UKERNEL_REF
|
||||
- Developers no longer have to name all datatype instances of a kernel
|
||||
with a common base name; [sdcz] datatype flavors of each kernel or
|
||||
micro-kernel (level-1v, -1f, or 3) may now be named independently.
|
||||
This means you can now, if you wish, encode the datatype-specific
|
||||
register blocksizes in the name of the micro-kernel functions.
|
||||
- Any datatype instances of any kernel (1v, 1f, or 3) that is left
|
||||
undefined in bli_kernel.h will default to the corresponding reference
|
||||
implementation. For example, if BLIS_DGEMM_UKERNEL is left undefined,
|
||||
it will be defined to be BLIS_DGEMM_UKERNEL_REF.
|
||||
- Developers no longer need to name level-1v/-1f kernels with multiple
|
||||
datatype chars to match the number of types the kernel WOULD take in
|
||||
a mixed type environment, as in bli_dddaxpyv_opt(). Now, one char is
|
||||
sufficient, as in bli_daxpyv_opt().
|
||||
- There is no longer a need to define an obj_t wrapper to go along with
|
||||
your level-1v/-1f kernels. The framework now provides a _kernel()
|
||||
function which serves as the obj_t wrapper for whatever kernels are
|
||||
specified (or defaulted to) via bli_kernel.h
|
||||
- Developers no longer need to prototype their kernels, and thus no
|
||||
longer need to include any prototyping headers from within
|
||||
bli_kernel.h. The framework now generates kernel prototypes, with the
|
||||
proper type signature, based on the kernel names defined (or defaulted
|
||||
to) via bli_kernel.h.
|
||||
- If the complex datatype x (of [cz]) implementation of the gemm micro-
|
||||
kernel is left undefined by bli_kernel.h, but its same-precision real
|
||||
domain equivalent IS defined, BLIS will use a 4m-based implementation
|
||||
for the datatype x implementations of all level-3 operations, using
|
||||
only the real gemm micro-kernel.
|
||||
|
||||
commit 15b51e990f1d21333b5f7af97c211756247336e5
|
||||
Merge: 6363a9f fc04b5e
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Feb 21 09:04:32 2014 -0600
|
||||
|
||||
Merge branch 'master' of github.com:fgvanzee/blis
|
||||
|
||||
commit fc04b5eb69868c341ce03f5ef1f02de4b8c121b0
|
||||
Merge: b29e1c2 d1813c9
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Feb 21 09:04:13 2014 -0600
|
||||
|
||||
Merge pull request #3 from figual/master
|
||||
|
||||
New ARM armv7a kernels and Assembly file consideration in Makefile
|
||||
|
||||
commit d1813c9dee34410833db5061e6588ec1a6c9ecd4
|
||||
Author: Francisco Igual <figual@pandaboard.(none)>
|
||||
Date: Fri Feb 21 15:14:31 2014 +0100
|
||||
|
||||
Added new armv7a micro-kernels and configuration files from Werner Saar.
|
||||
|
||||
commit 0cd098c03a000ed9426a7e9135190696da8cadbc
|
||||
Author: Francisco Igual <figual@pandaboard.(none)>
|
||||
Date: Fri Feb 21 15:12:30 2014 +0100
|
||||
|
||||
o Modified Makefile to consider .S assembly microkernels.
|
||||
|
||||
commit 6363a9f658257fe3d814a3dce5308f807adb54a2
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Feb 19 17:00:52 2014 -0600
|
||||
|
||||
Added level-3 support for complex via 4m-/3m.
|
||||
|
||||
Details:
|
||||
- Added the ability to induce complex domain level-3 operations via new
|
||||
virtual complex micro-kernels which are implemented via only real
|
||||
domain micro-kernels. Two new implementations are provided: 4m and 3m.
|
||||
4m implements complex matrix multiplication in terms of four real
|
||||
matrix multiplications, whereas 3m uses only three and thus is
|
||||
capable of even higher (than peak) performance. However, the 3m method
|
||||
has somewhat weaker numerical properties, making it less desirable
|
||||
in general.
|
||||
- Further refined packing routines, which were recently revamped, and
|
||||
added packing functionality for 4m and 3m.
|
||||
- Some modifications to trmm and trsm macro-kernels to facilitate indexing
|
||||
into micro-panels which were packed for 4m/3m virtual kernels.
|
||||
- Added 4m and 3m interfaces for each level-3 operation.
|
||||
- Various other minor changes to facilitate 4m/3m methods.
|
||||
|
||||
commit b29e1c2b278c177e104c84ba462820ee8296df6c
|
||||
Merge: ee60377 bd3c7ec
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Feb 14 14:11:54 2014 -0600
|
||||
|
||||
Merge pull request #2 from tlrmchlsmth/master
|
||||
|
||||
Fixes and improvements to xeon phi implementation.
|
||||
|
||||
commit bd3c7ecfb54a9b9851c7d364f41c21e4cff52f6f
|
||||
Author: Tyler Smith <tms@cs.utexas.edu>
|
||||
Date: Fri Feb 14 14:05:57 2014 -0600
|
||||
|
||||
Removing changes to input.general and input.operations
|
||||
|
||||
commit ce066863683cb4e910270cf8ab8e138b01ff3358
|
||||
Author: Tyler Smith <tms@cs.utexas.edu>
|
||||
Date: Fri Feb 14 13:40:24 2014 -0600
|
||||
|
||||
Fixed more Xeon Phi bugs, especially with scattered update
|
||||
|
||||
commit 31134b5c7076423aee1b4f494e925f27171d97e6
|
||||
Author: Tyler Smith <tms@cs.utexas.edu>
|
||||
Date: Fri Feb 14 11:19:44 2014 -0600
|
||||
|
||||
Some fixes, changes, and improvements to the microkernel to the Xeon Phi
|
||||
|
||||
commit ee60377e467862b9d8a7205c45dce5cf66c78c46
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Feb 13 14:03:31 2014 -0600
|
||||
|
||||
Shifted some fields in info_t.
|
||||
|
||||
Details:
|
||||
- Shifted the pack order, pack buffer type, and structure type fields
|
||||
to make room for an extra bit in the pack type/status field.
|
||||
|
||||
commit bd3ab1ad4cf42f8bc30ab262acf8eccb49bb1a08
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Feb 13 09:29:55 2014 -0600
|
||||
|
||||
Minor fixes to trsm consistent with prev on trmm.
|
||||
|
||||
Details:
|
||||
- Removed use of bli_min() and bli_max() that were only being used to
|
||||
try to support situations where the diagonal would intersect the
|
||||
short end of some micro-panels, which is a situation that is disallowed
|
||||
at a higher level by various constraints on the register and cache
|
||||
blocksize. This only affected trsm_ll and trsm_lu.
|
||||
- Use panel stride as passed into the macro-kernel rather than compute
|
||||
it via k and PACKMR/PACKNR. This affects all macro-kernels of trsm.
|
||||
|
||||
commit 6260b0b5f8bd248f3f66e5a1c6854bdbd9d02ad0
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Feb 13 09:19:56 2014 -0600
|
||||
|
||||
Fixed obscure bug in trmm_ll, trmm_lu.
|
||||
|
||||
Details:
|
||||
- Fixed an obscure bug in left-hand trmm that would only manifest when
|
||||
non-zero register blocksize extensions (PACKMR > MR or PACKNR > NR)
|
||||
are used.
|
||||
- Removed use of bli_min() and bli_max() that were only being used to
|
||||
try to support situations where the diagonal would intersect the
|
||||
short end of some micro-panels, which is a situation that is disallowed
|
||||
at a higher level by various constraints on the register and cache
|
||||
blocksize. This only affected trmm_ll and trmm_lu.
|
||||
- Use panel stride as passed into the macro-kernel rather than compute
|
||||
it via k and PACKMR/PACKNR. This affects all macro-kernels of trmm.
|
||||
|
||||
commit 16915c1c1e55c660bf82141cdadf7c0860d5b464
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Feb 11 10:54:19 2014 -0600
|
||||
|
||||
Fixed an obscure bug in packm_cxk().
|
||||
|
||||
Details:
|
||||
- Fixed a bug in packm_cxk() whereby the packm ukernel was being chosen
|
||||
from ldp, which is always equal to PACKMR or PACKNR. The problem with
|
||||
this is that the pack ukernels were implicitly assuming that the
|
||||
panel dimension of the panel being packed was equal to ldp, which
|
||||
is not the case when the register blocksize extensions are non-zero
|
||||
(ie: when PACKMR > MR or PACKNR > NR, whichever is applicable). This
|
||||
problem has been fixed by passing ldp into the pack ukernels, which
|
||||
now walk through the packed micro-panel region by incrementing by this
|
||||
value, rather than incrementing by the inherent panel dimension value
|
||||
assumed by each packm ukernel (e.g. 4 in the case of packm_ref_4xk).
|
||||
- Also fixed a very minor edge case inefficiency whereby pack ukernels
|
||||
smaller than the default were not being used in edge cases, and instead
|
||||
those situations were being handled by scal2m. This is related to the
|
||||
issue above, because the pack ukernel itself was being chosen based on
|
||||
ldp instead of the panel dimension.
|
||||
|
||||
commit b7da57b282c5a5e2208946e60309d2352f55351d
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Feb 11 10:28:23 2014 -0600
|
||||
|
||||
Updated calls to packm_blk_var2() in testsuite.
|
||||
|
||||
Details:
|
||||
- In ukernel testsuite modules, replaced calls to packm_blk_var2() with
|
||||
_var1(). Meant to include this in previous commit.
|
||||
|
||||
commit c255a293e25b2223c88e8800267cd06ad2a90041
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Feb 10 14:31:24 2014 -0600
|
||||
|
||||
Consolidated packm_blk_var2 and var3.
|
||||
|
||||
Details:
|
||||
- Consolidated the functionality previously supported by packm_blk_var2()
|
||||
and packm_blk_var3() into a new variant, packm_blk_var1().
|
||||
- Updates to packm_gen_cxk(), packm_herm_cxk(), and packm_tri_cxk()
|
||||
to accommodate above changes.
|
||||
- Removed packm_blk_var3() and retired packm_blk_var2() to
|
||||
frame/1m/packm/old.
|
||||
- Updated all level-3 _cntl_init() functions so that the new, more
|
||||
versatile packm_blk_var1 is used for all level-3 matrix packing.
|
||||
|
||||
commit 32d8f264ae7b28155f5d7b21dcc5ecb78da2e0ab
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Sun Feb 9 10:07:37 2014 -0600
|
||||
|
||||
Refactored packm variants.
|
||||
|
||||
Details:
|
||||
- Revised packm_blk_var2() and _var3() by encapsulating the general,
|
||||
hermitian/symmetric, and triangular panel-packing subproblems into
|
||||
separate functions: packm_gen_cxk(), packm_herm_cxk(), and
|
||||
packm_tri_cxk(), respectively. Also, homogenized the packm code as
|
||||
well as the new specialized packm_*_cxk() code to further improve
|
||||
readability.
|
||||
|
||||
commit 6c8067028707947fcdf4f856a272e15bb9ed91e3
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Feb 7 11:27:15 2014 -0600
|
||||
|
||||
Renamed enumerated type in testsuite and modules.
|
||||
|
||||
Details:
|
||||
- Renamed the test suite's "mt_impl_t" enumerated type to "iface_t", and
|
||||
renamed all corresponding "impl" variables to "iface".
|
||||
|
||||
commit 6c12598b1bc567f0b08f58aebdc753a1c1390378
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Feb 6 18:26:35 2014 -0600
|
||||
|
||||
Employ simpler INSERT_ macro for ref ukernels.
|
||||
|
||||
Details:
|
||||
- Defined a new macro, INSERT_GENTFUNC_BASIC0, which takes only one
|
||||
argument--the base name of the function--and employed this macro
|
||||
in the reference micro-kernel files instead of the _BASIC macro,
|
||||
which takes one auxiliary argument. That argument was not being
|
||||
used and probably just acted to unnecessarily obfuscate.
|
||||
|
||||
commit 32cae66326b68706d0e695cfd60c9ca5bc32c534
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Feb 6 18:06:42 2014 -0600
|
||||
|
||||
Fixed some instances of sloppy 'restrict' usage.
|
||||
|
||||
Details:
|
||||
- Fixed some technical incorrectness with some usage of the 'restrict'
|
||||
keyword in the reference trsm micro-kernels.
|
||||
- Tweak to testsuite/Makefile that causes rebuild if libblis was
|
||||
touched.
|
||||
|
||||
commit 7aceef7683e2a2aff3c7ec2a73508036af2e19e2
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Feb 6 17:31:19 2014 -0600
|
||||
|
||||
Updated comments in macro-kernels.
|
||||
|
||||
Details:
|
||||
- Updated (and fixed some errors in) the "Assumptions/assertions" comment
|
||||
section of macro-kernels.
|
||||
- Changed register blocksizes of reference configuration to MR = 8 and
|
||||
NR = 4. It's always good for MR != NR in the reference configuration
|
||||
since it may help uncover bugs related to non-square micro-kernels.
|
||||
|
||||
commit 8fd292aa78950bcdf556605718f09d13f9575abc
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Feb 6 14:32:21 2014 -0600
|
||||
|
||||
Pass panel dimensions into macro-kernels.
|
||||
|
||||
Details:
|
||||
- Modified the interfaces to the datatype-specific macro-kernels so that:
|
||||
- pd_a and pd_b are passed in (which contain the panel dimensions of
|
||||
packed panels of a and b).
|
||||
- rs_a and cs_b are no longer passed in (they were guaranteed to be 1).
|
||||
- Modified implementations of datatype-specific macro-kernels so pd_a,
|
||||
pd_b, cs_a, and rs_b are used instead of cpp macros for MR, NR, PACKMR,
|
||||
and PACKNR, respectively.
|
||||
- Declare temporary c matrices (ct) as being maxmr-by-maxnr, which for now
|
||||
is equivalent to being mr-by-nr. maxmr and maxnr are declared in a new
|
||||
header file bli_kernel_post_macro_defs.h.
|
||||
|
||||
commit 3404e6657eabb017cd1580a2f1dd8e6fb13df923
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Feb 5 11:19:10 2014 -0600
|
||||
|
||||
Deprecated incremental blocksize macro const defs.
|
||||
|
||||
Details:
|
||||
- Removed macro constant definitions related to incremental blocksizes
|
||||
from all configurations' bli_kernel.h files. This change is minor and
|
||||
is mostly a cleanup related to a previous commit.
|
||||
|
||||
commit 1e9afd39a63e0a58167d4439c1a0a880a4a35657
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Feb 4 20:15:19 2014 -0600
|
||||
|
||||
Comment updates (removed vestiges of "bd").
|
||||
|
||||
commit 5cf58f7c2d5bc0d2d94d9576f7158d8f133b7aac
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Feb 4 09:15:19 2014 -0600
|
||||
|
||||
Added early returns for "object is zeros" case.
|
||||
|
||||
Details:
|
||||
- Added some logic to packm_init(), pack_int() and gemm_int() so that
|
||||
(a) objects marked as BLIS_ZEROS are not packed, and (b) those
|
||||
objects are not computed with. This functionality is not currently
|
||||
needed by any existing implementations, but may be used in the
|
||||
future.
|
||||
|
||||
commit 6bbd4be769a9b344a55abe5ddaca1a99fd29f7b4
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Feb 3 13:15:25 2014 -0600
|
||||
|
||||
Added 'f' on some gemm and trmm blocked variants.
|
||||
|
||||
Details:
|
||||
- Added 'f' to some block variant files/functions to be consistent with
|
||||
other file/functions' naming convention. Here, the f indicates
|
||||
partitioning in the "forward" direction.
|
||||
|
||||
commit eb13cb2c6b182df5e2a9b88c76f50e2cee25b9e0
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Feb 3 11:07:01 2014 -0600
|
||||
|
||||
Removed redundant non-gemm blksz_t creation.
|
||||
|
||||
Details:
|
||||
- Removed code that creates duplicate blksz_t objects for herk, trmm,
|
||||
and trsm. Instead, the gemm blksz_t objects are accessed via extern
|
||||
and used directly. This reduces the amount of code associated with
|
||||
each of the three _cntl_init() and _cntl_finalize() functions.
|
||||
|
||||
commit 0a023a7d9e58e53b8c204a5f49aa8ca9afeba938
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Jan 29 14:02:08 2014 -0600
|
||||
|
||||
Introduced new level-3 front-end layer.
|
||||
|
||||
Details:
|
||||
- Added new _front() functions for each level-3 operation. This is done
|
||||
so that the choosing of the control tree (and *only* the choosing of
|
||||
the control tree) happens in what was previously the "front end"
|
||||
(e.g. bli_gemm()). That control tree is then passed into the _front()
|
||||
function, which then performs up-front tasks such as parameter
|
||||
checking.
|
||||
|
||||
commit 251c5d112196d37b183e554bc9d406104aed65fb
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Jan 28 19:40:29 2014 -0600
|
||||
|
||||
Removed redundant hemm, her2k control trees.
|
||||
|
||||
Details:
|
||||
- Removed code that generated a control tree specifically for hemm and
|
||||
symm. Instead, the gemm control tree is now configured so that it
|
||||
works for gemm, hemm, or symm.
|
||||
- Retired most her2k code, as it was not being used. (Currently, her2k is
|
||||
implemented as two invocations of herk.) I couldn't think of many
|
||||
situations where her2k variants were needed.
|
||||
- Removed some older her2k code.
|
||||
|
||||
commit 5a36e5bf2f59d1e85d6dbce32a07d604c5e82d11
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jan 27 11:13:00 2014 -0600
|
||||
|
||||
Embed func_t microkernel objects in control trees.
|
||||
|
||||
Details:
|
||||
- Modified all control tree node definitions to include a new field of
|
||||
type func_t*, which is similar to a blksz_t except that it contains
|
||||
one function pointer (each typed simply as void*) for each datatype.
|
||||
We use the func_t* to embed pointers to the micro-kernels to use for
|
||||
the leaf-level nodes of each control tree. This change is a natural
|
||||
extension of control trees and will allow more flexibility in the
|
||||
future.
|
||||
- Modified all macro-kernel wrappers to obtain the micro-kernel pointers
|
||||
from the incoming (previously ignored) control tree node and then pass
|
||||
the queried pointer into the datatype-specific macro-kernel code, which
|
||||
then casts the pointer to the appropriate type (new typedefs residing
|
||||
in bli_kernel_type_defs.h) and then uses the pointer to call the micro-
|
||||
kernel. Thus, the micro-kernel function is no longer "hard-coded" (that
|
||||
is, determined when the datatype-specific macro-kernel functions are
|
||||
instantiated by the C preprocessor).
|
||||
- Added macros to bli_kernel_macro_defs.h that build datatype-specific
|
||||
base names if they do not exist already, and then uses those to build
|
||||
datatype-specific micro-kernel function names. This will allow
|
||||
developers extra flexibility if they wanted to, for example, name each
|
||||
of their datatype-specific micro-kernels differently (e.g. double
|
||||
real might be named bli_dgemm_opt_4x4() while double complex might be
|
||||
named bli_zgemm_opt_2x2()).
|
||||
- Inserted appropriate code into _cntl_init() functions that allocates
|
||||
and initializes a func_t object for the corresponding micro-kernels.
|
||||
The gemm ukernel func_t object is created once, in bli_gemm_cntl_init(),
|
||||
and then reused via extern wherever possible.
|
||||
|
||||
commit 6cbd6f1c7f1915180aa28939833afde48665c5ae
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Jan 24 10:38:29 2014 -0600
|
||||
|
||||
Removed commented mixed domain macro-kernel code.
|
||||
|
||||
Details:
|
||||
- Removed commented-out code from macro-kernels that was supposed to
|
||||
facilitate implementing mixed domain (complex times real) matrix
|
||||
multiplication. This functionality is still (probably) possible,
|
||||
but I'm getting tired of looking at the code every time I edit
|
||||
a macro-kernel. Plus, there are probably ways of doing it at a
|
||||
higher level, via control trees.
|
||||
|
||||
commit 29778be1119f1a884330d7f8dc424a2df4101d58
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Jan 22 16:03:11 2014 -0600
|
||||
|
||||
Removed b_aux field from cntl nodes.
|
||||
|
||||
Details:
|
||||
- Removed b_aux field from all control tree node definitions. This field
|
||||
was being used in certain optimizations (incremental blocking) that were
|
||||
not actually being employed within BLIS, and are probably not employed
|
||||
by others.
|
||||
- Updated all _cntl_obj_create() function definitions and invocations
|
||||
according to above change.
|
||||
- Retired bli_gemm_blk_var4.c, which was one such function that employed
|
||||
incremental blocking, but which was never called by BLIS itself.
|
||||
|
||||
commit 06ac727a42ec9e832c7832745036702014638f99
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Jan 15 16:44:52 2014 -0600
|
||||
|
||||
Updated some comments in level-3 front ends.
|
||||
|
||||
commit d628bf1da1560f1f5126a1ddfed8714f0a4b8da3
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Jan 15 11:40:12 2014 -0600
|
||||
|
||||
Consolidated pack_t enums; retired VECTOR value.
|
||||
|
||||
Details:
|
||||
- Changed the pack_t enumerations so that BLIS_PACKED_VECTOR no longer has
|
||||
its own value, and instead simply aliases to BLIS_PACKED_UNSPEC. This
|
||||
makes room in the three pack_t bits of the info field of obj_t so that
|
||||
two values are now unused, and may be used for other future purposes.
|
||||
- Updated sloppy terminology usage in comments in level-2 front-ends.
|
||||
(Replaced "is contiguous" with more accurate "has unit stride".)
|
||||
|
||||
commit ddc8c1c379b4787be5954802906593d7ea144452
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jan 13 14:55:43 2014 -0600
|
||||
|
||||
Suppress warning in Makefile (UNINSTALL_LIBS).
|
||||
|
||||
Details:
|
||||
- Redirect errors to /dev/null when using 'find' to locate libraries that
|
||||
would be uninstalled upon executing "make uninstall-old". Before, if the
|
||||
Makefile was read before $(INSTALL_PREFIX)/lib existed, a "No such file
|
||||
or directory" message was emitted. This message was harmless, but is now
|
||||
suppressed in this situation.
|
||||
|
||||
commit f8f67d7251bffc05020e20527c100c8115fd5e55
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Jan 10 09:06:11 2014 -0600
|
||||
|
||||
Typecast bli_getopt() return value in testsuite.
|
||||
|
||||
Details:
|
||||
- In the test suite driver, inserted an explicit typecast of the return
|
||||
value of bli_getopt() prior to parsing. The lack of typecast caused a
|
||||
problem on at least one system whereby a return value of -1 was
|
||||
interpreted as a garbage character. Thanks to Francisco Igual for finding
|
||||
and submitting this fix.
|
||||
|
||||
commit e7f154fe2ed3e10e2323cefe5d25c2c23ac902c4
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Jan 10 08:48:07 2014 -0600
|
||||
|
||||
Applied edge case fix to arm/neon microkernel.
|
||||
|
||||
Details:
|
||||
- Applied an edge case bugfix, courtesy of Francisco Igual, to the current
|
||||
double precision real gemm microkernel in kernels/arm/neon/3.
|
||||
|
||||
commit 89c76a8a51d070d263c13bfa5ace65769509f2b4
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Jan 9 12:08:37 2014 -0600
|
||||
|
||||
Allow building outside source distribution.
|
||||
|
||||
Details:
|
||||
- Modified build system (mostly configure and top-level Makefile) so that
|
||||
a user can build a BLIS library outside of the top-level directory of
|
||||
the source distribution.
|
||||
- Added "test" target to Makefile so that the user can run "make test",
|
||||
which will compile, link, and run the testsuite binary. This works even
|
||||
if the build directory is externally located, thanks to the test suite
|
||||
binary's new -g and -o command-line options. Also, when creating the
|
||||
test suite via the top-level Makefile, the linking is against the
|
||||
local archive, in lib/<configname>, rather than at <install_prefix>/lib.
|
||||
- Modified testsuite/Makefile so that it links against the library built
|
||||
locally, in ../lib/<configname>.
|
||||
- Added "-lm" to LDFLAGS of most configurations' make_defs.mk.
|
||||
- Various other cleanups to build system.
|
||||
|
||||
commit 12fa82ec12cc340ab28552997d9d50f7c98691f8
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Jan 8 16:09:26 2014 -0600
|
||||
|
||||
Implemented bli_getopt().
|
||||
|
||||
Details:
|
||||
- Added bli_getopt.c and .h files to frame/base. These files implement
|
||||
a custom version of getopt(), which may be used to parse command line
|
||||
options passed into a program via argc/argv. I am implementing this
|
||||
function myself, as opposed to using the version available via unistd.h,
|
||||
for portability reasons, as the only requirement is string.h (which
|
||||
is available via the standard C library).
|
||||
- Modified test suite to allow the user to specify the file name (and/or
|
||||
path) to the parameters and operations input files: -g may be used to
|
||||
specify the general input file and -o to specify the operations input
|
||||
file). If -g or -o or both are not given, default filenames are assumed
|
||||
(as well as their existence in the current directory).
|
||||
|
||||
commit cafb58e86ea5cfb21b9eedc57ca8ebbf24252098
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jan 6 13:28:36 2014 -0600
|
||||
|
||||
Updated template micro-kernels to use auxinfo_t.
|
||||
|
||||
Details:
|
||||
- Updated template micro-kernel implementations (located in
|
||||
config/template/kernels), to adhere to the new auxinfo_t interface.
|
||||
Meant to include this change in a0331fb1.
|
||||
- Changed template configuration to use 64-bit integers (for both BLIS
|
||||
and the BLAS compatibility layer).
|
||||
|
||||
commit 9ab126b499c3805045020cb89a8a5848e28d3bf5
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Jan 6 12:13:26 2014 -0600
|
||||
|
||||
Removed error checks in netlib->BLIS param mapping
|
||||
|
||||
Details:
|
||||
- Disabled error checking in netlib-to-BLIS parameter mapping functions.
|
||||
If the char value input to these functions was not one of the defined
|
||||
values, bli_check_error_code() with the appropriate error code value
|
||||
would be called, resulting in an abort(). This was unnecessary and
|
||||
redundant since these routines are currently only used within the
|
||||
BLAS compatibility layer, and they are only called AFTER parameter
|
||||
checking has already been performed on the original BLAS char values.
|
||||
If the application tried to override xerbla() to prevent an abort()
|
||||
from being called, this error checking would still get in the way.
|
||||
Thus, instead of reporting the error situation to the framework (ie:
|
||||
calling abort()), an arbitrary BLIS parameter value is now chosen and
|
||||
the function returns normally. Thanks to Jeff Hammond for finding and
|
||||
reporting this issue.
|
||||
|
||||
commit 2cb13600f9f9601c60e7f96f4ca159d169ade9cb
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Jan 3 12:29:13 2014 -0600
|
||||
|
||||
Updated year in copyright headers to 2014.
|
||||
|
||||
commit 290fa54e0083c9c837188b8321b13b1b282e7b0c
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Dec 20 14:10:26 2013 -0600
|
||||
|
||||
Store variable panel strides in trmm/trsm auxinfo.
|
||||
|
||||
Details:
|
||||
- Changed the value being stored into the auxinfo_t structure in trmm
|
||||
and trsm macro-kernels. Whereas before we stored whatever value was
|
||||
provided to the macro-kernel implementation via ps_a/ps_b, now we
|
||||
store the stride that will advance to the next variable-length
|
||||
micro-panel of the triangular matrix A (left) or B (right).
|
||||
- Whitespace changes to the files affected above.
|
||||
|
||||
commit e3a6c7e77667fd749248df3f75f880266c3136ec
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Dec 19 16:29:31 2013 -0600
|
||||
|
||||
Macroized conditionals for a2/b2 in macro-kernels.
|
||||
|
||||
Details:
|
||||
- Replaced conditional expressions in macro-kernels related to computing
|
||||
the addresses a2 and b2 (a_next and b_next) with a preprocessor macro
|
||||
invocation, bli_is_last_iter(), that tests the same condition.
|
||||
- Updated gemm_ukr module to use auxinfo_t argument.
|
||||
- Whitespace changes in test suite ukr modules.
|
||||
|
||||
commit a0331fb10a50393e31d16339053b75b944132da1
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Dec 19 14:50:11 2013 -0600
|
||||
|
||||
Introduced auxinfo_t argument to micro-kernels.
|
||||
|
||||
Details:
|
||||
- Removed a_next and b_next arguments to micro-kernels and replaced them
|
||||
with a pointer to a new datatype, auxinfo_t, which is simply a struct
|
||||
that holds a_next and b_next. The struct may hold other auxiliary
|
||||
information that may be useful to a micro-kernel, such as micro-panel
|
||||
stride. Micro-kernels may access struct fields via accessor macros
|
||||
defined in bli_auxinfo_macro_defs.h.
|
||||
- Updated all instances of micro-kernel definitions, micro-kernel calls,
|
||||
as well as macro-kernels (for declaring and initializing the structs)
|
||||
according to above change.
|
||||
|
||||
commit 392428dea4001fe4384efe29f6cde32f8abeeb35
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Dec 12 19:01:47 2013 -0600
|
||||
|
||||
Added "ri" scalar macros.
|
||||
|
||||
Details:
|
||||
- Added set of basic scalar macros that take arguments' real and
|
||||
imaginary components separately, named like the previous set except
|
||||
with the "ris" (instead of "s") suffix.
|
||||
- Redefined the previous set of scalar macros (those that take arguments
|
||||
"whole") in terms of the new "ri" set.
|
||||
- Renamed setris and getris macros to sets and gets.
|
||||
- Renamed setimag0 macros to seti0s.
|
||||
- Use bli_?1 macro instead of a local constant in bla_trmv.c, bla_trsv.c.
|
||||
|
||||
commit f60c8adc2f61eaba06b892f4e73000159de93056
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Dec 10 14:39:56 2013 -0600
|
||||
|
||||
Minor updates to dunnington configuration.
|
||||
|
||||
Details:
|
||||
- Added commented alternatives to dunnington configuration's bli_kernel.h.
|
||||
- Minor reformatting of optimization flag variables in make_defs.mk.
|
||||
|
||||
commit 4ef20150492db254b5baf2368add62e19b0ac11b
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Dec 9 18:53:03 2013 -0600
|
||||
|
||||
Tweaks to dunnington configuration (x86_64/core2).
|
||||
|
||||
Details:
|
||||
- Updated BLIS_DEFAULT_KC_D from 256 to 384.
|
||||
- Enabled cache blocksize extension of up to 25% for MC and KC (for
|
||||
double-precision real).
|
||||
|
||||
commit 5ad2ce7bf5ba3ea955e6d517bfd270e02820263b
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Dec 9 18:30:49 2013 -0600
|
||||
|
||||
Minor x86_64 (core2) kernel fixes.
|
||||
|
||||
Details:
|
||||
- Fixed copy-and-paste bug whereby [scz]gemmtrsm_u_opt_d4x4 kernels
|
||||
for x86_64/core2 were calling the wrong reference code (l instead
|
||||
of u).
|
||||
- Fixed some unused variables in x86_64/core2 dotaxpyv and dotxaxpyf
|
||||
kernels.
|
||||
- Minor typecasting fix in testsuite/src/test_libblis.c.
|
||||
- Makefile updates.
|
||||
|
||||
commit d289f5d3a9c0e1a68a17c1c32b736e282a289c4c
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Dec 5 10:56:13 2013 -0600
|
||||
|
||||
Whitespace changes to level-2 blocked variants.
|
||||
|
||||
Details:
|
||||
- Joined some lines in level-2 blocked variants to match formatting used
|
||||
in level-3 blocked variants.
|
||||
- Streamlined implementation of bli_obj_equals() in bli_query.c.
|
||||
|
||||
commit b444489f100d218bc8ef29b01ff8489c358559f9
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Dec 3 16:08:30 2013 -0600
|
||||
|
||||
Added new "attached" scalar representation.
|
||||
|
||||
Details:
|
||||
- Added infrastructure to support a new scalar representation, whereby
|
||||
every object contains an internal scalar that defaults to 1.0. This
|
||||
facilitates passing scalars around without having to house them in
|
||||
separate objects. These "attached" scalars are stored in the internal
|
||||
atom_t field of the obj_t struct, and are always stored to be the same
|
||||
datatype as the object to which they are attached. Level-3 variants no
|
||||
longer take scalar arguments, however, level-3 internal back-ends still
|
||||
do; this is so that the calling function can perform subproblems such
|
||||
as C := C - alpha * A * B on-the-fly without needing to change either
|
||||
of the scalars attached to A or B.
|
||||
- Removed scalar argument from packm_int().
|
||||
- Observe and apply attached scalars in scalm_int(), and removed scalar
|
||||
from interface of scalm_unb_var1().
|
||||
- Renamed the following functions (and corresponding invocations):
|
||||
|
||||
bli_obj_init_scalar_copy_of()
|
||||
-> bli_obj_scalar_init_detached_copy_of()
|
||||
bli_obj_init_scalar() -> bli_obj_scalar_init_detached()
|
||||
bli_obj_create_scalar_with_attached_buffer()
|
||||
-> bli_obj_create_1x1_with_attached_buffer()
|
||||
bli_obj_scalar_equals() -> bli_obj_equals()
|
||||
|
||||
- Defined new functions:
|
||||
|
||||
bli_obj_scalar_detach()
|
||||
bli_obj_scalar_attach()
|
||||
bli_obj_scalar_apply_scalar()
|
||||
bli_obj_scalar_reset()
|
||||
bli_obj_scalar_has_nonzero_imag()
|
||||
bli_obj_scalar_equals()
|
||||
|
||||
- Placed all bli_obj_scalar_* functions in a new file, bli_obj_scalar.c.
|
||||
- Renamed the following macros:
|
||||
|
||||
bli_obj_scalar_buffer() -> bli_obj_buffer_for_1x1()
|
||||
bli_obj_is_scalar() -> bli_obj_is_1x1()
|
||||
|
||||
- Defined new macros to set and copy internal scalars between objects:
|
||||
|
||||
bli_obj_set_internal_scalar()
|
||||
bli_obj_copy_internal_scalar()
|
||||
|
||||
- In level-3 internal back-ends, added conditional blocks where alpha and
|
||||
beta are checked for non-unit-ness. Those values for alpha and beta are
|
||||
applied to the scalars attached to aliases of A/B/C, as appropriate,
|
||||
before being passed into the variant specified by the control tree.
|
||||
- In level-3 blocked variants, pass BLIS_ONE into subproblems instead of
|
||||
alpha and/or beta.
|
||||
- In level-3 macro-kernels, changed how scalars are obtained. Now, scalars
|
||||
attached to A and B are multiplied together to obtain alpha, while beta
|
||||
is obtained directly from C.
|
||||
- In level-3 front-ends, removed old function calls meant to provide
|
||||
future support for mixed domain/precision. These can be added back later
|
||||
once that functionality is given proper treatment. Also, removed the
|
||||
creation of copy-casts of alpha and beta since typecasting of scalars
|
||||
is now implicitly handled in the internal back-ends when alpha and
|
||||
beta are applied to the attached scalars.
|
||||
|
||||
commit 992de486d6f23e69a623abd15ae77d7881d13871
|
||||
Merge: 9552e6e fd4ac63
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Dec 2 13:58:46 2013 -0600
|
||||
|
||||
Unimplemented kernels now call reference.
|
||||
|
||||
Details:
|
||||
- Updated arm, bgq, loongson3a, and x86_64 kernels so that unimplemented
|
||||
datatypes call the corresponding reference kernel. Previously, these
|
||||
kernel functions called abort() with a "not yet implemented" error
|
||||
message.
|
||||
|
||||
commit fd4ac636d9a55cec1476a444bd4e70def219dc8f
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Dec 2 13:50:36 2013 -0600
|
||||
|
||||
Unimplemented kernels now call reference.
|
||||
|
||||
Details:
|
||||
- Updated micro-kernels for arm, bgq, loongson3a, and x86_64 so that
|
||||
unimplemented kernel functions simply call the corresponding reference
|
||||
implementation. (Previously, these unimplemented functions would
|
||||
abort() with a "not yet implemented" message.)
|
||||
|
||||
commit 9552e6ee824d4345d5e908e869e071d19829819a
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Sun Nov 24 11:40:31 2013 -0600
|
||||
|
||||
Removed optional scaling from packm control tree.
|
||||
|
||||
Details:
|
||||
- Removed does_scale field from packm control tree node and
|
||||
bli_packm_cntl_obj_create() interface. Adjusted all invocations of
|
||||
_cntl_obj_create() accordingly.
|
||||
- Redefined/renamed macros that are used in aliasing so that now,
|
||||
bli_obj_alias_to() does a full alias (shallow copy) while
|
||||
bli_obj_alias_for_packing() does a partial alias that preserves the
|
||||
pack_mem-related fields of the aliasing (destination) object.
|
||||
- Removed bli_trmm3_cntl.c, .h after realizing that the trmm control tree
|
||||
will work just fine for bli_trmm3().
|
||||
- Removed some commented vestiges of the typecasting functionality needed
|
||||
to support heterogeneous datatypes.
|
||||
|
||||
commit e65c476284db9ef64b23191a21c2584b1083342f
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Tue Nov 19 10:05:35 2013 -0600
|
||||
|
||||
Minor updates to packm_blk_var2.c and _blk_var3.c.
|
||||
|
||||
Details:
|
||||
- Comment updates to packm_blk_var2.c and packm_blk_var3.c.
|
||||
- In packm_blk_var2(), call setm_unb_var1(), scal2m_unb_var1() directly
|
||||
instead of setm(), scal2m().
|
||||
|
||||
commit 9e1d0d4bca48eda54301d8976f203e2544c9df3a
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 18 18:11:07 2013 -0600
|
||||
|
||||
Added trsm_l, trsm_u ukernels for x86_64/core2.
|
||||
|
||||
Details:
|
||||
- Added standalone trsm_l/trsm_u micro-kernels for x86_64 (core2).
|
||||
These kernels are based on the gemmtrsm_l/gemmtrsm_u micro-kernels
|
||||
that already existed in kernels/x86_64/core2-sse3/3.
|
||||
|
||||
commit 85e7e02ea3a9190b6fcff5d46b00d41c79cb1242
|
||||
Merge: 67761e2 7072005
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 18 12:02:00 2013 -0600
|
||||
|
||||
Merge branch 'master'. Forgot to git-pull.
|
||||
|
||||
commit 67761e224c92500eecf9c1540cc72bdd2fb27679
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 18 11:57:40 2013 -0600
|
||||
|
||||
Attempting to fix errors in bgq build.
|
||||
|
||||
Details:
|
||||
- Removed restrict declaration from b_cast and c_cast from
|
||||
bli_trsm_lu_ker_var2.c and bli_trsm_rl_ker_var2.c. Curiously, they
|
||||
are causing problems for xlc only in those two files and no other
|
||||
macro-kernels.
|
||||
- Fixed (hopefully) kernel function parameter type declarations in
|
||||
kernels/bgq/1f/bli_axpyf_opt_var1.c and kernels/bgq/3/bli_gemm_8x8.c.
|
||||
|
||||
commit 707200541d344f98cf34c9801954dbb36fbe0447
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 18 11:17:31 2013 -0600
|
||||
|
||||
Syntax error fix in x86_64/core2 gemmtrsm_u ukr.
|
||||
|
||||
commit bbe2b84a49e7785d4d0c514cda34adfbe66478b0
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 18 11:11:06 2013 -0600
|
||||
|
||||
Updated Makefile in test, testsuite.
|
||||
|
||||
Details:
|
||||
- Updated Makefiles in test and testsuite directories to use the new
|
||||
BLIS header installation directory scheme, which is to compile with
|
||||
-I<PREFIX>/include/blis instead of -I<PREFIX>/include.
|
||||
|
||||
commit 9bd7fcfd436625ca2108128086671319362f4d92
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 18 10:58:09 2013 -0600
|
||||
|
||||
Outer-to-inner 'restrict' fix in macro-kernels.
|
||||
|
||||
Details:
|
||||
- Fixed sloppy placement of 'restrict' pointer declarations in level-3
|
||||
macro-kernels. Previously, all restricted pointers were being declared
|
||||
at the outer-most function scope level. While this violates the C99
|
||||
standard, very few of the compilers used with BLIS so far have seemed
|
||||
to care. The lone exception has been IBM's xlc. Thanks to Tyler Smith
|
||||
for identifying this bug (and suggesting the fix).
|
||||
|
||||
commit 50549a6a31dd26cf63a013e0ede16b2c7ce835b6
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Sun Nov 17 18:31:27 2013 -0600
|
||||
|
||||
Changed header install directory to include/blis.
|
||||
|
||||
Details:
|
||||
- Changed top-level Makefile so that headers are installed to
|
||||
$(INSTALL_PREFIX)/include/blis/. (Header directories are no longer
|
||||
named by version/configuration and then symlinked.)
|
||||
- Added uninstall targets, including uninstall-old to clean out old
|
||||
library archives.
|
||||
- Added GREP makefile definitions to all configurations' make_defs.mk.
|
||||
|
||||
commit d70733abddfb9a95661897e1e4f3c1f3cfa7cbaa
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Sat Nov 16 17:34:25 2013 -0600
|
||||
|
||||
Added ARM kernels, configurations.
|
||||
|
||||
Details:
|
||||
- Added kernels for ARM, and configurations for Cortex-A9 and Cortex-A15.
|
||||
Thanks to Francisco Igual for contributing these kernels and
|
||||
configurations.
|
||||
|
||||
commit d37c2cff62089c86983c2f79762f4b5329037373
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Wed Nov 13 10:47:11 2013 -0600
|
||||
|
||||
Minor comment and Makefile changes.
|
||||
|
||||
Details:
|
||||
- Added missing 'check-config' and 'check-make-defs' targets to
|
||||
testsuite/Makefile.
|
||||
- Removed unused 'test' target from top-level Makefile.
|
||||
- Comment changes to testsuite input files.
|
||||
|
||||
commit 19885f893a17b91ee79bead0620d0f913392d4c5
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 11 12:09:21 2013 -0600
|
||||
|
||||
Updated some kernel comment headers.
|
||||
|
||||
Details:
|
||||
- Updated bgq and piledriver comment headers to use BLIS copyright header
|
||||
instead of libflame.
|
||||
|
||||
commit 1a4d698f42981d74fe5f29b980031e1ee7dc42d5
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Mon Nov 11 10:15:40 2013 -0600
|
||||
|
||||
CHANGELOG update (for 0.1.0).
|
||||
|
||||
commit 089048d5895a30221b6b1976c9be93ad6443420d (tag: 0.1.0)
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Sat Nov 9 17:18:00 2013 -0600
|
||||
|
||||
|
||||
22
Makefile
22
Makefile
@@ -317,18 +317,26 @@ CFLAGS_KERNELS := $(CFLAGS_KERNELS) $(VERS_DEF)
|
||||
# Convert source file paths to object file paths by replacing the base source
|
||||
# directories with the base object directories, and also replacing the source
|
||||
# file suffix (eg: '.c') with '.o'.
|
||||
MK_BLIS_CONFIG_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
MK_BLIS_FRAME_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_FRAME_SRC)))
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_FRAME_NOOPT_SRC)))
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(FRAME_PATH)/%.c, $(BASE_OBJ_FRAME_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_FRAME_KERNELS_SRC)))
|
||||
|
||||
MK_BLIS_FRAME_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
MK_BLIS_CONFIG_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_SRC)))
|
||||
MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_SRC)))
|
||||
MK_BLIS_FRAME_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_NOOPT_SRC)))
|
||||
MK_BLIS_CONFIG_NOOPT_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_NOOPT_SRC)))
|
||||
MK_BLIS_FRAME_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS := $(patsubst $(CONFIG_PATH)/%.S, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.S, $(MK_CONFIG_KERNELS_SRC)))
|
||||
MK_BLIS_CONFIG_KERNELS_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/%.o, \
|
||||
$(filter %.c, $(MK_CONFIG_KERNELS_SRC)))
|
||||
|
||||
# Combine all of the object files into some readily-accessible variables.
|
||||
@@ -427,7 +435,7 @@ else
|
||||
@$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
|
||||
endif
|
||||
|
||||
$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.c $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
|
||||
$(BASE_OBJ_CONFIG_PATH)/%.o: $(CONFIG_PATH)/%.[cS] $(MK_HEADER_FILES) $(MAKE_DEFS_MK_PATH)
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(CC) $(call get_cflags_for_obj,$@) -c $< -o $@
|
||||
else
|
||||
|
||||
165
config/armv7a/bli_config.h
Normal file
165
config/armv7a/bli_config.h
Normal file
@@ -0,0 +1,165 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
// -- INTEGER PROPERTIES -------------------------------------------------------
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions,
|
||||
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
|
||||
// integers while 64 results in 64-bit integers. Any other value results in use
|
||||
// of the C99 type "long int". Note that this ONLY affects integers used
|
||||
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
|
||||
// interface.
|
||||
#define BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
|
||||
|
||||
// -- FLOATING-POINT PROPERTIES ------------------------------------------------
|
||||
|
||||
// Define the number of floating-point types supported, and the size of the
|
||||
// largest type.
|
||||
#define BLIS_NUM_FP_TYPES 4
|
||||
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)
|
||||
|
||||
// Enable use of built-in C99 "float complex" and "double complex" types and
|
||||
// associated overloaded operations and functions? Disabling results in
|
||||
// scomplex and dcomplex being defined in terms of simple structs.
|
||||
//#define BLIS_ENABLE_C99_COMPLEX
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 1
|
||||
|
||||
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
// -- Contiguous (static) memory allocator --
|
||||
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
// memory pools so that the micro-kernel, when computing with the end of the
|
||||
// last block, can exceed the bounds of the usable portion of the memory
|
||||
// region without causing a segmentation fault.
|
||||
#define BLIS_MAX_PRELOAD_BYTE_OFFSET 128
|
||||
|
||||
// -- Memory alignment --
|
||||
|
||||
// It is sometimes useful to define the various memory alignments in terms
|
||||
// of some other characteristics of the system, such as the cache line size
|
||||
// and the page size.
|
||||
#define BLIS_CACHE_LINE_SIZE 32
|
||||
#define BLIS_PAGE_SIZE 4096
|
||||
|
||||
// Alignment size needed by the instruction set for aligned SIMD/vector
|
||||
// instructions.
|
||||
#define BLIS_SIMD_ALIGN_SIZE 32
|
||||
|
||||
// Alignment size used to align local stack buffers within macro-kernel
|
||||
// functions.
|
||||
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
// Alignment size used when allocating entire blocks of contiguous memory
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
// Basic (homogeneous) datatype support always enabled.
|
||||
|
||||
// Enable mixed domain operations?
|
||||
//#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
|
||||
// Enable extra mixed precision operations?
|
||||
//#define BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
|
||||
|
||||
|
||||
// -- MISCELLANEOUS OPTIONS ----------------------------------------------------
|
||||
|
||||
// Stay initialized after auto-initialization, unless and until the user
|
||||
// explicitly calls bli_finalize().
|
||||
#define BLIS_ENABLE_STAY_AUTO_INITIALIZED
|
||||
|
||||
|
||||
|
||||
// -- BLAS-to-BLIS COMPATIBILITY LAYER -----------------------------------------
|
||||
|
||||
// Enable the BLAS compatibility layer?
|
||||
#define BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
// The bit size of the integer type used to track values such as dimensions and
|
||||
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
|
||||
// A value of 32 results in the compatibility layer using 32-bit signed integers
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
216
config/armv7a/bli_kernel.h
Normal file
216
config/armv7a/bli_kernel.h
Normal file
@@ -0,0 +1,216 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 432
|
||||
#define BLIS_DEFAULT_KC_S 352
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 192
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 2
|
||||
#define BLIS_DEFAULT_NR_C 2
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 2
|
||||
#define BLIS_DEFAULT_NR_Z 2
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
// -- copyv --
|
||||
|
||||
// -- dotv --
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
// -- invertv --
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
// -- scalv --
|
||||
|
||||
// -- setv --
|
||||
|
||||
// -- subv --
|
||||
|
||||
// -- swapv --
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
1
config/armv7a/kernels
Symbolic link
1
config/armv7a/kernels
Symbolic link
@@ -0,0 +1 @@
|
||||
../../kernels/armv7a
|
||||
108
config/armv7a/make_defs.mk
Normal file
108
config/armv7a/make_defs.mk
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
# Only include this block of code once.
|
||||
ifndef MAKE_DEFS_MK_INCLUDED
|
||||
MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := gcc
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := #-msse3 # -mfpmath=sse
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS := $(CDBGFLAGS) $(COPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_KERNELS := $(CDBGFLAGS) $(CKOPTFLAGS) $(CVECFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
|
||||
endif
|
||||
@@ -36,6 +36,9 @@
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
#undef restrict
|
||||
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
@@ -118,10 +121,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 32
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -76,35 +76,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -122,10 +94,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -133,48 +129,22 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
@@ -182,44 +152,26 @@
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
#define BLIS_L1F_FUSE_FAC_S 8
|
||||
#define BLIS_L1F_FUSE_FAC_D 4
|
||||
#define BLIS_L1F_FUSE_FAC_C 4
|
||||
#define BLIS_L1F_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -229,16 +181,11 @@
|
||||
|
||||
#include "bli_gemm_8x8.h"
|
||||
|
||||
#define GEMM_UKERNEL gemm_8x8
|
||||
#define GEMM_UKERNEL_MT gemm_8x8_mt
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_8x8
|
||||
#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -246,25 +193,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -272,25 +202,16 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#include "bli_axpyf_opt_var1.h"
|
||||
|
||||
#define AXPYF_KERNEL axpyf_opt_var1
|
||||
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -298,52 +219,30 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#include "bli_axpyv_opt_var1.h"
|
||||
|
||||
#define AXPYV_KERNEL axpyv_opt_var1
|
||||
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#include "bli_dotv_opt_var1.h"
|
||||
|
||||
#define DOTV_KERNEL dotv_opt_var1
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,27 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,16 +151,11 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_4x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -238,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -264,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -288,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,27 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,16 +151,11 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_4x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -238,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -264,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -288,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
// The maximum number of BLIS threads that will run concurrently.
|
||||
#define BLIS_MAX_NUM_THREADS 1
|
||||
#define BLIS_MAX_NUM_THREADS 24
|
||||
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
// The number of MC x KC, KC x NC, and MC x NC blocks to reserve in the
|
||||
// contiguous memory pools.
|
||||
#define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS 1
|
||||
#define BLIS_NUM_KC_X_NC_BLOCKS 4
|
||||
#define BLIS_NUM_MC_X_NC_BLOCKS 0
|
||||
|
||||
// The maximum preload byte offset is used to pad the end of the contiguous
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -55,20 +55,63 @@
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 768
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
#define BLIS_DEFAULT_KC_S 384
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 384
|
||||
#define BLIS_DEFAULT_KC_D 384
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
//#define BLIS_DEFAULT_MC_C 384
|
||||
//#define BLIS_DEFAULT_KC_C 384
|
||||
//#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
//#define BLIS_DEFAULT_MC_Z 192
|
||||
//#define BLIS_DEFAULT_KC_Z 384
|
||||
//#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// NOTE: If 4m blocksizes are not defined here, they will be determined
|
||||
// from the corresponding real domain blocksizes.
|
||||
#define BLIS_DEFAULT_4M_MC_C 384
|
||||
#define BLIS_DEFAULT_4M_KC_C 512
|
||||
#define BLIS_DEFAULT_4M_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_4M_MC_Z 192
|
||||
#define BLIS_DEFAULT_4M_KC_Z 256
|
||||
#define BLIS_DEFAULT_4M_NC_Z 4096
|
||||
|
||||
// NOTE: If 3m blocksizes are not defined here, they will be determined
|
||||
// from the corresponding real domain blocksizes.
|
||||
#define BLIS_DEFAULT_3M_MC_C 384
|
||||
#define BLIS_DEFAULT_3M_KC_C 512
|
||||
#define BLIS_DEFAULT_3M_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_3M_MC_Z 192
|
||||
#define BLIS_DEFAULT_3M_KC_Z 256
|
||||
#define BLIS_DEFAULT_3M_NC_Z 4096
|
||||
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 4
|
||||
#define BLIS_DEFAULT_NR_C 2
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 2
|
||||
#define BLIS_DEFAULT_NR_Z 2
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
@@ -78,48 +121,21 @@
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
//#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 4
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 4
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,234 +143,99 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_d4x4.h"
|
||||
|
||||
#include "bli_gemmtrsm_l_opt_d4x4.h"
|
||||
#include "bli_gemmtrsm_u_opt_d4x4.h"
|
||||
//#include "bli_gemmtrsm_l_ref_mxn.h"
|
||||
//#include "bli_gemmtrsm_u_ref_mxn.h"
|
||||
|
||||
//#include "bli_trsm_l_ref_4x4.h"
|
||||
//#include "bli_trsm_u_ref_4x4.h"
|
||||
#include "bli_trsm_l_ref_mxn.h"
|
||||
#include "bli_trsm_u_ref_mxn.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_d4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x4
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x4
|
||||
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_4x4
|
||||
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_4x4
|
||||
|
||||
//#define TRSM_L_UKERNEL trsm_l_ref_4x4
|
||||
//#define TRSM_U_UKERNEL trsm_u_ref_4x4
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
#include "bli_axpy2v_opt_var1.h"
|
||||
#include "bli_dotaxpyv_opt_var1.h"
|
||||
#include "bli_axpyf_opt_var1.h"
|
||||
#include "bli_dotxf_opt_var1.h"
|
||||
#include "bli_dotxaxpyf_opt_var1.h"
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_opt_var1
|
||||
#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
|
||||
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_opt_var1
|
||||
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_opt_var1
|
||||
#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
|
||||
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
#include "bli_axpyv_opt_var1.h"
|
||||
#include "bli_dotv_opt_var1.h"
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_opt_var1
|
||||
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_opt_var1
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -80,10 +80,10 @@ CC := gcc
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CMISCFLAGS := -std=c99 -fopenmp #-pg
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2 -mfpmath=sse #-fomit-frame-pointer
|
||||
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
CKOPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
CVECFLAGS := -msse3 -march=native
|
||||
|
||||
@@ -100,7 +100,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := -lm
|
||||
LDFLAGS := -lm -fopenmp
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,111 +123,39 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_d4x4.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_d4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_d4x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D (BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D (BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,28 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 2
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,17 +152,10 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_30x8.h"
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_30x8
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_CACHE_LINE_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,28 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,17 +152,10 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_4x6.h"
|
||||
|
||||
#define GEMM_UKERNEL gemm_4x6
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_4x6
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -116,10 +116,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 16
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,93 +123,28 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
@@ -221,18 +152,12 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
//#define GEMM_UKERNEL gemm_ref_mxn
|
||||
|
||||
#include "bli_gemm_opt_8x4.h"
|
||||
#define GEMM_UKERNEL gemm_opt_8x4
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -240,25 +165,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -266,23 +174,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -290,48 +189,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -35,300 +35,8 @@
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 64
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 64
|
||||
#define BLIS_DEFAULT_KC_D 128
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 8
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 8
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
// NOTE: These register blocksize "extensions" determine whether the
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_ref_mxn
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
// In the reference configuration, we let all of the defaults take
|
||||
// effect. Thus, no definitions are needed.
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
@@ -70,35 +70,7 @@
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +88,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
//#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
//#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
//#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
//#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,111 +123,39 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
// NOTE: These values determine high-level cache blocking for level-2
|
||||
// operations ONLY. So, if gemv is performed with a 2000x2000 matrix A and
|
||||
// MC = NC = 1000, then a total of four unblocked (or unblocked fused)
|
||||
// gemv subproblems are called. The blocked algorithms are only useful in
|
||||
// that they provide the opportunity for packing vectors. (Matrices can also
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
#include "bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_8x4_ref_u4_nodupl_avx1
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4_ref_u4_nodupl_avx1
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_ref_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_ref_mxn
|
||||
|
||||
|
||||
|
||||
@@ -239,25 +163,8 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +172,14 @@
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_unb_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_unb_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_unb_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_unb_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_unb_var1
|
||||
|
||||
|
||||
|
||||
@@ -289,48 +187,26 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -118,10 +118,6 @@
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
// Alignment size used when sizing strides (eg: of packed micro-panels)
|
||||
// within a block of contiguous memory.
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
|
||||
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
|
||||
@@ -38,9 +38,8 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Default cache blocksizes --
|
||||
// -- Cache blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
@@ -52,53 +51,24 @@
|
||||
// (3) KC must be a multiple of
|
||||
// (a) MR and
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 64
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
#define BLIS_DEFAULT_MC_S 128
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 2048
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 64
|
||||
#define BLIS_DEFAULT_KC_D 128
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
#define BLIS_DEFAULT_MC_D 128
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 2048
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 2048
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 128
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
#define BLIS_DEFAULT_MC_Z 128
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
#define BLIS_EXTEND_MC_S 0 //(BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_EXTEND_KC_S 0 //(BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_EXTEND_NC_S 0 //(BLIS_DEFAULT_NC_S/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_D 0 //(BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_EXTEND_KC_D 0 //(BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_EXTEND_NC_D 0 //(BLIS_DEFAULT_NC_D/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_C 0 //(BLIS_DEFAULT_MC_C/4)
|
||||
#define BLIS_EXTEND_KC_C 0 //(BLIS_DEFAULT_KC_C/4)
|
||||
#define BLIS_EXTEND_NC_C 0 //(BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
#define BLIS_EXTEND_MC_Z 0 //(BLIS_DEFAULT_MC_Z/4)
|
||||
#define BLIS_EXTEND_KC_Z 0 //(BLIS_DEFAULT_KC_Z/4)
|
||||
#define BLIS_EXTEND_NC_Z 0 //(BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
// -- Default register blocksizes for micro-kernel --
|
||||
|
||||
// NOTE: When using the reference configuration, these register blocksizes
|
||||
// in the m and n dimensions should all be equal to the size expected by
|
||||
// the reference micro-kernel(s).
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
@@ -116,10 +86,34 @@
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
#define BLIS_DEFAULT_KR_S 1
|
||||
#define BLIS_DEFAULT_KR_D 1
|
||||
#define BLIS_DEFAULT_KR_C 1
|
||||
#define BLIS_DEFAULT_KR_Z 1
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// non-zero, blocksizes used at edge cases are extended (enlarged) if
|
||||
// such an extension would encompass the remaining portion of the
|
||||
// matrix dimension.
|
||||
|
||||
//#define BLIS_EXTEND_MC_S 0
|
||||
//#define BLIS_EXTEND_KC_S 0
|
||||
//#define BLIS_EXTEND_NC_S 0
|
||||
|
||||
//#define BLIS_EXTEND_MC_D 0
|
||||
//#define BLIS_EXTEND_KC_D 0
|
||||
//#define BLIS_EXTEND_NC_D 0
|
||||
|
||||
//#define BLIS_EXTEND_MC_C 0
|
||||
//#define BLIS_EXTEND_KC_C 0
|
||||
//#define BLIS_EXTEND_NC_C 0
|
||||
|
||||
//#define BLIS_EXTEND_MC_Z 0
|
||||
//#define BLIS_EXTEND_KC_Z 0
|
||||
//#define BLIS_EXTEND_NC_Z 0
|
||||
|
||||
// -- Register blocksize extensions (for packed micro-panels) --
|
||||
|
||||
@@ -127,24 +121,52 @@
|
||||
// leading dimensions used within the packed micro-panels are equal to
|
||||
// or greater than their corresponding register blocksizes above.
|
||||
|
||||
#define BLIS_EXTEND_MR_S 0
|
||||
#define BLIS_EXTEND_NR_S 0
|
||||
//#define BLIS_EXTEND_MR_S 0
|
||||
//#define BLIS_EXTEND_NR_S 0
|
||||
|
||||
#define BLIS_EXTEND_MR_D 0
|
||||
#define BLIS_EXTEND_NR_D 0
|
||||
//#define BLIS_EXTEND_MR_D 0
|
||||
//#define BLIS_EXTEND_NR_D 0
|
||||
|
||||
#define BLIS_EXTEND_MR_C 0
|
||||
#define BLIS_EXTEND_NR_C 0
|
||||
//#define BLIS_EXTEND_MR_C 0
|
||||
//#define BLIS_EXTEND_NR_C 0
|
||||
|
||||
#define BLIS_EXTEND_MR_Z 0
|
||||
#define BLIS_EXTEND_NR_Z 0
|
||||
//#define BLIS_EXTEND_MR_Z 0
|
||||
//#define BLIS_EXTEND_NR_Z 0
|
||||
|
||||
// Register blocksize extensions in the k dimension are not used.
|
||||
|
||||
#define BLIS_EXTEND_KR_S 0
|
||||
#define BLIS_EXTEND_KR_D 0
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNELS ---------------------------------------------------
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_mxn
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_mxn
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_mxn
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_mxn
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_opt_mxn
|
||||
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_opt_mxn
|
||||
#define BLIS_CGEMMTRSM_L_UKERNEL bli_cgemmtrsm_l_opt_mxn
|
||||
#define BLIS_ZGEMMTRSM_L_UKERNEL bli_zgemmtrsm_l_opt_mxn
|
||||
|
||||
#define BLIS_SGEMMTRSM_U_UKERNEL bli_sgemmtrsm_u_opt_mxn
|
||||
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_opt_mxn
|
||||
#define BLIS_CGEMMTRSM_U_UKERNEL bli_cgemmtrsm_u_opt_mxn
|
||||
#define BLIS_ZGEMMTRSM_U_UKERNEL bli_zgemmtrsm_u_opt_mxn
|
||||
|
||||
#define BLIS_STRSM_L_UKERNEL bli_strsm_l_opt_mxn
|
||||
#define BLIS_DTRSM_L_UKERNEL bli_dtrsm_l_opt_mxn
|
||||
#define BLIS_CTRSM_L_UKERNEL bli_ctrsm_l_opt_mxn
|
||||
#define BLIS_ZTRSM_L_UKERNEL bli_ztrsm_l_opt_mxn
|
||||
|
||||
#define BLIS_STRSM_U_UKERNEL bli_strsm_u_opt_mxn
|
||||
#define BLIS_DTRSM_U_UKERNEL bli_dtrsm_u_opt_mxn
|
||||
#define BLIS_CTRSM_U_UKERNEL bli_ctrsm_u_opt_mxn
|
||||
#define BLIS_ZTRSM_U_UKERNEL bli_ztrsm_u_opt_mxn
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -158,17 +180,18 @@
|
||||
// be packed here, but this tends to be much too expensive in practice to
|
||||
// actually employ.)
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
//#define BLIS_DEFAULT_L2_MC_S 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_S 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
//#define BLIS_DEFAULT_L2_MC_D 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_D 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
//#define BLIS_DEFAULT_L2_MC_C 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_C 1000
|
||||
|
||||
//#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
//#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
#define BLIS_DEFAULT_L2_MC_Z 1000
|
||||
#define BLIS_DEFAULT_L2_NC_Z 1000
|
||||
|
||||
|
||||
|
||||
@@ -176,66 +199,67 @@
|
||||
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
// NOTE: Default fusing factors are not used by the reference implementations
|
||||
// of level-1f operations. They are here only for use when these operations
|
||||
// are optimized.
|
||||
//#define BLIS_L1F_FUSE_FAC_S 8
|
||||
//#define BLIS_L1F_FUSE_FAC_D 4
|
||||
//#define BLIS_L1F_FUSE_FAC_C 4
|
||||
//#define BLIS_L1F_FUSE_FAC_Z 2
|
||||
|
||||
#define BLIS_DEFAULT_FUSE_FAC_S 8
|
||||
#define BLIS_DEFAULT_FUSE_FAC_D 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_C 4
|
||||
#define BLIS_DEFAULT_FUSE_FAC_Z 2
|
||||
//#define BLIS_AXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
//#define BLIS_AXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
//#define BLIS_AXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
//#define BLIS_AXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_AXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_AXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_AXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_AXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
//#define BLIS_DOTXF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
//#define BLIS_DOTXF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
//#define BLIS_DOTXF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
//#define BLIS_DOTXF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_DEFAULT_FUSE_FAC_S
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_DEFAULT_FUSE_FAC_D
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_DEFAULT_FUSE_FAC_C
|
||||
#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_DEFAULT_FUSE_FAC_Z
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_S BLIS_L1F_FUSE_FAC_S
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_D BLIS_L1F_FUSE_FAC_D
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_C BLIS_L1F_FUSE_FAC_C
|
||||
//#define BLIS_DOTXAXPYF_FUSE_FAC_Z BLIS_L1F_FUSE_FAC_Z
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
// -- Default register blocksizes for vectors --
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// NOTE: Register blocksizes for vectors are used when packing
|
||||
// non-contiguous vectors. Similar to that of KR, they can
|
||||
// typically be set to 1.
|
||||
// -- axpy2v --
|
||||
|
||||
#define BLIS_DEFAULT_VR_S 1
|
||||
#define BLIS_DEFAULT_VR_D 1
|
||||
#define BLIS_DEFAULT_VR_C 1
|
||||
#define BLIS_DEFAULT_VR_Z 1
|
||||
#define BLIS_SAXPY2V_KERNEL bli_saxpy2v_opt_var1
|
||||
#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_opt_var1
|
||||
#define BLIS_CAXPY2V_KERNEL bli_caxpy2v_opt_var1
|
||||
#define BLIS_ZAXPY2V_KERNEL bli_zaxpy2v_opt_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define BLIS_SDOTAXPYV_KERNEL bli_sdotaxpyv_opt_var1
|
||||
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_opt_var1
|
||||
#define BLIS_CDOTAXPYV_KERNEL bli_cdotaxpyv_opt_var1
|
||||
#define BLIS_ZDOTAXPYV_KERNEL bli_zdotaxpyv_opt_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define BLIS_SAXPYF_KERNEL bli_saxpyf_opt_var1
|
||||
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
|
||||
#define BLIS_CAXPYF_KERNEL bli_caxpyf_opt_var1
|
||||
#define BLIS_ZAXPYF_KERNEL bli_zaxpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define BLIS_SDOTXF_KERNEL bli_sdotxf_opt_var1
|
||||
#define BLIS_DDOTXF_KERNEL bli_ddotxf_opt_var1
|
||||
#define BLIS_CDOTXF_KERNEL bli_cdotxf_opt_var1
|
||||
#define BLIS_ZDOTXF_KERNEL bli_zdotxf_opt_var1
|
||||
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
#define BLIS_SDOTXAXPYF_KERNEL bli_sdotxaxpyf_opt_var1
|
||||
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_opt_var1
|
||||
#define BLIS_CDOTXAXPYF_KERNEL bli_cdotxaxpyf_opt_var1
|
||||
#define BLIS_ZDOTXAXPYF_KERNEL bli_zdotxaxpyf_opt_var1
|
||||
|
||||
#include "bli_gemm_opt_mxn.h"
|
||||
#include "bli_trsm_l_opt_mxn.h"
|
||||
#include "bli_trsm_u_opt_mxn.h"
|
||||
#include "bli_gemmtrsm_l_opt_mxn.h"
|
||||
#include "bli_gemmtrsm_u_opt_mxn.h"
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_mxn
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_mxn
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_mxn
|
||||
|
||||
#define TRSM_L_UKERNEL trsm_l_opt_mxn
|
||||
#define TRSM_U_UKERNEL trsm_u_opt_mxn
|
||||
|
||||
|
||||
|
||||
@@ -243,55 +267,30 @@
|
||||
|
||||
// -- packm --
|
||||
|
||||
#define PACKM_2XK_KERNEL packm_ref_2xk
|
||||
#define PACKM_4XK_KERNEL packm_ref_4xk
|
||||
#define PACKM_6XK_KERNEL packm_ref_6xk
|
||||
#define PACKM_8XK_KERNEL packm_ref_8xk
|
||||
#define PACKM_10XK_KERNEL packm_ref_10xk
|
||||
#define PACKM_12XK_KERNEL packm_ref_12xk
|
||||
#define PACKM_14XK_KERNEL packm_ref_14xk
|
||||
#define PACKM_16XK_KERNEL packm_ref_16xk
|
||||
//#define BLIS_SPACKM_2XK_KERNEL bli_spackm_ref_2xk
|
||||
//#define BLIS_DPACKM_2XK_KERNEL bli_dpackm_ref_2xk
|
||||
//#define BLIS_CPACKM_2XK_KERNEL bli_cpackm_ref_2xk
|
||||
//#define BLIS_ZPACKM_2XK_KERNEL bli_zpackm_ref_2xk
|
||||
|
||||
// -- unpackm --
|
||||
//#define BLIS_SPACKM_4XK_KERNEL bli_spackm_ref_4xk
|
||||
//#define BLIS_DPACKM_4XK_KERNEL bli_dpackm_ref_4xk
|
||||
//#define BLIS_CPACKM_4XK_KERNEL bli_cpackm_ref_4xk
|
||||
//#define BLIS_ZPACKM_4XK_KERNEL bli_zpackm_ref_4xk
|
||||
|
||||
#define UNPACKM_2XK_KERNEL unpackm_ref_2xk
|
||||
#define UNPACKM_4XK_KERNEL unpackm_ref_4xk
|
||||
#define UNPACKM_6XK_KERNEL unpackm_ref_6xk
|
||||
#define UNPACKM_8XK_KERNEL unpackm_ref_8xk
|
||||
#define UNPACKM_10XK_KERNEL unpackm_ref_10xk
|
||||
#define UNPACKM_12XK_KERNEL unpackm_ref_12xk
|
||||
#define UNPACKM_14XK_KERNEL unpackm_ref_14xk
|
||||
#define UNPACKM_16XK_KERNEL unpackm_ref_16xk
|
||||
//#define BLIS_SPACKM_6XK_KERNEL bli_spackm_ref_6xk
|
||||
//#define BLIS_DPACKM_6XK_KERNEL bli_dpackm_ref_6xk
|
||||
//#define BLIS_CPACKM_6XK_KERNEL bli_cpackm_ref_6xk
|
||||
//#define BLIS_ZPACKM_6XK_KERNEL bli_zpackm_ref_6xk
|
||||
|
||||
//#define BLIS_SPACKM_8XK_KERNEL bli_spackm_ref_8xk
|
||||
//#define BLIS_DPACKM_8XK_KERNEL bli_dpackm_ref_8xk
|
||||
//#define BLIS_CPACKM_8XK_KERNEL bli_cpackm_ref_8xk
|
||||
//#define BLIS_ZPACKM_8XK_KERNEL bli_zpackm_ref_8xk
|
||||
|
||||
// ...
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
// (Commented definitions for 10, 12, 14, and 16 not shown).
|
||||
|
||||
#include "bli_axpy2v_opt_var1.h"
|
||||
#include "bli_dotaxpyv_opt_var1.h"
|
||||
#include "bli_axpyf_opt_var1.h"
|
||||
#include "bli_dotxf_opt_var1.h"
|
||||
#include "bli_dotxaxpyf_opt_var1.h"
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
#define AXPY2V_KERNEL axpy2v_opt_var1
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
#define DOTAXPYV_KERNEL dotaxpyv_opt_var1
|
||||
|
||||
// -- axpyf --
|
||||
|
||||
#define AXPYF_KERNEL axpyf_opt_var1
|
||||
|
||||
// -- dotxf --
|
||||
|
||||
#define DOTXF_KERNEL dotxf_opt_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
#define DOTXAXPYF_KERNEL dotxaxpyf_opt_var1
|
||||
|
||||
|
||||
|
||||
@@ -299,47 +298,81 @@
|
||||
|
||||
// -- addv --
|
||||
|
||||
#define ADDV_KERNEL addv_unb_var1
|
||||
//#define BLIS_SADDV_KERNEL bli_saddv_unb_var1
|
||||
//#define BLIS_DADDV_KERNEL bli_daddv_unb_var1
|
||||
//#define BLIS_CADDV_KERNEL bli_caddv_unb_var1
|
||||
//#define BLIS_ZADDV_KERNEL bli_zaddv_unb_var1
|
||||
|
||||
// -- axpyv --
|
||||
|
||||
#define AXPYV_KERNEL axpyv_unb_var1
|
||||
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt_var1
|
||||
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
|
||||
#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt_var1
|
||||
#define BLIS_ZAXPYV_KERNEL bli_zaxpyv_opt_var1
|
||||
|
||||
// -- copyv --
|
||||
|
||||
#define COPYV_KERNEL copyv_unb_var1
|
||||
//#define BLIS_SCOPYV_KERNEL bli_scopyv_unb_var1
|
||||
//#define BLIS_DCOPYV_KERNEL bli_dcopyv_unb_var1
|
||||
//#define BLIS_CCOPYV_KERNEL bli_ccopyv_unb_var1
|
||||
//#define BLIS_ZCOPYV_KERNEL bli_zcopyv_unb_var1
|
||||
|
||||
// -- dotv --
|
||||
|
||||
#define DOTV_KERNEL dotv_unb_var1
|
||||
#define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
|
||||
#define BLIS_CDOTV_KERNEL bli_cdotv_opt_var1
|
||||
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt_var1
|
||||
|
||||
// -- dotxv --
|
||||
|
||||
#define DOTXV_KERNEL dotxv_unb_var1
|
||||
//#define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1
|
||||
//#define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1
|
||||
//#define BLIS_CDOTXV_KERNEL bli_cdotxv_unb_var1
|
||||
//#define BLIS_ZDOTXV_KERNEL bli_zdotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
#define INVERTV_KERNEL invertv_unb_var1
|
||||
//#define BLIS_SINVERTV_KERNEL bli_sinvertv_unb_var1
|
||||
//#define BLIS_DINVERTV_KERNEL bli_dinvertv_unb_var1
|
||||
//#define BLIS_CINVERTV_KERNEL bli_cinvertv_unb_var1
|
||||
//#define BLIS_ZINVERTV_KERNEL bli_zinvertv_unb_var1
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
#define SCAL2V_KERNEL scal2v_unb_var1
|
||||
//#define BLIS_SSCAL2V_KERNEL bli_sscal2v_unb_var1
|
||||
//#define BLIS_DSCAL2V_KERNEL bli_dscal2v_unb_var1
|
||||
//#define BLIS_CSCAL2V_KERNEL bli_cscal2v_unb_var1
|
||||
//#define BLIS_ZSCAL2V_KERNEL bli_zscal2v_unb_var1
|
||||
|
||||
// -- scalv --
|
||||
|
||||
#define SCALV_KERNEL scalv_unb_var1
|
||||
//#define BLIS_SSCALV_KERNEL bli_sscalv_unb_var1
|
||||
//#define BLIS_DSCALV_KERNEL bli_dscalv_unb_var1
|
||||
//#define BLIS_CSCALV_KERNEL bli_cscalv_unb_var1
|
||||
//#define BLIS_ZSCALV_KERNEL bli_zscalv_unb_var1
|
||||
|
||||
// -- setv --
|
||||
|
||||
#define SETV_KERNEL setv_unb_var1
|
||||
//#define BLIS_SSETV_KERNEL bli_ssetv_unb_var1
|
||||
//#define BLIS_DSETV_KERNEL bli_dsetv_unb_var1
|
||||
//#define BLIS_CSETV_KERNEL bli_csetv_unb_var1
|
||||
//#define BLIS_ZSETV_KERNEL bli_zsetv_unb_var1
|
||||
|
||||
// -- subv --
|
||||
|
||||
#define SUBV_KERNEL subv_unb_var1
|
||||
//#define BLIS_SSUBV_KERNEL bli_ssubv_unb_var1
|
||||
//#define BLIS_DSUBV_KERNEL bli_dsubv_unb_var1
|
||||
//#define BLIS_CSUBV_KERNEL bli_csubv_unb_var1
|
||||
//#define BLIS_ZSUBV_KERNEL bli_zsubv_unb_var1
|
||||
|
||||
// -- swapv --
|
||||
|
||||
#define SWAPV_KERNEL swapv_unb_var1
|
||||
//#define BLIS_SSWAPV_KERNEL bli_sswapv_unb_var1
|
||||
//#define BLIS_DSWAPV_KERNEL bli_dswapv_unb_var1
|
||||
//#define BLIS_CSWAPV_KERNEL bli_cswapv_unb_var1
|
||||
//#define BLIS_ZSWAPV_KERNEL bli_zswapv_unb_var1
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -36,59 +36,59 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy )
|
||||
void bli_saxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_SAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy )
|
||||
void bli_daxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_DAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy )
|
||||
void bli_caxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_CAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy )
|
||||
void bli_zaxpyv_opt_var1( conj_t conjx,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy )
|
||||
{
|
||||
/*
|
||||
Template axpyv kernel implementation
|
||||
@@ -193,11 +193,11 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_ZAXPYV_KERNEL_REF( conjx,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -272,37 +272,3 @@ void bli_zzzaxpyv_opt_var1( conj_t conjx,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,opname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(cha,chx,chy,varname)( conjx, \
|
||||
n, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_opt_var1, axpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_opt_var1, axpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,66 +36,66 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho )
|
||||
void bli_sdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_SDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho )
|
||||
void bli_ddotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_DDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho )
|
||||
void bli_cdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_CDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho )
|
||||
void bli_zdotv_opt_var1( conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho )
|
||||
{
|
||||
/*
|
||||
Template dotv kernel implementation
|
||||
@@ -210,12 +210,12 @@ void bli_zzzdotv_opt_var1( conj_t conjx,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
BLIS_ZDOTV_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -310,36 +310,3 @@ void bli_zzzdotv_opt_var1( conj_t conjx,
|
||||
bli_zzcopys( dotxy, *rho );
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,opname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(chx,chy,chr,varname)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( dotv_opt_var1, dotv_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( dotv_opt_var1, dotv_unb_var1 )
|
||||
#endif
|
||||
|
||||
@@ -36,88 +36,88 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha1,
|
||||
float* restrict alpha2,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz
|
||||
)
|
||||
void bli_saxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha1,
|
||||
float* restrict alpha2,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_SAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha1,
|
||||
double* restrict alpha2,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz
|
||||
)
|
||||
void bli_daxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha1,
|
||||
double* restrict alpha2,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_DAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha1,
|
||||
scomplex* restrict alpha2,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz
|
||||
)
|
||||
void bli_caxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha1,
|
||||
scomplex* restrict alpha2,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_CAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha1,
|
||||
dcomplex* restrict alpha2,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz
|
||||
)
|
||||
void bli_zaxpy2v_opt_var1(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha1,
|
||||
dcomplex* restrict alpha2,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template axpy2v kernel implementation
|
||||
@@ -229,14 +229,14 @@ void bli_zzzaxpy2v_opt_var1(
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_ZAXPY2V_KERNEL_REF( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha1,
|
||||
alpha2,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -396,41 +396,3 @@ void bli_zzzaxpy2v_opt_var1(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* restrict alpha1, \
|
||||
ctype_xy* restrict alpha2, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_z* restrict z, inc_t incz \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(chx,chy,chz,kername)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
alpha1, \
|
||||
alpha2, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
z, incz ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpy2v_opt_var1, axpy2v_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpy2v_opt_var1, axpy2v_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,87 +36,87 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
void bli_saxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_SAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dddaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
void bli_daxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_dddaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_DAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_caxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_CAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
void bli_zzzaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_zaxpyf_opt_var1(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template axpyf kernel implementation
|
||||
@@ -243,14 +243,14 @@ void bli_zzzaxpyf_opt_var1(
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzaxpyf_unb_var1( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
BLIS_ZAXPYF_KERNEL_REF( conja,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -376,41 +376,3 @@ void bli_zzzaxpyf_opt_var1(
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conja, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ax* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conja, \
|
||||
conjx, \
|
||||
m, \
|
||||
b_n, \
|
||||
alpha, \
|
||||
a, inca, lda, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( axpyf_opt_var1, axpyf_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( axpyf_opt_var1, axpyf_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,87 +36,87 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho,
|
||||
float* restrict z, inc_t incz )
|
||||
void bli_sdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho,
|
||||
float* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_SDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho,
|
||||
double* restrict z, inc_t incz )
|
||||
void bli_ddotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho,
|
||||
double* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_DDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
void bli_cdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict rho,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_CDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
void bli_zdotaxpyv_opt_var1( conj_t conjxt,
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict rho,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/*
|
||||
Template dotaxpyv kernel implementation
|
||||
@@ -240,15 +240,15 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
BLIS_ZDOTAXPYV_KERNEL_REF( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -429,42 +429,3 @@ void bli_zzzdotaxpyv_opt_var1( conj_t conjxt,
|
||||
bli_zzcopys( dotxy, *rho );
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with heterogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chz,varname)( \
|
||||
conj_t conjxt, \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_xy* restrict rho, \
|
||||
ctype_z* restrict z, inc_t incz \
|
||||
) \
|
||||
{ \
|
||||
/* Just call the reference implementation. */ \
|
||||
PASTEMAC3(chx,chy,chz,kername)( conjxt, \
|
||||
conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
rho, \
|
||||
z, incz ); \
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotaxpyv_opt_var1, dotaxpyv_unb_var1 )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -36,115 +36,115 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict w, inc_t incw,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz )
|
||||
void bli_sdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict w, inc_t incw,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_SDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict w, inc_t incw,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz )
|
||||
void bli_ddotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict w, inc_t incw,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_DDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict w, inc_t incw,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
void bli_cdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict w, inc_t incw,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy,
|
||||
scomplex* restrict z, inc_t incz )
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_CDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict w, inc_t incw,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
void bli_zdotxaxpyf_opt_var1( conj_t conjat,
|
||||
conj_t conja,
|
||||
conj_t conjw,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict w, inc_t incw,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy,
|
||||
dcomplex* restrict z, inc_t incz )
|
||||
|
||||
{
|
||||
/*
|
||||
@@ -289,19 +289,19 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotxaxpyf_unb_var1( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
BLIS_ZDOTXAXPYF_KERNEL_REF( conjat,
|
||||
conja,
|
||||
conjw,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
w, incw,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -560,51 +560,3 @@ void bli_zzzdotxaxpyf_opt_var1( conj_t conjat,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// Each instantiation of GENTFUNC3U12 defines the function named by varname
// for one (cha, chb, chc) type combination. The generated function does no
// work of its own: it forwards every argument, unchanged and in the same
// order, to the function named by kername for the same type combination.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname, kername ) \
\
void PASTEMAC3(cha,chb,chc,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conja, \
                                     conj_t             conjw, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_ab* restrict alpha, \
                                     ctype_a*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_b*  restrict w, inc_t incw, \
                                     ctype_b*  restrict x, inc_t incx, \
                                     ctype_c*  restrict beta, \
                                     ctype_c*  restrict y, inc_t incy, \
                                     ctype_c*  restrict z, inc_t incz \
                                   ) \
{ \
    /* Just call the reference implementation. */ \
    /* The callee name must be pasted from this macro's own type characters */ \
    /* (cha, chb, chc); chx and chy are not parameters of this macro. */ \
    PASTEMAC3(cha,chb,chc,kername)( conjat, \
                                    conja, \
                                    conjw, \
                                    conjx, \
                                    m, \
                                    b_n, \
                                    alpha, \
                                    a, inca, lda, \
                                    w, incw, \
                                    x, incx, \
                                    beta, \
                                    y, incy, \
                                    z, incz ); \
}

// The wrappers are only instantiated for the mixed-type combinations that
// have been enabled at configure time.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxaxpyf_opt_var1, dotxaxpyf_unb_var1 )
#endif
|
||||
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chb,chc,varname)( \
|
||||
conj_t conjat, \
|
||||
conj_t conja, \
|
||||
conj_t conjw, \
|
||||
conj_t conjx, \
|
||||
dim_t m, \
|
||||
dim_t b_n, \
|
||||
ctype_ab* restrict alpha, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_b* restrict w, inc_t incw, \
|
||||
ctype_b* restrict x, inc_t incx, \
|
||||
ctype_c* restrict beta, \
|
||||
ctype_c* restrict y, inc_t incy, \
|
||||
ctype_c* restrict z, inc_t incz \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxaxpyf_opt_var1 )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxaxpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxaxpyf_opt_var1 )
|
||||
#endif
|
||||
|
||||
@@ -36,95 +36,95 @@
|
||||
|
||||
|
||||
|
||||
void bli_sssdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
void bli_sdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sssdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_SDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ddddotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
void bli_ddotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ddddotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_DDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_cccdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_cdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a, inc_t inca, inc_t lda,
|
||||
scomplex* restrict x, inc_t incx,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cccdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_CDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_zzzdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
void bli_zdotxf_opt_var1(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a, inc_t inca, inc_t lda,
|
||||
dcomplex* restrict x, inc_t incx,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict y, inc_t incy
|
||||
)
|
||||
{
|
||||
/*
|
||||
Template dotxf kernel implementation
|
||||
@@ -265,15 +265,15 @@ void bli_zzzdotxf_opt_var1(
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_zzzdotxf_unb_var1( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
BLIS_ZDOTXF_KERNEL_REF( conjat,
|
||||
conjx,
|
||||
m,
|
||||
b_n,
|
||||
alpha,
|
||||
a, inca, lda,
|
||||
x, incx,
|
||||
beta,
|
||||
y, incy );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -414,43 +414,3 @@ void bli_zzzdotxf_opt_var1(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
// Define BLAS-like interfaces with heterogeneous-typed operands.
//
// Each instantiation of GENTFUNC3U12 defines the function named by varname
// for one (chx, chy, chr) type combination. The generated function forwards
// every argument, unchanged and in the same order, to the function named by
// kername for the same type combination.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chr,varname)( \
                                     conj_t             conjat, \
                                     conj_t             conjx, \
                                     dim_t              m, \
                                     dim_t              b_n, \
                                     ctype_xy* restrict alpha, \
                                     ctype_x*  restrict a, inc_t inca, inc_t lda, \
                                     ctype_y*  restrict x, inc_t incx, \
                                     ctype_r*  restrict beta, \
                                     ctype_r*  restrict y, inc_t incy \
                                   ) \
{ \
    /* Just call the reference implementation. */ \
    /* The callee name must be pasted from this macro's own type characters */ \
    /* (chx, chy, chr); cha is not a parameter of this macro. */ \
    PASTEMAC3(chx,chy,chr,kername)( conjat, \
                                    conjx, \
                                    m, \
                                    b_n, \
                                    alpha, \
                                    a, inca, lda, \
                                    x, incx, \
                                    beta, \
                                    y, incy ); \
}

// The wrappers are only instantiated for the mixed-type combinations that
// have been enabled at configure time.
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
INSERT_GENTFUNC3U12_MIX_D( dotxf_opt_var1, dotxf_unb_var1 )
#endif

#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
INSERT_GENTFUNC3U12_MIX_P( dotxf_opt_var1, dotxf_unb_var1 )
#endif
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ void bli_sgemm_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_sgemm_ref_mxn( k,
|
||||
BLIS_SGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a1,
|
||||
b1,
|
||||
@@ -162,8 +162,7 @@ void bli_dgemm_opt_mxn(
|
||||
that exist (at the edges) is handled automatically within the
|
||||
macro-kernel.
|
||||
- Alignment of a1 and b1. The addresses a1 and b1 are aligned according
|
||||
to the alignment value BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in the
|
||||
bli_config.h header file of the BLIS configuration.
|
||||
to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively.
|
||||
- Unrolling loops. As a general rule of thumb, the loop over k is
|
||||
sometimes moderately unrolled; for example, in our experience, an
|
||||
unrolling factor of u = 4 is fairly common. If unrolling is applied
|
||||
@@ -275,7 +274,7 @@ void bli_cgemm_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_cgemm_ref_mxn( k,
|
||||
BLIS_CGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a1,
|
||||
b1,
|
||||
@@ -297,7 +296,7 @@ void bli_zgemm_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_zgemm_ref_mxn( k,
|
||||
BLIS_ZGEMM_UKERNEL_REF( k,
|
||||
alpha,
|
||||
a1,
|
||||
b1,
|
||||
|
||||
@@ -166,16 +166,8 @@ void bli_dgemmtrsm_l_opt_mxn(
|
||||
- Leading dimensions of a1 and b1: PACKMR and PACKNR. See Implementation
|
||||
Notes for gemm.
|
||||
- Edge cases in MR, NR dimensions. See Implementation Notes for gemm.
|
||||
- Alignment of a1 and b1. Unlike with gemm, the addresses a10/a12 and a11
|
||||
are not guaranteed to be aligned according to the alignment value
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in the bli_config.h header
|
||||
file. This is because these micro-panels may vary in size due to the
|
||||
triangular nature of matrix A. Instead, these addresses are aligned
|
||||
to PACKMR x sizeof(type), where type is the datatype in question. To
|
||||
support a somewhat obscure, higher-level optimization, we similarly
|
||||
do not guarantee that b01/b21 and b11 are aligned to
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE; instead, they are only aligned to
|
||||
PACKNR x sizeof(type).
|
||||
- Alignment of a1 and b1. The addresses a1 and b1 are aligned according
|
||||
to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively.
|
||||
- Unrolling loops. Most optimized implementations should unroll all
|
||||
three loops within the trsm subproblem of gemmtrsm. See Implementation
|
||||
Notes for gemm for remarks on unrolling the gemm subproblem.
|
||||
|
||||
@@ -164,16 +164,8 @@ void bli_dgemmtrsm_u_opt_mxn(
|
||||
- Leading dimensions of a1 and b1: PACKMR and PACKNR. See Implementation
|
||||
Notes for gemm.
|
||||
- Edge cases in MR, NR dimensions. See Implementation Notes for gemm.
|
||||
- Alignment of a1 and b1. Unlike with gemm, the addresses a10/a12 and a11
|
||||
are not guaranteed to be aligned according to the alignment value
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE, as defined in the bli_config.h header
|
||||
file. This is because these micro-panels may vary in size due to the
|
||||
triangular nature of matrix A. Instead, these addresses are aligned
|
||||
to PACKMR x sizeof(type), where type is the datatype in question. To
|
||||
support a somewhat obscure, higher-level optimization, we similarly
|
||||
do not guarantee that b01/b21 and b11 are aligned to
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE; instead, they are only aligned to
|
||||
PACKNR x sizeof(type).
|
||||
- Alignment of a1 and b1. The addresses a1 and b1 are aligned according
|
||||
to PACKMR*sizeof(type) and PACKNR*sizeof(type), respectively.
|
||||
- Unrolling loops. Most optimized implementations should unroll all
|
||||
three loops within the trsm subproblem of gemmtrsm. See Implementation
|
||||
Notes for gemm for remarks on unrolling the gemm subproblem.
|
||||
|
||||
@@ -44,7 +44,7 @@ void bli_strsm_l_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_strsm_l_ref_mxn( a11,
|
||||
BLIS_STRSM_L_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
@@ -216,7 +216,7 @@ void bli_ctrsm_l_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_l_ref_mxn( a11,
|
||||
BLIS_CTRSM_L_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
@@ -232,7 +232,7 @@ void bli_ztrsm_l_opt_mxn(
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_l_ref_mxn( a11,
|
||||
BLIS_ZTRSM_L_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
|
||||
@@ -37,25 +37,25 @@
|
||||
|
||||
|
||||
void bli_strsm_u_opt_mxn(
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a11,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_strsm_u_ref_mxn( a,
|
||||
b,
|
||||
c, rs_c, cs_c,
|
||||
BLIS_STRSM_U_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_dtrsm_u_opt_mxn(
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a11,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
@@ -168,18 +168,18 @@ void bli_dtrsm_u_opt_mxn(
|
||||
{
|
||||
i = m - iter - 1;
|
||||
n_behind = iter;
|
||||
alpha11 = a + (i )*rs_a + (i )*cs_a;
|
||||
a12t = a + (i )*rs_a + (i+1)*cs_a;
|
||||
x1 = b + (i )*rs_b + (0 )*cs_b;
|
||||
X2 = b + (i+1)*rs_b + (0 )*cs_b;
|
||||
alpha11 = a11 + (i )*rs_a + (i )*cs_a;
|
||||
a12t = a11 + (i )*rs_a + (i+1)*cs_a;
|
||||
x1 = b11 + (i )*rs_b + (0 )*cs_b;
|
||||
X2 = b11 + (i+1)*rs_b + (0 )*cs_b;
|
||||
|
||||
/* x1 = x1 - a12t * X2; */
|
||||
/* x1 = x1 / alpha11; */
|
||||
for ( j = 0; j < n; ++j )
|
||||
{
|
||||
chi11 = x1 + (0 )*rs_b + (j )*cs_b;
|
||||
x21 = X2 + (0 )*rs_b + (j )*cs_b;
|
||||
gamma11 = c + (i )*rs_c + (j )*cs_c;
|
||||
chi11 = x1 + (0 )*rs_b + (j )*cs_b;
|
||||
x21 = X2 + (0 )*rs_b + (j )*cs_b;
|
||||
gamma11 = c11 + (i )*rs_c + (j )*cs_c;
|
||||
|
||||
/* chi11 = chi11 - a12t * x21; */
|
||||
bli_dset0s( rho11 );
|
||||
@@ -208,32 +208,32 @@ void bli_dtrsm_u_opt_mxn(
|
||||
|
||||
|
||||
void bli_ctrsm_u_opt_mxn(
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_u_ref_mxn( a,
|
||||
b,
|
||||
c, rs_c, cs_c,
|
||||
BLIS_CTRSM_U_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
|
||||
void bli_ztrsm_u_opt_mxn(
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_u_ref_mxn( a,
|
||||
b,
|
||||
c, rs_c, cs_c,
|
||||
BLIS_ZTRSM_U_UKERNEL_REF( a11,
|
||||
b11,
|
||||
c11, rs_c, cs_c,
|
||||
data );
|
||||
}
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( addv, ADDV_KERNEL )
|
||||
GENFRONT( addv, addv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_addv_check.h"
|
||||
#include "bli_addv_unb_var1.h"
|
||||
#include "bli_addv_kernel.h"
|
||||
#include "bli_addv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
115
frame/1/addv/bli_addv_kernel.c
Normal file
115
frame/1/addv/bli_addv_kernel.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T addv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Object-based front door to the addv kernel. Unpacks the fields of x and y
// that the typed wrappers need and calls the wrapper stored in ftypes for
// this pair of datatypes. The table entry is invoked without a NULL check.
void bli_addv_kernel( obj_t* x,
                      obj_t* y )
{
    // The two datatypes select the dispatch table entry.
    num_t     x_dt   = bli_obj_datatype( *x );
    num_t     y_dt   = bli_obj_datatype( *y );

    // Conjugation status and vector length are both taken from x.
    conj_t    x_conj = bli_obj_conj_status( *x );
    dim_t     length = bli_obj_vector_dim( *x );

    // Increment and buffer address (at the object's offset) of each operand.
    inc_t     x_inc  = bli_obj_vector_inc( *x );
    void*     x_ptr  = bli_obj_buffer_at_off( *x );

    inc_t     y_inc  = bli_obj_vector_inc( *y );
    void*     y_ptr  = bli_obj_buffer_at_off( *y );

    // Look up the wrapper for this type combination and call it.
    FUNCPTR_T kernel = ftypes[x_dt][y_dt];

    kernel( x_conj,
            length,
            x_ptr, x_inc,
            y_ptr, y_inc );
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( addv_kernel_void, ADDV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( addv_kernel_void, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( addv_kernel_void, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_copyv_unb_var1( obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_addv_kernel( obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
@@ -46,12 +50,13 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( copyv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( addv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( copyv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( addv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( copyv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( addv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T addv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,addv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,addv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,addv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_addv_unb_var1( obj_t* x,
|
||||
void bli_addv_ref( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -83,17 +84,19 @@ void bli_addv_unb_var1( obj_t* x,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( addv, addv_unb_var1 )
|
||||
INSERT_GENTFUNC2_BASIC( addv, addv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( addv, addv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_D( addv, addv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( addv, addv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_P( addv, addv_ref )
|
||||
#endif
|
||||
|
||||
60
frame/1/addv/bli_addv_ref.h
Normal file
60
frame/1/addv/bli_addv_ref.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_addv_ref( obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( addv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( addv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( addv_ref )
|
||||
#endif
|
||||
@@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( axpyv, AXPYV_KERNEL )
|
||||
GENFRONT( axpyv, axpyv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_axpyv_check.h"
|
||||
#include "bli_axpyv_unb_var1.h"
|
||||
#include "bli_axpyv_kernel.h"
|
||||
#include "bli_axpyv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
128
frame/1/axpyv/bli_axpyv_kernel.c
Normal file
128
frame/1/axpyv/bli_axpyv_kernel.c
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T axpyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Object-based front door to the axpyv kernel. Unpacks the fields of alpha,
// x and y that the typed wrappers need and calls the wrapper stored in
// ftypes for this triple of datatypes. The table entry is invoked without a
// NULL check.
void bli_axpyv_kernel( obj_t* alpha,
                       obj_t* x,
                       obj_t* y )
{
    // Datatypes of the two vector operands.
    num_t  x_dt   = bli_obj_datatype( *x );
    num_t  y_dt   = bli_obj_datatype( *y );

    // Conjugation status and vector length are both taken from x.
    conj_t x_conj = bli_obj_conj_status( *x );
    dim_t  length = bli_obj_vector_dim( *x );

    // Increment and buffer address (at the object's offset) of each vector.
    inc_t  x_inc  = bli_obj_vector_inc( *x );
    void*  x_ptr  = bli_obj_buffer_at_off( *x );

    inc_t  y_inc  = bli_obj_vector_inc( *y );
    void*  y_ptr  = bli_obj_buffer_at_off( *y );

    // Filled in by bli_set_scalar_dt_buffer() below.
    num_t  alpha_dt;
    void*  alpha_ptr;

    // If alpha is a scalar constant, use the datatype of x to extract the
    // address of the corresponding constant value; otherwise, use the
    // datatype encoded within the alpha object and extract the buffer at
    // the alpha offset.
    bli_set_scalar_dt_buffer( alpha, x_dt, alpha_dt, alpha_ptr );

    // Index into the type combination array and invoke the function found
    // there.
    ftypes[alpha_dt][x_dt][y_dt]( x_conj,
                                  length,
                                  alpha_ptr,
                                  x_ptr, x_inc,
                                  y_ptr, y_inc );
}
|
||||
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(cha,chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
alpha, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( axpyv_kernel_void, AXPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_kernel_void, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_kernel_void, AXPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,11 +32,15 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_axpyv_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_axpyv_kernel( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
|
||||
\
|
||||
@@ -48,13 +52,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( axpyv_unb_var1 )
|
||||
INSERT_GENTPROT3_BASIC( axpyv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T axpyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -47,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,axpyv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,axpyv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,axpyv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_axpyv_unb_var1( obj_t* alpha,
|
||||
void bli_axpyv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
@@ -94,18 +94,19 @@ void bli_axpyv_unb_var1( obj_t* alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname, addvker ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC3(cha,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_a* alpha_cast = alpha; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -156,13 +157,13 @@ void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( axpyv_unb_var1, ADDV_KERNEL )
|
||||
INSERT_GENTFUNC3_BASIC( axpyv_ref, ADDV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_unb_var1, ADDV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_D( axpyv_ref, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_unb_var1, ADDV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_P( axpyv_ref, ADDV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,28 +32,32 @@
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_axpyv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype axpyv kernel interfaces.
|
||||
//
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(cha,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
void PASTEMAC3(cha,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_a* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( axpyv_opt_var1 )
|
||||
INSERT_GENTPROT3_BASIC( axpyv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_opt_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( axpyv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_opt_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( axpyv_ref )
|
||||
#endif
|
||||
|
||||
@@ -34,16 +34,6 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
void bli_copyv( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_copyv_check( x, y );
|
||||
|
||||
bli_copyv_unb_var1( x, y );
|
||||
}
|
||||
*/
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
@@ -63,7 +53,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( copyv, COPYV_KERNEL )
|
||||
GENFRONT( copyv, copyv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_copyv_check.h"
|
||||
#include "bli_copyv_unb_var1.h"
|
||||
#include "bli_copyv_kernel.h"
|
||||
#include "bli_copyv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
115
frame/1/copyv/bli_copyv_kernel.c
Normal file
115
frame/1/copyv/bli_copyv_kernel.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T copyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_copyv_kernel( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
n,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chx,chy,kername)( conjx, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( copyv_kernel_void, COPYV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( copyv_kernel_void, COPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( copyv_kernel_void, COPYV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_addv_unb_var1( obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_copyv_kernel( obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
@@ -46,12 +50,12 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( addv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( copyv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( addv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( copyv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( addv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( copyv_kernel_void )
|
||||
#endif
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T copyv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,copyv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,copyv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,copyv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_copyv_unb_var1( obj_t* x,
|
||||
void bli_copyv_ref( obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -83,17 +84,19 @@ void bli_copyv_unb_var1( obj_t* x,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
@@ -130,13 +133,13 @@ void PASTEMAC2(chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( copyv, copyv_unb_var1 )
|
||||
INSERT_GENTFUNC2_BASIC( copyv, copyv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( copyv, copyv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_D( copyv, copyv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( copyv, copyv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_P( copyv, copyv_ref )
|
||||
#endif
|
||||
|
||||
60
frame/1/copyv/bli_copyv_ref.h
Normal file
60
frame/1/copyv/bli_copyv_ref.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_copyv_ref( obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( copyv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( copyv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( copyv_ref )
|
||||
#endif
|
||||
@@ -55,7 +55,7 @@ void PASTEMAC0(opname)( \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
GENFRONT( dotv, DOTV_KERNEL )
|
||||
GENFRONT( dotv, dotv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_dotv_check.h"
|
||||
#include "bli_dotv_unb_var1.h"
|
||||
#include "bli_dotv_kernel.h"
|
||||
#include "bli_dotv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
128
frame/1/dotv/bli_dotv_kernel.c
Normal file
128
frame/1/dotv/bli_dotv_kernel.c
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T dotv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy,
|
||||
void* rho
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotv_kernel( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
num_t dt_rho = bli_obj_datatype( *rho );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
conj_t conjy = bli_obj_conj_status( *y );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
void* buf_rho = bli_obj_buffer_at_off( *rho );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y][dt_rho];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
conjy,
|
||||
n,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y,
|
||||
buf_rho );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
void* rho \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC3(chx,chy,chr,kername)( conjx, \
|
||||
conjy, \
|
||||
n, \
|
||||
x, incx, \
|
||||
y, incy, \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( dotv_kernel_void, DOTV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( dotv_kernel_void, DOTV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( dotv_kernel_void, DOTV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,11 +32,15 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_dotv_unb_var1( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho );
|
||||
void bli_dotv_kernel( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
|
||||
\
|
||||
@@ -49,13 +53,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
void* rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( dotv_unb_var1 )
|
||||
INSERT_GENTPROT3_BASIC( dotv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( dotv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( dotv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( dotv_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( dotv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T dotv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -48,17 +49,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotv_unb_var1( obj_t* x,
|
||||
void bli_dotv_ref( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho )
|
||||
{
|
||||
@@ -92,19 +93,20 @@ void bli_dotv_unb_var1( obj_t* x,
|
||||
buf_y, inc_y,
|
||||
buf_rho );
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_x, ctype_y, ctype_r, chx, chy, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
void* rho \
|
||||
) \
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
) \
|
||||
{ \
|
||||
ctype_x* x_cast = x; \
|
||||
ctype_y* y_cast = y; \
|
||||
@@ -163,13 +165,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( dotv, dotv_unb_var1 )
|
||||
INSERT_GENTFUNC3_BASIC( dotv, dotv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( dotv, dotv_unb_var1 )
|
||||
INSERT_GENTFUNC3_MIX_D( dotv, dotv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( dotv, dotv_unb_var1 )
|
||||
INSERT_GENTFUNC3_MIX_P( dotv, dotv_ref )
|
||||
#endif
|
||||
|
||||
64
frame/1/dotv/bli_dotv_ref.h
Normal file
64
frame/1/dotv/bli_dotv_ref.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_dotv_ref( obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* rho );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( dotv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( dotv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( dotv_ref )
|
||||
#endif
|
||||
|
||||
@@ -59,7 +59,7 @@ void PASTEMAC0(opname)( \
|
||||
rho ); \
|
||||
}
|
||||
|
||||
GENFRONT( dotxv, DOTXV_KERNEL )
|
||||
GENFRONT( dotxv, dotxv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_dotxv_check.h"
|
||||
#include "bli_dotxv_unb_var1.h"
|
||||
#include "bli_dotxv_kernel.h"
|
||||
#include "bli_dotxv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
153
frame/1/dotxv/bli_dotxv_kernel.c
Normal file
153
frame/1/dotxv/bli_dotxv_kernel.c
Normal file
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T dotxv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
void* alpha,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy,
|
||||
void* beta,
|
||||
void* rho
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotxv_kernel( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
num_t dt_y = bli_obj_datatype( *y );
|
||||
num_t dt_rho = bli_obj_datatype( *rho );
|
||||
|
||||
conj_t conjx = bli_obj_conj_status( *x );
|
||||
conj_t conjy = bli_obj_conj_status( *y );
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
|
||||
inc_t inc_y = bli_obj_vector_inc( *y );
|
||||
void* buf_y = bli_obj_buffer_at_off( *y );
|
||||
|
||||
void* buf_rho = bli_obj_buffer_at_off( *rho );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// The datatype of alpha MUST be the type union of x and y. This is to
|
||||
// prevent any unnecessary loss of information during computation.
|
||||
dt_alpha = bli_datatype_union( dt_x, dt_y );
|
||||
buf_alpha = bli_obj_buffer_for_1x1( dt_alpha, *alpha );
|
||||
|
||||
// The datatype of beta MUST be the same as the datatype of rho.
|
||||
dt_beta = dt_rho;
|
||||
buf_beta = bli_obj_buffer_for_1x1( dt_beta, *beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_x][dt_y][dt_rho];
|
||||
|
||||
// Invoke the function.
|
||||
f( conjx,
|
||||
conjy,
|
||||
n,
|
||||
buf_alpha,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y,
|
||||
buf_beta,
|
||||
buf_rho );
|
||||
}
|
||||
|
||||
|
||||
//
// Template for the dotxv void-pointer wrappers. Each expansion defines a
// function named PASTEMAC3(chx,chy,chr,varname) that forwards all of its
// arguments, unchanged and in the same order, to the function named
// PASTEMAC3(chx,chy,chr,kername). The ctype_* and chxy parameters are not
// referenced by the expansion.
//
#undef  GENTFUNC3U12
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname, kername ) \
\
void PASTEMAC3(chx,chy,chr,varname) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t  n, \
       void*  alpha, \
       void*  x, inc_t incx, \
       void*  y, inc_t incy, \
       void*  beta, \
       void*  rho \
     ) \
{ \
	PASTEMAC3(chx,chy,chr,kername) \
	( \
	  conjx, \
	  conjy, \
	  n, \
	  alpha, \
	  x, incx, \
	  y, incy, \
	  beta, \
	  rho \
	); \
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxv_kernel_void, DOTXV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxv_kernel_void, DOTXV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxv_kernel_void, DOTXV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,13 +32,17 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_dotxv_unb_var1( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
void bli_dotxv_kernel( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
|
||||
\
|
||||
@@ -53,13 +57,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
void* rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxv_unb_var1 )
|
||||
INSERT_GENTPROT3U12_BASIC( dotxv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxv_unb_var1 )
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxv_unb_var1 )
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T dotxv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -50,17 +51,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,dotxv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,dotxv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,dotxv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_dotxv_unb_var1( obj_t* alpha,
|
||||
void bli_dotxv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
@@ -113,21 +114,23 @@ void bli_dotxv_unb_var1( obj_t* alpha,
|
||||
buf_beta,
|
||||
buf_rho );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC3U12
|
||||
#define GENTFUNC3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
void* alpha, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy, \
|
||||
void* beta, \
|
||||
void* rho \
|
||||
) \
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict beta, \
|
||||
ctype_r* restrict rho \
|
||||
) \
|
||||
{ \
|
||||
ctype_xy* alpha_cast = alpha; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -194,13 +197,13 @@ void PASTEMAC3(chx,chy,chr,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_BASIC( dotxv, dotxv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_D( dotxv, dotxv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_unb_var1 )
|
||||
INSERT_GENTFUNC3U12_MIX_P( dotxv, dotxv_ref )
|
||||
#endif
|
||||
|
||||
68
frame/1/dotxv/bli_dotxv_ref.h
Normal file
68
frame/1/dotxv/bli_dotxv_ref.h
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_dotxv_ref( obj_t* alpha,
|
||||
obj_t* x,
|
||||
obj_t* y,
|
||||
obj_t* beta,
|
||||
obj_t* rho );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3U12
|
||||
#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chx,chy,chr,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
conj_t conjy, \
|
||||
dim_t n, \
|
||||
ctype_xy* restrict alpha, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy, \
|
||||
ctype_r* restrict beta, \
|
||||
ctype_r* restrict rho \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3U12_BASIC( dotxv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_D( dotxv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3U12_MIX_P( dotxv_ref )
|
||||
#endif
|
||||
|
||||
@@ -34,15 +34,6 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
void bli_invertv( obj_t* x )
|
||||
{
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_invertv_check( x );
|
||||
|
||||
bli_invertv_unb_var1( x );
|
||||
}
|
||||
*/
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
@@ -60,7 +51,7 @@ void PASTEMAC0(opname)( \
|
||||
PASTEMAC0(varname)( x ); \
|
||||
}
|
||||
|
||||
GENFRONT( invertv, INVERTV_KERNEL )
|
||||
GENFRONT( invertv, invertv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_invertv_check.h"
|
||||
#include "bli_invertv_unb_var1.h"
|
||||
#include "bli_invertv_kernel.h"
|
||||
#include "bli_invertv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
81
frame/1/invertv/bli_invertv_kernel.c
Normal file
81
frame/1/invertv/bli_invertv_kernel.c
Normal file
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T invertv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,invertv_kernel_void);
|
||||
|
||||
|
||||
//
// Object-based entry point for the invertv kernel. Reads the datatype,
// vector length, stride and buffer address from x, then calls the
// void-pointer wrapper stored in ftypes for that datatype.
//
void bli_invertv_kernel( obj_t* x )
{
	// The datatype of x is the only index into the function table.
	const num_t x_dt      = bli_obj_datatype( *x );
	FUNCPTR_T   kernel_fp = ftypes[x_dt];

	// Length, address and stride of the vector.
	dim_t       len       = bli_obj_vector_dim( *x );
	void*       x_buf     = bli_obj_buffer_at_off( *x );
	inc_t       x_inc     = bli_obj_vector_inc( *x );

	// Hand everything off to the selected wrapper.
	kernel_fp( len,
	           x_buf, x_inc );
}
|
||||
|
||||
|
||||
//
// Template for the invertv void-pointer wrappers. Each expansion defines
// a function named PASTEMAC(ch,varname) that forwards n, x and incx,
// unchanged, to the function named PASTEMAC(ch,kername). The ctype
// parameter is not referenced by the expansion.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t  n, \
       void*  x, inc_t incx \
     ) \
{ \
	PASTEMAC(ch,kername) \
	( \
	  n, \
	  x, incx \
	); \
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( invertv_kernel_void, INVERTV_KERNEL )
|
||||
|
||||
@@ -32,9 +32,13 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_invertv_unb_var1( obj_t* x );
|
||||
void bli_invertv_kernel( obj_t* x );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
@@ -43,5 +47,5 @@ void PASTEMAC(ch,varname)( \
|
||||
void* x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( invertv_unb_var1 )
|
||||
INSERT_GENTPROT_BASIC( invertv_kernel_void )
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T invertv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -41,10 +42,10 @@ typedef void (*FUNCPTR_T)(
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,invertv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY(ftypes,invertv_ref);
|
||||
|
||||
|
||||
void bli_invertv_unb_var1( obj_t* x )
|
||||
void bli_invertv_ref( obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
|
||||
@@ -63,15 +64,17 @@ void bli_invertv_unb_var1( obj_t* x )
|
||||
f( n,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* restrict x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
ctype* x_cast = x; \
|
||||
ctype* chi1; \
|
||||
@@ -89,5 +92,5 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( invertv, invertv_unb_var1 )
|
||||
INSERT_GENTFUNC_BASIC( invertv, invertv_ref )
|
||||
|
||||
@@ -32,16 +32,19 @@
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_invertv_ref( obj_t* x );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* restrict x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trsm_l_ref_4x4 )
|
||||
INSERT_GENTPROT_BASIC( invertv_ref )
|
||||
|
||||
@@ -68,7 +68,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( scal2v, SCAL2V_KERNEL )
|
||||
GENFRONT( scal2v, scal2v_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
#include "bli_scal2v_check.h"
|
||||
#include "bli_scal2v_unb_var1.h"
|
||||
#include "bli_scal2v_kernel.h"
|
||||
#include "bli_scal2v_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
129
frame/1/scal2v/bli_scal2v_kernel.c
Normal file
129
frame/1/scal2v/bli_scal2v_kernel.c
Normal file
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T scal2v_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
void* beta,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
//
// Object-based entry point for the scal2v kernel. Reads datatypes,
// conjugation, length, strides and buffer addresses from the operand
// objects, resolves the datatype and address to use for beta, and calls
// the void-pointer wrapper stored in ftypes for that combination.
//
void bli_scal2v_kernel( obj_t* beta,
                        obj_t* x,
                        obj_t* y )
{
	// Datatypes of the two vector operands.
	const num_t x_dt   = bli_obj_datatype( *x );
	const num_t y_dt   = bli_obj_datatype( *y );

	// Conjugation of x and the problem size (taken from x).
	conj_t      x_conj = bli_obj_conj_status( *x );
	dim_t       len    = bli_obj_vector_dim( *x );

	// Strides and buffer addresses of the vector operands.
	inc_t       x_inc  = bli_obj_vector_inc( *x );
	inc_t       y_inc  = bli_obj_vector_inc( *y );
	void*       x_buf  = bli_obj_buffer_at_off( *x );
	void*       y_buf  = bli_obj_buffer_at_off( *y );

	// Filled in by bli_set_scalar_dt_buffer() below.
	num_t       beta_dt;
	void*       beta_buf;

	FUNCPTR_T   kernel_fp;

	// When beta is a scalar constant, x's datatype is used to locate the
	// corresponding constant value; otherwise beta's own datatype is used
	// together with the buffer at beta's offset.
	bli_set_scalar_dt_buffer( beta, x_dt, beta_dt, beta_buf );

	// Select the function pointer for this datatype combination.
	kernel_fp = ftypes[beta_dt][x_dt][y_dt];

	// Hand everything off to the selected wrapper.
	kernel_fp( x_conj,
	           len,
	           beta_buf,
	           x_buf, x_inc,
	           y_buf, y_inc );
}
|
||||
|
||||
|
||||
//
// Template for the scal2v void-pointer wrappers. Each expansion defines a
// function named PASTEMAC3(chb,chx,chy,varname) that forwards all of its
// arguments, unchanged and in the same order, to the function named
// PASTEMAC3(chb,chx,chy,kername). The ctype_* parameters are not
// referenced by the expansion.
//
#undef  GENTFUNC3
#define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, kername ) \
\
void PASTEMAC3(chb,chx,chy,varname) \
     ( \
       conj_t conjx, \
       dim_t  n, \
       void*  beta, \
       void*  x, inc_t incx, \
       void*  y, inc_t incy \
     ) \
{ \
	PASTEMAC3(chb,chx,chy,kername) \
	( \
	  conjx, \
	  n, \
	  beta, \
	  x, incx, \
	  y, incy \
	); \
}
|
||||
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( scal2v_kernel_void, SCAL2V_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( scal2v_kernel_void, SCAL2V_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( scal2v_kernel_void, SCAL2V_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,11 +32,15 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_scal2v_unb_var1( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
void bli_scal2v_kernel( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \
|
||||
\
|
||||
@@ -48,13 +52,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \
|
||||
void* y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( scal2v_unb_var1 )
|
||||
INSERT_GENTPROT3_BASIC( scal2v_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( scal2v_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_D( scal2v_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( scal2v_unb_var1 )
|
||||
INSERT_GENTPROT3_MIX_P( scal2v_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T scal2v_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -47,17 +48,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_ALL(ftypes,scal2v_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_EXT(ftypes,scal2v_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_unb_var1);
|
||||
static FUNCPTR_T GENARRAY3_MIN(ftypes,scal2v_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_scal2v_unb_var1( obj_t* beta,
|
||||
void bli_scal2v_ref( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y )
|
||||
{
|
||||
@@ -94,18 +95,20 @@ void bli_scal2v_unb_var1( obj_t* beta,
|
||||
buf_x, inc_x,
|
||||
buf_y, inc_y );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC3
|
||||
#define GENTFUNC3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname, setvker ) \
|
||||
\
|
||||
void PASTEMAC3(chb,chx,chy,varname)( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx, \
|
||||
void* y, inc_t incy \
|
||||
) \
|
||||
void PASTEMAC3(chb,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
) \
|
||||
{ \
|
||||
ctype_b* beta_cast = beta; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -155,13 +158,13 @@ void PASTEMAC3(chb,chx,chy,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC3_BASIC( scal2v_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC3_BASIC( scal2v_ref, SETV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_D( scal2v_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_D( scal2v_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC3_MIX_P( scal2v_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC3_MIX_P( scal2v_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
63
frame/1/scal2v/bli_scal2v_ref.h
Normal file
63
frame/1/scal2v/bli_scal2v_ref.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_scal2v_ref( obj_t* beta,
|
||||
obj_t* x,
|
||||
obj_t* y );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT3
|
||||
#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, varname ) \
|
||||
\
|
||||
void PASTEMAC3(chb,chx,chy,varname) \
|
||||
( \
|
||||
conj_t conjx, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx, \
|
||||
ctype_y* restrict y, inc_t incy \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT3_BASIC( scal2v_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_D( scal2v_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT3_MIX_P( scal2v_ref )
|
||||
#endif
|
||||
|
||||
@@ -66,7 +66,7 @@ void PASTEMAC0(opname)( \
|
||||
x ); \
|
||||
}
|
||||
|
||||
GENFRONT( scalv, SCALV_KERNEL )
|
||||
GENFRONT( scalv, scalv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -36,7 +36,8 @@
|
||||
#include "bli_scalv_check.h"
|
||||
#include "bli_scalv_int.h"
|
||||
|
||||
#include "bli_scalv_unb_var1.h"
|
||||
#include "bli_scalv_kernel.h"
|
||||
#include "bli_scalv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -42,7 +42,7 @@ typedef void (*FUNCPTR_T)( obj_t* beta,
|
||||
static FUNCPTR_T vars[1][3] =
|
||||
{
|
||||
// unblocked optimized unblocked blocked
|
||||
{ bli_scalv_unb_var1, NULL, NULL }
|
||||
{ bli_scalv_kernel, bli_scalv_kernel, NULL }
|
||||
};
|
||||
|
||||
void bli_scalv_int( obj_t* beta,
|
||||
|
||||
120
frame/1/scalv/bli_scalv_kernel.c
Normal file
120
frame/1/scalv/bli_scalv_kernel.c
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T scalv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conjbeta,
|
||||
dim_t n,
|
||||
void* beta,
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
//
// Object-based entry point for the scalv kernel. Reads the datatype,
// length, stride and buffer address of x and the conjugation of beta,
// resolves the datatype and address to use for beta, and calls the
// void-pointer wrapper stored in ftypes for that combination.
//
void bli_scalv_kernel( obj_t* beta,
                       obj_t* x )
{
	// Datatype of the vector operand.
	const num_t x_dt      = bli_obj_datatype( *x );

	// Conjugation is taken from beta; the problem size from x.
	conj_t      beta_conj = bli_obj_conj_status( *beta );
	dim_t       len       = bli_obj_vector_dim( *x );

	// Stride and buffer address of x.
	inc_t       x_inc     = bli_obj_vector_inc( *x );
	void*       x_buf     = bli_obj_buffer_at_off( *x );

	// Filled in by bli_set_scalar_dt_buffer() below.
	num_t       beta_dt;
	void*       beta_buf;

	FUNCPTR_T   kernel_fp;

	// When beta is a scalar constant, x's datatype is used to locate the
	// corresponding constant value; otherwise beta's own datatype is used
	// together with the buffer at beta's offset.
	bli_set_scalar_dt_buffer( beta, x_dt, beta_dt, beta_buf );

	// Select the function pointer for this datatype combination.
	kernel_fp = ftypes[beta_dt][x_dt];

	// Hand everything off to the selected wrapper.
	kernel_fp( beta_conj,
	           len,
	           beta_buf,
	           x_buf, x_inc );
}
|
||||
|
||||
|
||||
//
// Template for the scalv void-pointer wrappers. Each expansion defines a
// function named PASTEMAC2(chb,chx,varname) that forwards all of its
// arguments, unchanged and in the same order, to the function named
// PASTEMAC2(chb,chx,kername). The ctype_* parameters are not referenced
// by the expansion.
//
#undef  GENTFUNC2
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \
\
void PASTEMAC2(chb,chx,varname) \
     ( \
       conj_t conjbeta, \
       dim_t  n, \
       void*  beta, \
       void*  x, inc_t incx \
     ) \
{ \
	PASTEMAC2(chb,chx,kername) \
	( \
	  conjbeta, \
	  n, \
	  beta, \
	  x, incx \
	); \
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( scalv_kernel_void, SCALV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( scalv_kernel_void, SCALV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( scalv_kernel_void, SCALV_KERNEL )
|
||||
#endif
|
||||
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_scalv_unb_var1( obj_t* beta,
|
||||
obj_t* x );
|
||||
void bli_scalv_kernel( obj_t* beta,
|
||||
obj_t* x );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
@@ -46,13 +50,13 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
void* x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( scalv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( scalv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( scalv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( scalv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( scalv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( scalv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T scalv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -46,17 +47,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,scalv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,scalv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,scalv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_scalv_unb_var1( obj_t* beta,
|
||||
void bli_scalv_ref( obj_t* beta,
|
||||
obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -88,17 +89,19 @@ void bli_scalv_unb_var1( obj_t* beta,
|
||||
buf_beta,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, setvker ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname)( \
|
||||
conj_t conjbeta, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
conj_t conjbeta, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
ctype_b* beta_cast = beta; \
|
||||
ctype_x* x_cast = x; \
|
||||
@@ -136,13 +139,13 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( scalv_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC2_BASIC( scalv_ref, SETV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( scalv_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC2_MIX_D( scalv_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( scalv_unb_var1, SETV_KERNEL )
|
||||
INSERT_GENTFUNC2_MIX_P( scalv_ref, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
61
frame/1/scalv/bli_scalv_ref.h
Normal file
61
frame/1/scalv/bli_scalv_ref.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_scalv_ref( obj_t* beta,
|
||||
obj_t* x );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
conj_t conjbeta, \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( scalv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( scalv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( scalv_ref )
|
||||
#endif
|
||||
|
||||
@@ -67,7 +67,7 @@ void PASTEMAC0(opname)( \
|
||||
x ); \
|
||||
}
|
||||
|
||||
GENFRONT( setv, SETV_KERNEL )
|
||||
GENFRONT( setv, setv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -34,8 +34,8 @@
|
||||
|
||||
#include "bli_setv_check.h"
|
||||
|
||||
#include "bli_setv_unb_var1.h"
|
||||
#include "bli_setv_unb_var2.h"
|
||||
#include "bli_setv_kernel.h"
|
||||
#include "bli_setv_ref.h"
|
||||
|
||||
|
||||
//
|
||||
|
||||
113
frame/1/setv/bli_setv_kernel.c
Normal file
113
frame/1/setv/bli_setv_kernel.c
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T setv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t n,
|
||||
void* beta,
|
||||
void* x, inc_t incx
|
||||
);
|
||||
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_kernel_void);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_kernel_void);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_kernel_void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_setv_kernel( obj_t* beta,
|
||||
obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
|
||||
dim_t n = bli_obj_vector_dim( *x );
|
||||
|
||||
void* buf_x = bli_obj_buffer_at_off( *x );
|
||||
inc_t inc_x = bli_obj_vector_inc( *x );
|
||||
|
||||
void* buf_beta;
|
||||
num_t dt_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// If beta is a scalar constant, use dt_x to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
bli_set_scalar_dt_buffer( beta, dt_x, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_beta][dt_x];
|
||||
|
||||
// Invoke the function.
|
||||
f( n,
|
||||
buf_beta,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname)( \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC2(chb,chx,kername)( n, \
|
||||
beta, \
|
||||
x, incx ); \
|
||||
}
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( setv_kernel_void, SETV_KERNEL )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( setv_kernel_void, SETV_KERNEL )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( setv_kernel_void, SETV_KERNEL )
|
||||
#endif
|
||||
@@ -32,10 +32,14 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_setv_unb_var1( obj_t* beta,
|
||||
obj_t* x );
|
||||
void bli_setv_kernel( obj_t* beta,
|
||||
obj_t* x );
|
||||
|
||||
|
||||
//
|
||||
// Prototype the void pointer kernel wrappers.
|
||||
//
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
@@ -45,13 +49,13 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
void* x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( setv_unb_var1 )
|
||||
INSERT_GENTPROT2_BASIC( setv_kernel_void )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( setv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_D( setv_kernel_void )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( setv_unb_var1 )
|
||||
INSERT_GENTPROT2_MIX_P( setv_kernel_void )
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
|
||||
#define FUNCPTR_T setv_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
@@ -45,17 +46,17 @@ typedef void (*FUNCPTR_T)(
|
||||
// If some mixed datatype functions will not be compiled, we initialize
|
||||
// the corresponding elements of the function array to NULL.
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_ALL(ftypes,setv_ref);
|
||||
#else
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_EXT(ftypes,setv_ref);
|
||||
#else
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_unb_var1);
|
||||
static FUNCPTR_T GENARRAY2_MIN(ftypes,setv_ref);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void bli_setv_unb_var1( obj_t* beta,
|
||||
void bli_setv_ref( obj_t* beta,
|
||||
obj_t* x )
|
||||
{
|
||||
num_t dt_x = bli_obj_datatype( *x );
|
||||
@@ -84,16 +85,17 @@ void bli_setv_unb_var1( obj_t* beta,
|
||||
buf_beta,
|
||||
buf_x, inc_x );
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_b, ctype_x, chb, chx, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname)( \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* x, inc_t incx \
|
||||
) \
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
) \
|
||||
{ \
|
||||
ctype_b* beta_cast = beta; \
|
||||
ctype_x* chi1 = x; \
|
||||
@@ -123,12 +125,12 @@ void PASTEMAC2(chb,chx,varname)( \
|
||||
|
||||
// Define the basic set of functions unconditionally, and then also some
|
||||
// mixed datatype functions if requested.
|
||||
INSERT_GENTFUNC2_BASIC( setv, setv_unb_var1 )
|
||||
INSERT_GENTFUNC2_BASIC( setv, setv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_D( setv, setv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_D( setv, setv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTFUNC2_MIX_P( setv, setv_unb_var1 )
|
||||
INSERT_GENTFUNC2_MIX_P( setv, setv_ref )
|
||||
#endif
|
||||
60
frame/1/setv/bli_setv_ref.h
Normal file
60
frame/1/setv/bli_setv_ref.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
void bli_setv_ref( obj_t* beta,
|
||||
obj_t* x );
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_b, ctype_x, chb, chx, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chb,chx,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype_b* restrict beta, \
|
||||
ctype_x* restrict x, inc_t incx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC( setv_ref )
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_D( setv_ref )
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_MIXED_PRECISION_SUPPORT
|
||||
INSERT_GENTPROT2_MIX_P( setv_ref )
|
||||
#endif
|
||||
|
||||
@@ -53,7 +53,7 @@ void PASTEMAC0(opname)( \
|
||||
y ); \
|
||||
}
|
||||
|
||||
GENFRONT( subv, SUBV_KERNEL )
|
||||
GENFRONT( subv, subv_kernel )
|
||||
|
||||
|
||||
//
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user