Merge commit 'cfa3db3f' into amd-main

* commit 'cfa3db3f':
  Fixed bug in mixed-dt gemm introduced in e9da642.
  Removed support for 3m, 4m induced methods.
  Updated do_sde.sh to get SDE from GitHub.
  Disable SDE testing of old AMD microarchitectures.
  Fixed substitution bug in configure.
  Allow use of 1m with mixing of row/col-pref ukrs.

AMD-Internal: [CPUPL-2698]
Change-Id: I961f0066243cf26aeb2e174e388b470133cc4a5f
This commit is contained in:
Edward Smyth
2024-07-08 05:55:22 -04:00
180 changed files with 2311 additions and 17801 deletions

View File

@@ -92,6 +92,7 @@ but many others have contributed code and feedback, including
Nathaniel Smith @njsmith
Shaden Smith @ShadenSmith
Tyler Smith @tlrmchlsmth (The University of Texas at Austin)
Snehith @ArcadioN09
Paul Springer @springer13 (RWTH Aachen University)
Adam J. Stewart @adamjstewart (University of Illinois at Urbana-Champaign)
Vladimir Sukarev

19
configure vendored
View File

@@ -729,13 +729,21 @@ read_registry_file()
if [ "${mem}" != "${mems_mem}" ]; then
#clist="${config_registry[$config]}"
clist=$(query_array "config_registry" ${config})
clisttmp=$(query_array "config_registry" ${config})
# Replace the current config with its constituent config set,
# canonicalize whitespace, and then remove duplicate config
# set names, if they exist. Finally, update the config registry
# with the new config list.
newclist=$(echo -e "${clist}" | sed -e "s/${mem}/${mems_mem}/g")
# NOTE: We must use substitute_words() rather than a simple sed
# expression because we need to avoid matching partial strings.
# For example, if clist above contains "foo bar barsk" and we use
# sed to substitute "bee boo" as the members of "bar", the
# result would (incorrectly) be "foo bee boo bee boosk",
# which would then get reduced, via rm_duplicate_words(), to
# "foo bee boo boosk".
#newclist=$(echo -e "${clist}" | sed -e "s/${mem}/${mems_mem}/g")
newclist=$(substitute_words "${mem}" "${mems_mem}" "${clisttmp}")
newclist=$(canonicalize_ws "${newclist}")
newclist=$(rm_duplicate_words "${newclist}")
@@ -818,6 +826,13 @@ read_registry_file()
# canonicalize whitespace, and then remove duplicate kernel
# set names, if they exist. Finally, update the kernel registry
# with the new kernel list.
# NOTE: We must use substitute_words() rather than a simple sed
# expression because we need to avoid matching partial strings.
# For example, if klist above contains "foo bar barsk" and we use
# sed to substitute "bee boo" as the members of "bar", the
# result would (incorrectly) be "foo bee boo bee boosk",
# which would then get reduced, via rm_duplicate_words(), to
# "foo bee boo boosk".
#newklist=$(echo -e "${klisttmp}" | sed -e "s/${ker}/${kers_ker}/g")
newklist=$(substitute_words "${ker}" "${kers_ker}" "${klisttmp}")
newklist=$(canonicalize_ws "${newklist}")

View File

@@ -2336,16 +2336,9 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
```
Possible implementation (ie: the `ind_t method` argument) types are:
* `BLIS_3MH`: Implementation based on the 3m method applied at the highest level, outside the 5th loop around the microkernel.
* `BLIS_3M1`: Implementation based on the 3m method applied within the 1st loop around the microkernel.
* `BLIS_4MH`: Implementation based on the 4m method applied at the highest level, outside the 5th loop around the microkernel.
* `BLIS_4M1B`: Implementation based on the 4m method applied within the 1st loop around the microkernel. Computation is ordered such that the 1st loop is fissured into two loops, the first of which multiplies the real part of the current micropanel of packed matrix B (against all real and imaginary parts of packed matrix A), and the second of which multiplies the imaginary part of the current micropanel of packed matrix B.
* `BLIS_4M1A`: Implementation based on the 4m method applied within the 1st loop around the microkernel. Computation is ordered such that real and imaginary components of the current micropanels are completely used before proceeding to the next virtual microkernel invocation.
* `BLIS_1M`: Implementation based on the 1m method. (This is the default induced method when real domain kernels are present but complex kernels are missing.)
* `BLIS_NAT`: Implementation based on "native" execution (ie: NOT an induced method).
**NOTE**: `BLIS_3M3` and `BLIS_3M2` have been deprecated from the `typedef enum` of `ind_t`, and `BLIS_4M1B` is also effectively no longer available, though the `typedef enum` value still exists.
Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_string()`) are:
* `BLIS_REFERENCE_UKERNEL` (`"refrnce"`): This value is returned when the queried microkernel is provided by the reference implementation.
* `BLIS_VIRTUAL_UKERNEL` (`"virtual"`): This value is returned when the queried microkernel is driven by the "virtual" microkernel provided by an induced method. This happens for any `method` value that is not `BLIS_NAT` (ie: native), but only applies to the complex domain.

View File

@@ -2015,16 +2015,9 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
```
Possible implementation (ie: the `ind_t method` argument) types are:
* `BLIS_3MH`: Implementation based on the 3m method applied at the highest level, outside the 5th loop around the microkernel.
* `BLIS_3M1`: Implementation based on the 3m method applied within the 1st loop around the microkernel.
* `BLIS_4MH`: Implementation based on the 4m method applied at the highest level, outside the 5th loop around the microkernel.
* `BLIS_4M1B`: Implementation based on the 4m method applied within the 1st loop around the microkernel. Computation is ordered such that the 1st loop is fissured into two loops, the first of which multiplies the real part of the current micropanel of packed matrix B (against all real and imaginary parts of packed matrix A), and the second of which multiplies the imaginary part of the current micropanel of packed matrix B.
* `BLIS_4M1A`: Implementation based on the 4m method applied within the 1st loop around the microkernel. Computation is ordered such that real and imaginary components of the current micropanels are completely used before proceeding to the next virtual microkernel invocation.
* `BLIS_1M`: Implementation based on the 1m method. (This is the default induced method when real domain kernels are present but complex kernels are missing.)
* `BLIS_NAT`: Implementation based on "native" execution (ie: NOT an induced method).
**NOTE**: `BLIS_3M3` and `BLIS_3M2` have been deprecated from the `typedef enum` of `ind_t`, and `BLIS_4M1B` is also effectively no longer available, though the `typedef enum` value still exists.
Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_string()`) are:
* `BLIS_REFERENCE_UKERNEL` (`"refrnce"`): This value is returned when the queried microkernel is provided by the reference implementation.
* `BLIS_VIRTUAL_UKERNEL` (`"virtual"`): This value is returned when the queried microkernel is driven by the "virtual" microkernel provided by an induced method. This happens for any `method` value that is not `BLIS_NAT` (ie: native), but only applies to the complex domain.

View File

@@ -17,13 +17,9 @@ Simply put, a sandbox in BLIS provides an alternative implementation to the
`gemm` operation.
To get a little more specific, a sandbox provides an alternative implementation
to the function `bli_gemmnat()`, which is the object-based API call for
computing the `gemm` operation via native execution.
**Note**: Native execution simply means that an induced method will not be used.
It's what you probably already think of when you think of implementing the
`gemm` operation: a series of loops around an optimized (usually assembly-based)
microkernel with some packing functions thrown in at various levels.
to the function `bli_gemm_ex()`, which is the
[expert interface](BLISObjectAPI.md#basic-vs-expert-interfaces) for calling the
[object-based API](BLISObjectAPI.md#gemm) for the `gemm` operation.
Why sandboxes? Sometimes you want to experiment with tweaks or changes to
the `gemm` operation, but you want to do so in a simple environment rather than
@@ -45,18 +41,11 @@ corresponds to a sub-directory of `sandbox` named `gemmlike`. (Reminder: the
`auto` argument is the configuration target and thus unrelated to
sandboxes.)
NOTE: If you want your sandbox implementation to handle *all* problem
sizes and shapes, you'll need to disable the skinny/unpacked "sup"
sub-framework within BLIS, which is enabled by default. This can be
done by passing the `--disable-sup-handling` option to configure:
```
$ ./configure --enable-sandbox=gemmlike --disable-sup-handling auto
```
If you leave sup enabled, the sup implementation will, at runtime, detect
and handle certain smaller problem sizes upstream of where BLIS calls
`bli_gemmnat()` while all other problems will fall to your sandbox
implementation. Thus, you should only leave sup enabled if you are fine
with those smaller problems being handled by sup.
NOTE: Using your own sandbox implementation means that BLIS will call your
sandbox for *all* problem sizes and shapes, for *all* datatypes supported
by BLIS. If you intend to only implement a subset of this functionality
within your sandbox, you should be sure to redirect execution back into
the core framework for the parts that you don't wish to reimplement yourself.
As `configure` runs, you should get output that includes lines
similar to:
@@ -67,13 +56,12 @@ configure: sandbox/gemmlike
And when you build BLIS, the last files to be compiled will be the source
code in the specified sandbox:
```
Compiling obj/haswell/sandbox/gemmlike/bli_gemmnat.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/gemmlike/bls_gemm.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/gemmlike/bls_gemm_bp_var1.o ('haswell' CFLAGS for sandboxes)
...
```
That's it! After the BLIS library is built, it will contain your chosen
sandbox's implementation of `bli_gemmnat()` instead of the default
sandbox's implementation of `bli_gemm_ex()` instead of the default BLIS
implementation.
## Sandbox rules
@@ -97,7 +85,7 @@ Note that `blis.h` already contains all of its definitions inside of an
`extern "C"` block, so you should be able to `#include "blis.h"` from your
C++11 source code without any issues.
3. All of your code to replace BLIS's default implementation of `bli_gemmnat()`
3. All of your code to replace BLIS's default implementation of `bli_gemm_ex()`
should reside in the named sandbox directory, or some directory therein.
(Obviously.) For example, the "gemmlike" sandbox is located in
`sandbox/gemmlike`. All of the code associated with this sandbox will be
@@ -105,7 +93,7 @@ contained within `sandbox/gemmlike`. Note that you absolutely *may* include
additional code and interfaces within the sandbox, if you wish -- code and
interfaces that are not directly or indirectly needed for satisfying
the "contract" set forth by the sandbox (i.e., including a local definition
of`bli_gemmnat()`).
of `bli_gemm_ex()`).
4. The *only* header file that is required of your sandbox is `bli_sandbox.h`.
It must be named `bli_sandbox.h` because `blis.h` will `#include` this file
@@ -119,12 +107,12 @@ you should only place things (e.g. prototypes or type definitions) in
(b) an *application* that calls your sandbox-enabled BLIS library.
Usually, neither of these situations will require any of your local definitions
since those local definitions are only needed to define your sandbox
implementation of `bli_gemmnat()`, and this function is already prototyped by
implementation of `bli_gemm_ex()`, and this function is already prototyped by
BLIS. *But if you are adding additional APIs and/or operations to the sandbox
that are unrelated to `bli_gemmnat()`, then you'll want to `#include` those
that are unrelated to `bli_gemm_ex()`, then you'll want to `#include` those
function prototypes from within `bli_sandbox.h`*
5. Your definition of `bli_gemmnat()` should be the **only function you define**
5. Your definition of `bli_gemm_ex()` should be the **only function you define**
in your sandbox that begins with `bli_`. If you define other functions that
begin with `bli_`, you risk a namespace collision with existing framework
functions. To guarantee safety, please prefix your locally-defined sandbox
@@ -147,9 +135,9 @@ For example, with a BLIS sandbox you **can** do the following kinds of things:
kernels, which can already be customized within each sub-configuration);
- try inlining your functions manually;
- pivot away from using `obj_t` objects at higher algorithmic level (such as
immediately after calling `bli_gemmnat()`) to try to avoid some overhead;
immediately after calling `bli_gemm_ex()`) to try to avoid some overhead;
- create experimental implementations of new BLAS-like operations (provided
that you also provide an implementation of `bli_gemmnat()`).
that you also provide an implementation of `bli_gemm_ex()`).
You **cannot**, however, use a sandbox to do the following kinds of things:
- define new datatypes (half-precision, quad-precision, short integer, etc.)
@@ -167,8 +155,8 @@ Another important limitation is the fact that the build system currently uses
# Example framework CFLAGS used by 'haswell' sub-configuration
-O3 -Wall -Wno-unused-function -Wfatal-errors -fPIC -std=c99
-D_POSIX_C_SOURCE=200112L -I./include/haswell -I./frame/3/
-I./frame/ind/ukernels/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/
-I./frame/include -DBLIS_VERSION_STRING=\"0.3.2-51\"
-I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include
-DBLIS_VERSION_STRING=\"0.3.2-51\"
```
which are likely more general-purpose than the `CFLAGS` used for, say,
optimized kernels or even reference kernels.
@@ -176,8 +164,8 @@ optimized kernels or even reference kernels.
# Example optimized kernel CFLAGS used by 'haswell' sub-configuration
-O3 -mavx2 -mfma -mfpmath=sse -march=core-avx2 -Wall -Wno-unused-function
-Wfatal-errors -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L -I./include/haswell
-I./frame/3/ -I./frame/ind/ukernels/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/
-I./frame/include -DBLIS_VERSION_STRING=\"0.3.2-51\"
-I./frame/3/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include
-DBLIS_VERSION_STRING=\"0.3.2-51\"
```
(To see precisely which flags are being employed for any given file, enable
verbosity at compile-time via `make V=1`.) Compiling sandboxes with these more

View File

@@ -128,11 +128,6 @@ sdcz # Datatype(s) to test:
300 # Problem size: maximum to test
100 # Problem size: increment between experiments
# Complex level-3 implementations to test
1 # 3mh ('1' = enable; '0' = disable)
1 # 3m1 ('1' = enable; '0' = disable)
1 # 4mh ('1' = enable; '0' = disable)
1 # 4m1b ('1' = enable; '0' = disable)
1 # 4m1a ('1' = enable; '0' = disable)
1 # 1m ('1' = enable; '0' = disable)
1 # native ('1' = enable; '0' = disable)
1 # Simulate application-level threading:
@@ -169,7 +164,7 @@ _**Test gemm with mixed-precision operands?**_ This boolean determines whether `
_**Problem size.**_ These values determine the first problem size to test, the maximum problem size to test, and the increment between problem sizes. Note that the maximum problem size only bounds the range of problem sizes; it is not guaranteed to be tested. Example: If the initial problem size is 128, the maximum is 1000, and the increment is 64, then the last problem size to be tested will be 960.
_**Complex level-3 implementations to test.**_ With the exception of the switch marked `native`, these switches control whether experimental complex domain implementations are tested (when applicable). These implementations employ induced methods for complex matrix multiplication and apply to some (though not all) of the level-3 operations. If you don't know what these are, you can ignore them. The `native` switch corresponds to native execution of complex domain level-3 operations, which we test by default. We also test the `1m` method, since it is the induced method of choice when complex microkernels are not available. Note that all of these induced method tests (including `native`) are automatically disabled if the `c` and `z` datatypes are disabled.
_**Complex level-3 implementations to test.**_ This section lists which complex domain implementations of level-3 operations are tested. If you don't know what these are, you can ignore them. The `native` switch corresponds to native execution of complex domain level-3 operations, which we test by default. We also test the `1m` method, since it is the induced method of choice when optimized complex microkernels are not available. Note that all of these induced method tests (including `native`) are automatically disabled if the `c` and `z` datatypes are disabled.
_**Simulate application-level threading.**_ This setting specifies the number of threads the testsuite will spawn, and is meant to allow the user to exercise BLIS as a multithreaded application might if it were to make multiple concurrent calls to BLIS operations. (Note that the threading controlled by this option is orthogonal to, and has no effect on, whatever multithreading may be employed _within_ BLIS, as specified by the environment variables described in the [Multithreading](Multithreading.md) documentation.) When this option is set to 1, the testsuite is run with only one thread. When set to n > 1 threads, the spawned threads will parallelize (in round-robin fashion) the total set of tests specified by the testsuite input files, executing them in roughly the same order as that of a sequential execution.

View File

@@ -110,28 +110,6 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
INSERT_GENTDEF( unpackm_cxk )
// packm_3mis_ker
// packm_4mi_ker
//
// Function pointer type template used for both the packm_cxk_3mis and
// packm_cxk_4mi kernels (see the two INSERT_GENTDEF() lines below). The
// type name is built with PASTECH3( ch, opname, _ker, tsuf ); the
// pointed-to function returns void and takes:
//   conja          - conj_t value passed to the kernel
//   cdim, n, n_max - dim_t dimensions passed to the kernel
//   kappa          - pointer to a ctype scalar
//   a, inca, lda   - source pointer with two inc_t strides
//   p, is_p, ldp   - destination pointer with two inc_t strides (is_p, ldp)
//   cntx           - pointer to the cntx_t context
// NOTE(review): INSERT_GENTDEF() is defined elsewhere; presumably it
// expands GENTDEF once per supported datatype -- confirm.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conja, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict p, inc_t is_p, inc_t ldp, \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( packm_cxk_3mis )
INSERT_GENTDEF( packm_cxk_4mi )
// packm_rih_ker
// packm_1er_ker
#undef GENTDEF
@@ -150,12 +128,8 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( packm_cxk_rih )
INSERT_GENTDEF( packm_cxk_1er )
#endif

View File

@@ -74,51 +74,6 @@ INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name )
// 3mis packm kernels
//
// Point GENTPROT at PACKM_3MIS_KER_PROT so that every
// INSERT_GENTPROT_BASIC0() line below emits prototypes with that signature.
// One line is listed per kernel-name macro, packm_2xk through packm_16xk in
// steps of two.
// NOTE(review): INSERT_GENTPROT_BASIC0() and the *_ker_name macros are
// defined elsewhere; presumably each line declares one prototype per basic
// datatype -- confirm.
#undef GENTPROT
#define GENTPROT PACKM_3MIS_KER_PROT
INSERT_GENTPROT_BASIC0( packm_2xk_3mis_ker_name )
INSERT_GENTPROT_BASIC0( packm_4xk_3mis_ker_name )
INSERT_GENTPROT_BASIC0( packm_6xk_3mis_ker_name )
INSERT_GENTPROT_BASIC0( packm_8xk_3mis_ker_name )
INSERT_GENTPROT_BASIC0( packm_10xk_3mis_ker_name )
INSERT_GENTPROT_BASIC0( packm_12xk_3mis_ker_name )
INSERT_GENTPROT_BASIC0( packm_14xk_3mis_ker_name )
INSERT_GENTPROT_BASIC0( packm_16xk_3mis_ker_name )
// 4mi packm kernels
//
// Point GENTPROT at PACKM_4MI_KER_PROT so that every
// INSERT_GENTPROT_BASIC0() line below emits prototypes with that signature.
// One line is listed per kernel-name macro, packm_2xk through packm_16xk in
// steps of two.
// NOTE(review): INSERT_GENTPROT_BASIC0() and the *_ker_name macros are
// defined elsewhere; presumably each line declares one prototype per basic
// datatype -- confirm.
#undef GENTPROT
#define GENTPROT PACKM_4MI_KER_PROT
INSERT_GENTPROT_BASIC0( packm_2xk_4mi_ker_name )
INSERT_GENTPROT_BASIC0( packm_4xk_4mi_ker_name )
INSERT_GENTPROT_BASIC0( packm_6xk_4mi_ker_name )
INSERT_GENTPROT_BASIC0( packm_8xk_4mi_ker_name )
INSERT_GENTPROT_BASIC0( packm_10xk_4mi_ker_name )
INSERT_GENTPROT_BASIC0( packm_12xk_4mi_ker_name )
INSERT_GENTPROT_BASIC0( packm_14xk_4mi_ker_name )
INSERT_GENTPROT_BASIC0( packm_16xk_4mi_ker_name )
// rih packm kernels
//
// Point GENTPROT at PACKM_RIH_KER_PROT so that every
// INSERT_GENTPROT_BASIC0() line below emits prototypes with that signature.
// One line is listed per kernel-name macro, packm_2xk through packm_16xk in
// steps of two.
// NOTE(review): INSERT_GENTPROT_BASIC0() and the *_ker_name macros are
// defined elsewhere; presumably each line declares one prototype per basic
// datatype -- confirm.
#undef GENTPROT
#define GENTPROT PACKM_RIH_KER_PROT
INSERT_GENTPROT_BASIC0( packm_2xk_rih_ker_name )
INSERT_GENTPROT_BASIC0( packm_4xk_rih_ker_name )
INSERT_GENTPROT_BASIC0( packm_6xk_rih_ker_name )
INSERT_GENTPROT_BASIC0( packm_8xk_rih_ker_name )
INSERT_GENTPROT_BASIC0( packm_10xk_rih_ker_name )
INSERT_GENTPROT_BASIC0( packm_12xk_rih_ker_name )
INSERT_GENTPROT_BASIC0( packm_14xk_rih_ker_name )
INSERT_GENTPROT_BASIC0( packm_16xk_rih_ker_name )
// 1e/1r packm kernels
#undef GENTPROT

View File

@@ -70,58 +70,6 @@ void PASTEMAC(ch,varname) \
);
// 3mis packm kernels
//
// Expands to the prototype of one 3mis packm kernel: a void function named
// PASTEMAC(ch,varname). The kernel receives a conj_t, three dim_t values
// (cdim, n, n_max), a kappa scalar pointer, the source pointer a with
// strides inca/lda, the destination pointer p with strides is_p/ldp, and a
// cntx_t pointer. All pointer parameters are restrict-qualified.
#define PACKM_3MIS_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict p, inc_t is_p, inc_t ldp, \
cntx_t* restrict cntx \
);
// 4mi packm kernels
//
// Expands to the prototype of one 4mi packm kernel: a void function named
// PASTEMAC(ch,varname). The kernel receives a conj_t, three dim_t values
// (cdim, n, n_max), a kappa scalar pointer, the source pointer a with
// strides inca/lda, the destination pointer p with strides is_p/ldp, and a
// cntx_t pointer. All pointer parameters are restrict-qualified.
#define PACKM_4MI_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict p, inc_t is_p, inc_t ldp, \
cntx_t* restrict cntx \
);
// rih packm kernels
//
// Expands to the prototype of one rih packm kernel: a void function named
// PASTEMAC(ch,varname). In addition to conja, the dim_t values (cdim, n,
// n_max), kappa, and the source pointer a with strides inca/lda, this
// signature takes a pack_t schema argument, and its destination pointer p
// carries only the ldp stride (there is no is_p parameter). All pointer
// parameters are restrict-qualified.
#define PACKM_RIH_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t cdim, \
dim_t n, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict p, inc_t ldp, \
cntx_t* restrict cntx \
);
// 1e/1r packm kernels
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \

View File

@@ -43,15 +43,9 @@
#include "bli_packm_var.h"
#include "bli_packm_struc_cxk.h"
#include "bli_packm_struc_cxk_4mi.h"
#include "bli_packm_struc_cxk_3mis.h"
#include "bli_packm_struc_cxk_rih.h"
#include "bli_packm_struc_cxk_1er.h"
#include "bli_packm_cxk.h"
#include "bli_packm_cxk_4mi.h"
#include "bli_packm_cxk_3mis.h"
#include "bli_packm_cxk_rih.h"
#include "bli_packm_cxk_1er.h"
#include "bli_pack_full.h"

View File

@@ -71,31 +71,10 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 4m interleaved
{ { NULL, bli_cpackm_struc_cxk_4mi,
NULL, bli_zpackm_struc_cxk_4mi, } },
// 0010 row/col panels: 3m interleaved
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0011 row/col panels: 4m separated (NOT IMPLEMENTED)
{ { NULL, NULL,
NULL, NULL, } },
// 0100 row/col panels: 3m separated
{ { NULL, bli_cpackm_struc_cxk_3mis,
NULL, bli_zpackm_struc_cxk_3mis, } },
// 0101 row/col panels: real only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0110 row/col panels: imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 0111 row/col panels: real+imaginary only
{ { NULL, bli_cpackm_struc_cxk_rih,
NULL, bli_zpackm_struc_cxk_rih, } },
// 1000 row/col panels: 1m-expanded (1e)
// 0001 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 1001 row/col panels: 1m-reordered (1r)
// 0010 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
@@ -205,15 +184,6 @@ void bli_packm_blk_var1
}
#if 0
if ( bli_is_4mi_packed( schema ) ) packm_kers = packm_struc_cxk_4mi_kers;
else if ( bli_is_3mi_packed( schema ) ||
bli_is_3ms_packed( schema ) ) packm_kers = packm_struc_cxk_3mis_kers;
else if ( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers;
else packm_kers = packm_struc_cxk_kers;
#else
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
@@ -239,7 +209,6 @@ void bli_packm_blk_var1
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
#endif
// Query the datatype-specific function pointer from the func_t object.
@@ -337,8 +306,6 @@ void PASTEMAC(ch,varname) \
bool row_stored; \
bool col_stored; \
inc_t is_p_use; \
dim_t ss_num; \
dim_t ss_den; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
@@ -409,17 +376,6 @@ void PASTEMAC(ch,varname) \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the storage stride scaling. Usually this is just 1. However,
in the case of interleaved 3m, we need to scale by 3/2, and in the
cases of real-only, imag-only, or summed-only, we need to scale by
1/2. In both cases, we are compensating for the fact that pointer
arithmetic occurs in terms of complex elements rather than real
elements. */ \
if ( bli_is_3mi_packed( schema ) ) { ss_num = 3; ss_den = 2; } \
else if ( bli_is_3ms_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else if ( bli_is_rih_packed( schema ) ) { ss_num = 1; ss_den = 2; } \
else { ss_num = 1; ss_den = 1; } \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
@@ -550,7 +506,7 @@ void PASTEMAC(ch,varname) \
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = ( is_p_use * ss_num ) / ss_den; \
p_inc = is_p_use; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
@@ -706,29 +662,6 @@ bli_thread_barrier( thread ); \
bli_thread_barrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \

View File

@@ -1,204 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Function template for packm_cxk_3mis.

   The generated function first asks the context for a packm kernel, keyed
   on panel_dim_max. If one is returned, that kernel does all of the work.
   Otherwise the micro-panel is packed via scal2ri3s_mxn (unit row stride,
   column stride ldp, sub-panel stride is_p), after which any unused rows
   and/or columns are set to zero in each of the three real-typed
   sub-panels located at p, p + is_p and p + 2*is_p (offsets counted in
   units of ctype_r).
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conja, \
       dim_t   panel_dim, \
       dim_t   panel_dim_max, \
       dim_t   panel_len, \
       dim_t   panel_len_max, \
       ctype*  kappa, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  p, inc_t is_p, inc_t ldp, \
       cntx_t* cntx \
     ) \
{ \
    /* The kernel id comes from panel_dim_max rather than panel_dim, so */ \
    /* the same kernel is chosen for full and edge-case micro-panels.   */ \
    const num_t   dt     = PASTEMAC(ch,type); \
    const l1mkr_t ker_id = panel_dim_max; \
\
    /* Look up the packm kernel for this kernel id; the result is NULL */ \
    /* when no kernel is available for it.                             */ \
    PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
\
    /* Preferred path: hand the whole micro-panel to the kernel. */ \
    if ( f != NULL ) \
    { \
        f \
        ( \
          conja, \
          panel_dim, \
          panel_len, \
          panel_len_max, \
          kappa, \
          a, inca, lda, \
          p, is_p, ldp, \
          cntx \
        ); \
        return; \
    } \
\
    /* Fallback path: pack the panel_dim x panel_len micro-panel as */ \
    /* column-stored data (unit row stride).                        */ \
    PASTEMAC(ch,scal2ri3s_mxn) \
    ( \
      conja, \
      panel_dim, \
      panel_len, \
      kappa, \
      a, inca, lda, \
      p, 1, ldp, is_p \
    ); \
\
    /* Real-typed view of the packed buffer used for the edge updates. */ \
    ctype_r* p_r = ( ctype_r* )p; \
\
    /* Zero the unused rows when panel_dim < panel_dim_max. */ \
    if ( panel_dim < panel_dim_max ) \
    { \
        ctype_r* restrict zero_r    = PASTEMAC(chr,0); \
        const dim_t       rows_left = panel_dim_max - panel_dim; \
        const dim_t       cols_all  = panel_len_max; \
\
        /* Visit the sub-panels at offsets 0, is_p and 2*is_p, in order. */ \
        for ( dim_t part = 0; part < 3; ++part ) \
        { \
            ctype_r* p_edge = p_r + part*is_p + panel_dim; \
\
            PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
            ( \
              BLIS_NO_CONJUGATE, \
              0, \
              BLIS_NONUNIT_DIAG, \
              BLIS_DENSE, \
              rows_left, \
              cols_all, \
              zero_r, \
              p_edge, 1, ldp, \
              cntx, \
              NULL \
            ); \
        } \
    } \
\
    /* Zero the unused columns when panel_len < panel_len_max. */ \
    if ( panel_len < panel_len_max ) \
    { \
        ctype_r* restrict zero_r    = PASTEMAC(chr,0); \
        const dim_t       rows_all  = panel_dim_max; \
        const dim_t       cols_left = panel_len_max - panel_len; \
\
        /* Visit the sub-panels at offsets 0, is_p and 2*is_p, in order. */ \
        for ( dim_t part = 0; part < 3; ++part ) \
        { \
            ctype_r* p_edge = p_r + part*is_p + panel_len*ldp; \
\
            PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
            ( \
              BLIS_NO_CONJUGATE, \
              0, \
              BLIS_NONUNIT_DIAG, \
              BLIS_DENSE, \
              rows_all, \
              cols_left, \
              zero_r, \
              p_edge, 1, ldp, \
              cntx, \
              NULL \
            ); \
        } \
    } \
}
INSERT_GENTFUNCCO_BASIC0( packm_cxk_3mis )

View File

@@ -1,53 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Prototype template for the packm_cxk_3mis packing front-end.

   GENTPROTCO is (re)defined so that each expansion declares
   void PASTEMAC(ch,varname)( ... ); INSERT_GENTPROTCO_BASIC0 then expands
   it for packm_cxk_3mis. Presumably one prototype is emitted per complex
   datatype, with ctype_r being the matching real type -- TODO confirm
   against the definitions of the GENT*/INSERT_* macros.

   Argument notes (taken from the zero-fill logic of the matching
   definition; verify the remaining arguments there):
   - panel_dim_max, panel_len, panel_len_max: when panel_len is less than
     panel_len_max, the definition zero-fills a panel_dim_max x
     (panel_len_max - panel_len) region starting at column panel_len.
   - p, is_p, ldp: that zero-fill is applied at p, p + is_p and p + 2*is_p
     (offsets in units of ctype_r), using unit row stride and column
     stride ldp.
   - cntx: forwarded to the setm calls that perform the zero-fill.
*/
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t is_p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_cxk_3mis )

View File

@@ -1,146 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   packm_cxk_4mi: front-end that packs a single micro-panel of a into p.

   The context is asked for a packm kernel keyed on panel_dim_max. If one is
   returned, it does all of the work. Otherwise a reference path is used:
   scal2ris_mxn writes the panel_dim x panel_len panel (scaled by kappa,
   conjugated per conja) into p with unit row stride, column stride ldp and
   imaginary stride is_p, and the unused rows/columns up to
   panel_dim_max x panel_len_max are then zeroed in both the region at p and
   the region at p + is_p (offsets in units of ctype_r).
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conja, \
       dim_t   panel_dim, \
       dim_t   panel_dim_max, \
       dim_t   panel_len, \
       dim_t   panel_len_max, \
       ctype*  kappa, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  p, inc_t is_p, inc_t ldp, \
       cntx_t* cntx \
     ) \
{ \
	/* The kernel id is derived from panel_dim_max (not panel_dim), so the
	   same kernel is selected for full panels and for edge-case panels. */ \
	const num_t   dt_p     = PASTEMAC(ch,type); \
	const l1mkr_t packm_id = panel_dim_max; \
	ctype_r*      p_real   = ( ctype_r* )p; \
\
	/* Look up the packm kernel registered for this id; the result is
	   checked against NULL (no kernel available) below. */ \
	PASTECH2(ch,opname,_ker_ft) packm_ker = \
	    bli_cntx_get_packm_ker_dt( dt_p, packm_id, cntx ); \
\
	/* Fast path: a kernel exists, so hand the whole job to it. */ \
	if ( packm_ker != NULL ) \
	{ \
		packm_ker \
		( \
		  conja, \
		  panel_dim, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  a, inca, lda, \
		  p, is_p, ldp, \
		  cntx \
		); \
		return; \
	} \
\
	/* Reference path: treat the micro-panel as a column-stored
	   panel_dim x panel_len matrix (unit row stride). */ \
	PASTEMAC(ch,scal2ris_mxn) \
	( \
	  conja, \
	  panel_dim, \
	  panel_len, \
	  kappa, \
	  a, inca, lda, \
	  p, 1, ldp, is_p \
	); \
\
	/* Zero the rows between panel_dim and panel_dim_max, if any. */ \
	if ( panel_dim != panel_dim_max ) \
	{ \
		const dim_t pad_rows   = panel_dim_max - panel_dim; \
		const dim_t pad_cols   = panel_len_max; \
		ctype_r*    pad_real   = p_real        + panel_dim; \
		ctype_r*    pad_imag   = p_real + is_p + panel_dim; \
\
		PASTEMAC(chr,set0s_mxn)( pad_rows, pad_cols, pad_real, 1, ldp ); \
		PASTEMAC(chr,set0s_mxn)( pad_rows, pad_cols, pad_imag, 1, ldp ); \
	} \
\
	/* Zero the columns between panel_len and panel_len_max, if any. */ \
	if ( panel_len != panel_len_max ) \
	{ \
		const dim_t pad_rows   = panel_dim_max; \
		const dim_t pad_cols   = panel_len_max - panel_len; \
		ctype_r*    pad_real   = p_real        + panel_len*ldp; \
		ctype_r*    pad_imag   = p_real + is_p + panel_len*ldp; \
\
		PASTEMAC(chr,set0s_mxn)( pad_rows, pad_cols, pad_real, 1, ldp ); \
		PASTEMAC(chr,set0s_mxn)( pad_rows, pad_cols, pad_imag, 1, ldp ); \
	} \
}
INSERT_GENTFUNCCO_BASIC0( packm_cxk_4mi )

View File

@@ -1,53 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Prototype template for the packm_cxk_4mi packing front-end.

   GENTPROTCO is (re)defined so that each expansion declares
   void PASTEMAC(ch,varname)( ... ); INSERT_GENTPROTCO_BASIC0 then expands
   it for packm_cxk_4mi. Presumably one prototype is emitted per complex
   datatype, with ctype_r being the matching real type -- TODO confirm
   against the definitions of the GENT*/INSERT_* macros.

   Argument notes (as used by the matching definition):
   - conja: conjugation applied to a while packing.
   - panel_dim, panel_len: dimensions of the micro-panel actually read
     from a.
   - panel_dim_max, panel_len_max: padded dimensions of the packed panel;
     panel_dim_max is also used as the id when querying the context for a
     packm kernel.
   - kappa: scalar applied to a while packing.
   - a, inca, lda: source panel and its two strides.
   - p, is_p, ldp: destination panel, the offset (in units of ctype_r) of
     its second region, and its leading dimension.
   - cntx: context queried for the packm kernel.
*/
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t is_p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_cxk_4mi )

View File

@@ -1,151 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   packm_cxk_rih: front-end that packs a single micro-panel of a into p
   according to the given pack schema.

   A packm kernel is looked up in the context (keyed on panel_dim_max), but
   the branch that would call it is compiled out with a constant 0, so the
   reference path always runs: scal2rihs_mxn writes the
   panel_dim x panel_len panel into p (unit row stride, column stride ldp),
   and the unused rows/columns up to panel_dim_max x panel_len_max are then
   set to zero via setm, addressing p in units of ctype_r.
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conja, \
       pack_t  schema, \
       dim_t   panel_dim, \
       dim_t   panel_dim_max, \
       dim_t   panel_len, \
       dim_t   panel_len_max, \
       ctype*  kappa, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  p, inc_t ldp, \
       cntx_t* cntx \
     ) \
{ \
	/* The kernel id is derived from panel_dim_max (not panel_dim), so the
	   same kernel would be selected for full and edge-case panels. */ \
	const num_t   dt_p     = PASTEMAC(ch,type); \
	const l1mkr_t packm_id = panel_dim_max; \
	ctype_r*      p_real   = ( ctype_r* )p; \
\
	/* Look up the packm kernel registered for this id. */ \
	PASTECH2(ch,opname,_ker_ft) packm_ker = \
	    bli_cntx_get_packm_ker_dt( dt_p, packm_id, cntx ); \
\
	/* NOTE(review): the leading "0 &&" makes this branch unreachable, so
	   the kernel is never invoked even when one is found. This looks like
	   a deliberate disable -- confirm before removing it. */ \
	if ( 0 && packm_ker != NULL ) \
	{ \
		packm_ker \
		( \
		  conja, \
		  schema, \
		  panel_dim, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  a, inca, lda, \
		  p, ldp, \
		  cntx \
		); \
		return; \
	} \
\
	/* Reference path: treat the micro-panel as a column-stored
	   panel_dim x panel_len matrix (unit row stride). */ \
	PASTEMAC(ch,scal2rihs_mxn) \
	( \
	  schema, \
	  conja, \
	  panel_dim, \
	  panel_len, \
	  kappa, \
	  a, inca, lda, \
	  p, 1, ldp \
	); \
\
	/* Zero the rows between panel_dim and panel_dim_max, if any. */ \
	if ( panel_dim != panel_dim_max ) \
	{ \
		ctype_r* restrict zero_val = PASTEMAC(chr,0); \
		const dim_t       pad_rows = panel_dim_max - panel_dim; \
		const dim_t       pad_cols = panel_len_max; \
		ctype_r*          pad_ptr  = p_real + panel_dim; \
\
		PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  BLIS_NONUNIT_DIAG, \
		  BLIS_DENSE, \
		  pad_rows, \
		  pad_cols, \
		  zero_val, \
		  pad_ptr, 1, ldp, \
		  cntx, \
		  NULL \
		); \
	} \
\
	/* Zero the columns between panel_len and panel_len_max, if any. */ \
	if ( panel_len != panel_len_max ) \
	{ \
		ctype_r* restrict zero_val = PASTEMAC(chr,0); \
		const dim_t       pad_rows = panel_dim_max; \
		const dim_t       pad_cols = panel_len_max - panel_len; \
		ctype_r*          pad_ptr  = p_real + panel_len*ldp; \
\
		PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  BLIS_NONUNIT_DIAG, \
		  BLIS_DENSE, \
		  pad_rows, \
		  pad_cols, \
		  zero_val, \
		  pad_ptr, 1, ldp, \
		  cntx, \
		  NULL \
		); \
	} \
}
INSERT_GENTFUNCCO_BASIC0( packm_cxk_rih )

View File

@@ -1,54 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Prototype template for the packm_cxk_rih packing front-end.

   GENTPROTCO is (re)defined so that each expansion declares
   void PASTEMAC(ch,varname)( ... ); INSERT_GENTPROTCO_BASIC0 then expands
   it for packm_cxk_rih. Presumably one prototype is emitted per complex
   datatype, with ctype_r being the matching real type -- TODO confirm
   against the definitions of the GENT*/INSERT_* macros.

   Argument notes (as used by the matching definition):
   - conja: conjugation applied to a while packing.
   - schema: pack schema, forwarded to the packing routine.
   - panel_dim, panel_len: dimensions of the micro-panel actually read
     from a.
   - panel_dim_max, panel_len_max: padded dimensions of the packed panel;
     panel_dim_max is also used as the id when querying the context for a
     packm kernel.
   - kappa: scalar applied to a while packing.
   - a, inca, lda: source panel and its two strides.
   - p, ldp: destination panel and its leading dimension. Unlike the
     3mis/4mi variants, there is no is_p argument.
   - cntx: context queried for the packm kernel and forwarded to setm.
*/
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_cxk_rih )

View File

@@ -113,52 +113,6 @@ siz_t bli_packm_init
return 0;
}
#if 0
pack_t schema;
if ( bli_cntx_method( cntx ) != BLIS_NAT )
{
// We now ignore the pack_schema field in the control tree and
// extract the schema from the context, depending on whether we are
// preparing to pack a block of A or panel of B. For A and B, we must
// obtain the schema from the context since the induced methods reuse
// the same control trees used by native execution, and those induced
// methods specify the schema used by the current execution phase
// within the context (whereas the control tree does not change).
if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK )
{
schema = bli_cntx_schema_a_block( cntx );
}
else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL )
{
schema = bli_cntx_schema_b_panel( cntx );
}
else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
{
schema = bli_cntl_packm_params_pack_schema( cntl );
}
}
else // ( bli_cntx_method( cntx ) == BLIS_NAT )
{
// For native execution, we obtain the schema from the control tree
// node. (Notice that it doesn't matter if the pack_buf_type is for
// A or B.)
schema = bli_cntl_packm_params_pack_schema( cntl );
}
// This is no longer needed now that we branch between native and
// non-native cases above.
#if 0
if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL )
{
// If we get a request to pack C for some reason, it is likely
// not part of an induced method, and so it would be safe (and
// necessary) to read the pack schema from the control tree.
schema = bli_cntl_packm_params_pack_schema( cntl );
}
#endif
#endif
// Prepare a few other variables based on properties of the control
// tree.
@@ -393,7 +347,7 @@ siz_t bli_packm_init_pack
bli_is_panel_packed( schema ) )
{
dim_t m_panel;
dim_t ps_p, ps_p_orig;
dim_t ps_p;
// The panel dimension (for each datatype) should be equal to the
// default (logical) blocksize multiple in the m dimension.
@@ -418,58 +372,17 @@ siz_t bli_packm_init_pack
// dimension of the matrix is not a whole multiple of MR.
ps_p = cs_p * n_p_pad;
// As a general rule, we don't want micropanel strides to be odd. This
// is primarily motivated by our desire to support interleaved 3m
// micropanels, in which case we have to scale the panel stride
// by 3/2. That division by 2 means the numerator (prior to being
// scaled by 3) must be even.
// As a general rule, we don't want micropanel strides to be odd.
// NOTE: This safety feature *may* not be necessary anymore, but was
// definitely needed to support certain variations of the 3m method.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Preserve this early panel stride value for use later, if needed.
ps_p_orig = ps_p;
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
// always interpreted as being in units of the datatype of the object
// which is not necessarily how the micropanels will be stored. For
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
// we halve ps_p. Why? Because the macro-kernel indexes in units of
// the complex datatype. So these changes "trick" it into indexing
// the correct amount.
if ( bli_is_3mi_packed( schema ) )
{
ps_p = ( ps_p * 3 ) / 2;
}
else if ( bli_is_3ms_packed( schema ) ||
bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) )
{
// The division by 2 below assumes that ps_p is an even number.
// However, it is possible that, at this point, ps_p is an odd.
// If it is indeed odd, we nudge it higher.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Despite the fact that the packed micropanels will contain
// real elements, the panel stride that we store in the obj_t
// (which is passed into the macro-kernel) needs to be in units
// of complex elements, since the macro-kernel will index through
// micropanels via complex pointer arithmetic for trmm/trsm.
// Since the indexing "increment" will be twice as large as each
// actual stored element, we divide the panel_stride by 2.
ps_p = ps_p / 2;
}
// Set the imaginary stride (in units of fundamental elements) for
// 3m and 4m (separated or interleaved). We use ps_p_orig since
// that variable tracks the number of real part elements contained
// within each micropanel of the source matrix. Therefore, this
// is the number of real elements that must be traversed before
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
// or the real part of the next micropanel (3ms).
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel );
else is_p = 1;
// Set the imaginary stride (in units of fundamental elements).
// This is the number of real elements that must be traversed before
// reaching the imaginary part of the packed micropanel. NOTE: the
// imaginary stride is mostly vestigial and left over from the 3m
// and 4m implementations.
is_p = 1;
// Store the strides and panel dimension in P.
bli_obj_set_strides( rs_p, cs_p, p );
@@ -486,7 +399,7 @@ siz_t bli_packm_init_pack
bli_is_panel_packed( schema ) )
{
dim_t n_panel;
dim_t ps_p, ps_p_orig;
dim_t ps_p;
// The panel dimension (for each datatype) should be equal to the
// default (logical) blocksize multiple in the n dimension.
@@ -512,58 +425,17 @@ siz_t bli_packm_init_pack
// dimension of the matrix is not a whole multiple of NR.
ps_p = m_p_pad * rs_p;
// As a general rule, we don't want micropanel strides to be odd. This
// is primarily motivated by our desire to support interleaved 3m
// micropanels, in which case we have to scale the panel stride
// by 3/2. That division by 2 means the numerator (prior to being
// scaled by 3) must be even.
// As a general rule, we don't want micropanel strides to be odd.
// NOTE: This safety feature *may* not be necessary anymore, but was
// definitely needed to support certain variations of the 3m method.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Preserve this early panel stride value for use later, if needed.
ps_p_orig = ps_p;
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
// always interpreted as being in units of the datatype of the object
// which is not necessarily how the micropanels will be stored. For
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
// we halve ps_p. Why? Because the macro-kernel indexes in units of
// the complex datatype. So these changes "trick" it into indexing
// the correct amount.
if ( bli_is_3mi_packed( schema ) )
{
ps_p = ( ps_p * 3 ) / 2;
}
else if ( bli_is_3ms_packed( schema ) ||
bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) )
{
// The division by 2 below assumes that ps_p is an even number.
// However, it is possible that, at this point, ps_p is an odd.
// If it is indeed odd, we nudge it higher.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Despite the fact that the packed micropanels will contain
// real elements, the panel stride that we store in the obj_t
// (which is passed into the macro-kernel) needs to be in units
// of complex elements, since the macro-kernel will index through
// micropanels via complex pointer arithmetic for trmm/trsm.
// Since the indexing "increment" will be twice as large as each
// actual stored element, we divide the panel_stride by 2.
ps_p = ps_p / 2;
}
// Set the imaginary stride (in units of fundamental elements) for
// 3m and 4m (separated or interleaved). We use ps_p_orig since
// that variable tracks the number of real part elements contained
// within each micropanel of the source matrix. Therefore, this
// is the number of real elements that must be traversed before
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
// or the real part of the next micropanel (3ms).
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel );
else is_p = 1;
// Set the imaginary stride (in units of fundamental elements).
// This is the number of real elements that must be traversed before
// reaching the imaginary part of the packed micropanel. NOTE: the
// imaginary stride is mostly vestigial and left over from the 3m
// and 4m implementations.
is_p = 1;
// Store the strides and panel dimension in P.
bli_obj_set_strides( rs_p, cs_p, p );

View File

@@ -1,842 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   packm_struc_cxk_3mis: packs one micro-panel of c into p, dispatching on
   the structure of c.

   The pack schema decides which of the m/n panel dimensions plays the role
   of the "panel dimension" and which the "panel length", and which strides
   of c and p act as increment / leading dimension. General panels go
   straight to the pack kernel front-end (kername); Hermitian/symmetric and
   triangular panels are handed to packm_herm_cxk_3mis and
   packm_tri_cxk_3mis respectively. For triangular panels that are edge
   cases in both dimensions, the padded bottom-right corner is finally
   given a unit diagonal in the region at p and a zero diagonal in the
   region at p + is_p (offsets in units of ctype_r).
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       doff_t          diagoffc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           m_panel, \
       dim_t           n_panel, \
       dim_t           m_panel_max, \
       dim_t           n_panel_max, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                          inc_t is_p, \
       cntx_t*         cntx \
     ) \
{ \
	/* Derive the micro-panel dimensions and relative strides from the pack
	   schema. A column-packed schema means we pack to a row-stored column
	   panel; otherwise we pack to a column-stored row panel. */ \
	const bool  to_col_panel  = bli_is_col_packed( schema ); \
\
	const dim_t panel_dim     = ( to_col_panel ? n_panel     : m_panel     ); \
	const dim_t panel_dim_max = ( to_col_panel ? n_panel_max : m_panel_max ); \
	const dim_t panel_len     = ( to_col_panel ? m_panel     : n_panel     ); \
	const dim_t panel_len_max = ( to_col_panel ? m_panel_max : n_panel_max ); \
	const inc_t incc          = ( to_col_panel ? cs_c        : rs_c        ); \
	const inc_t ldc           = ( to_col_panel ? rs_c        : cs_c        ); \
	const inc_t ldp           = ( to_col_panel ? rs_p        : cs_p        ); \
\
	/* Pack the micro-panel according to the structure of the matrix. */ \
	if ( bli_is_general( strucc ) ) \
	{ \
		/* General matrices need no special handling: call the pack kernel
		   front-end directly. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p, is_p, ldp, \
		  cntx \
		); \
	} \
	else if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		/* Hermitian/symmetric micro-panels are handled by a helper. */ \
		PASTEMAC(ch,packm_herm_cxk_3mis) \
		( \
		  strucc, \
		  diagoffc, \
		  uploc, \
		  conjc, \
		  schema, \
		  m_panel, \
		  n_panel, \
		  m_panel_max, \
		  n_panel_max, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, rs_c, cs_c, \
		     incc, ldc, \
		  p, rs_p, cs_p, \
		     is_p, ldp, \
		  cntx \
		); \
	} \
	else /* triangular */ \
	{ \
		/* Triangular micro-panels are handled by a helper. */ \
		PASTEMAC(ch,packm_tri_cxk_3mis) \
		( \
		  strucc, \
		  diagoffc, \
		  diagc, \
		  uploc, \
		  conjc, \
		  schema, \
		  invdiag, \
		  m_panel, \
		  n_panel, \
		  m_panel_max, \
		  n_panel_max, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, rs_c, cs_c, \
		     incc, ldc, \
		  p, rs_p, cs_p, \
		     is_p, ldp, \
		  cntx \
		); \
	} \
\
	/* No zero-filling of the edge region (the bottom m_panel_max - m_panel
	   rows or the right-side n_panel_max - n_panel columns) is done here:
	   that responsibility has been moved to the packm microkernel. This
	   allows experts to use custom kernels that pack to custom packing
	   formats when the problem size is not a nice multiple of the register
	   blocksize. */ \
\
	/* A triangular panel that is an edge case in both panel dimension and
	   length must be a bottom-right corner case. Set the part of the
	   diagonal that extends into the zero-padded region to identity.
	   NOTE: This is actually only necessary when packing for trsm, as it
	   helps prevent NaNs and Infs from creeping into the computation.
	   However, we set the region to identity for trmm as well. Those 1.0's
	   end up getting multiplied by the 0.0's in the zero-padded region of
	   the other matrix, so there is no harm in this. */ \
	if ( bli_is_triangular( strucc ) && \
	     m_panel != m_panel_max && \
	     n_panel != n_panel_max ) \
	{ \
		ctype_r* restrict one_r  = PASTEMAC(chr,1); \
		ctype_r* restrict zero_r = PASTEMAC(chr,0); \
		const dim_t       m_br   = m_panel_max - m_panel; \
		const dim_t       n_br   = n_panel_max - n_panel; \
		ctype_r*          p_br_r = ( ctype_r* )p +        m_panel*rs_p + n_panel*cs_p; \
		ctype_r*          p_br_i = ( ctype_r* )p + is_p + m_panel*rs_p + n_panel*cs_p; \
\
		/* Unit diagonal in the first region... */ \
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  m_br, \
		  n_br, \
		  one_r, \
		  p_br_r, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
		/* ...and zero diagonal in the region at offset is_p. */ \
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  m_br, \
		  n_br, \
		  zero_r, \
		  p_br_i, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
	} \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_3mis, packm_cxk_3mis )
#undef GENTFUNCCO
/* Template for the <ch>packm_herm_cxk_3mis functions (instantiated below via
INSERT_GENTFUNCCO_BASIC with kernel packm_cxk_3mis).

Packs one micro-panel of a Hermitian or symmetric matrix c into p.

- If the micro-panel does not intersect the diagonal, it is packed with a
single call to the packm kernel; when the panel lies in the unstored
triangle, c/incc/ldc are first redirected to the reflected (stored)
location and, for Hermitian matrices, the conjugation flag is toggled.
- If the micro-panel intersects the diagonal, it is packed in pieces: two
kernel calls fill the sub-panels p10 and p12 (one of which also covers the
diagonal block, read from its reflected location), after which the stored
triangle of the diagonal block c11 is copied over p11, scaled by kappa,
and the real+imaginary sub-panel of p11 is recomputed.

Within this function the packed panel is addressed as three real-valued
sub-panels spaced is_p elements (of ctype_r) apart: real, imaginary, and
real+imaginary.

m_panel_max and n_panel_max are accepted but not referenced here. */
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc_abs; \
dim_t i, j; \
bool row_stored; \
bool col_stored; \
\
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
{ \
/* Move c to the reflection of this panel across the diagonal and
swap the two strides so the kernel reads the transpose. */ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, is_p, ldp, \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
{ \
ctype_r* restrict p_r = ( ctype_r* )p; \
\
ctype_r* restrict one_r = PASTEMAC(chr,1); \
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
\
/* Source pointer, destination pointer, dimensions, strides and
conjugation used for the first kernel call (p10). diagoffc10 is
only assigned and used in the second branch below. */ \
ctype* restrict c10; \
ctype_r* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
/* Same quantities for the second kernel call (p12). diagoffc12 is
only assigned and used in the first branch below. */ \
ctype* restrict c12; \
ctype_r* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
/* p10 spans the first diagoffc_abs columns of the panel and is
read from c with the caller's strides; p12 spans the remainder
(starting at the diagonal block) and is read with incc/ldc
swapped, i.e. transposed. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p_r; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* j equals diagoffc_abs here, so diagoffc12 evaluates to zero and
the pointer shift applied to c12 below is a no-op. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
/* p10 spans the columns up to and including the diagonal block
and is read from the reflected location with incc/ldc swapped;
p12 spans the remainder and is read with the caller's
strides. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p_r; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc10, \
p10_dim, \
panel_dim_max, \
p10_len, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
( ctype* )p10, is_p, ldp, \
cntx \
); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc12, \
p12_dim, \
panel_dim_max, \
p12_len, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
( ctype* )p12, is_p, ldp, \
cntx \
); \
\
/* Pack the stored triangle of c11 to p11. */ \
{ \
dim_t p11_m = panel_dim; \
dim_t p11_n = panel_dim; \
/* c11 is accessed below through ctype_r pointers, so the strides
of c (expressed in units of ctype) are doubled. */ \
inc_t rs_c11 = 2*rs_c; \
inc_t cs_c11 = 2*cs_c; \
dim_t j2 = diagoffc_abs; \
ctype* c11 = ( ctype* )c + (j2 )*ldc; \
ctype_r* p11 = ( ctype_r* )p_r + (j2 )*ldp; \
/* The imaginary component is taken to sit one ctype_r element
after the real component of each ctype element. */ \
ctype_r* c11_r = ( ctype_r* )c11; \
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
ctype_r* p11_r = ( ctype_r* )p11; \
ctype_r* p11_i = ( ctype_r* )p11 + is_p; \
ctype_r* alpha_r = one_r; \
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
PASTEMAC2(chr,scal2m,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_r, \
c11_r, rs_c11, cs_c11, \
p11_r, rs_p, cs_p, \
cntx, \
NULL \
); \
\
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
scaling by -1 if conjugation on c was requested. */ \
PASTEMAC2(chr,scal2m,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_i, \
c11_i, rs_c11, cs_c11, \
p11_i, rs_p, cs_p, \
cntx, \
NULL \
); \
\
/* If source matrix c is Hermitian, we have to zero out the
imaginary components of the diagonal of p11 in case the
corresponding elements in c11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
for ( i = 0; i < p11_m; ++i ) \
{ \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
\
PASTEMAC(chr,set0s)( *pi11_i ); \
} \
} \
\
/* Apply kappa to the part of p11 that corresponds to the stored
part of c11 that was copied above. */ \
if ( bli_is_upper( uploc ) ) \
{ \
PASTEMAC(ch,scalris_mxn_u) \
( \
0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p, cs_p \
); \
} \
else \
{ \
PASTEMAC(ch,scalris_mxn_l) \
( \
0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p, cs_p \
); \
} \
\
/* Update the p11 section of the rpi panel. It simply needs
to contain the sum of p11_r + p11_i. The whole p11_m x p11_n
block is recomputed, not just the stored triangle. */ \
{ \
ctype_r* p11_rpi = p11_i + is_p; \
\
for ( j = 0; j < p11_n; ++j ) \
for ( i = 0; i < p11_m; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (j )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (j )*cs_p; \
ctype_r* pi11_rpi = p11_rpi + (i )*rs_p + (j )*cs_p; \
\
PASTEMAC(chr,add3s) \
( \
*pi11_r, \
*pi11_i, \
*pi11_rpi \
); \
} \
} \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_3mis, packm_cxk_3mis )
#undef GENTFUNCCO
/* Template for the <ch>packm_tri_cxk_3mis functions.

   Packs one micro-panel of a triangular matrix c into p with the packm
   kernel and then post-processes the packed panel in place:
     1. if the diagonal of c is implicitly unit, the packed diagonal is
        overwritten with kappa and the matching real+imaginary entries are
        refreshed;
     2. if invdiag is TRUE, the packed diagonal (real and imaginary
        sub-panels only) is inverted;
     3. the region on the opposite side of the diagonal is set to zero in all
        three sub-panels.

   The three real-valued sub-panels of p (real, imaginary, real+imaginary)
   are addressed at offsets 0, is_p and 2*is_p from p. */
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       doff_t          diagoffp, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           m_panel, \
       dim_t           n_panel, \
       dim_t           m_panel_max, \
       dim_t           n_panel_max, \
       dim_t           panel_dim, \
       dim_t           panel_dim_max, \
       dim_t           panel_len, \
       dim_t           panel_len_max, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                          inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                          inc_t is_p, inc_t ldp, \
       cntx_t*         cntx \
     ) \
{ \
	/* First pack the panel as if it were dense. */ \
	PASTEMAC(ch,kername) \
	( \
	  conjc, \
	  panel_dim, \
	  panel_dim_max, \
	  panel_len, \
	  panel_len_max, \
	  kappa, \
	  c, incc, ldc, \
	  p, is_p, ldp, \
	  cntx \
	); \
\
	/* Base addresses of the three real-valued sub-panels. */ \
	ctype_r* re_panel  = ( ctype_r* )p + 0; \
	ctype_r* im_panel  = ( ctype_r* )p + is_p; \
	ctype_r* sum_panel = ( ctype_r* )p + 2*is_p; \
\
	/* Start of the diagonal block within each sub-panel. */ \
	dim_t    diag_start = bli_abs( diagoffp ); \
	ctype_r* diag_re    = re_panel  + (diag_start  )*ldp; \
	ctype_r* diag_im    = im_panel  + (diag_start  )*ldp; \
	ctype_r* diag_sum   = sum_panel + (diag_start  )*ldp; \
\
	/* Number of diagonal elements that fall inside the panel. */ \
	dim_t diag_m = m_panel; \
	dim_t diag_n = n_panel; \
\
	if      ( diagoffp < 0 ) diag_m -= diag_start; \
	else if ( diagoffp > 0 ) diag_n -= diag_start; \
\
	dim_t diag_len = bli_min( diag_m, diag_n ); \
\
	/* An implicit unit diagonal becomes an explicit diagonal of kappa. */ \
	if ( bli_is_unit_diag( diagc ) ) \
	{ \
		ctype_r kappa_re = PASTEMAC(ch,real)( *kappa ); \
		ctype_r kappa_im = PASTEMAC(ch,imag)( *kappa ); \
\
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  diagoffp, \
		  m_panel, \
		  n_panel, \
		  &kappa_re, \
		  re_panel, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  diagoffp, \
		  m_panel, \
		  n_panel, \
		  &kappa_im, \
		  im_panel, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
\
		/* Keep the real+imaginary sub-panel consistent along the
		   diagonal that was just overwritten. */ \
		for ( dim_t d = 0; d < diag_len; ++d ) \
		{ \
			ctype_r* elem_re  = diag_re  + (d  )*rs_p + (d  )*cs_p; \
			ctype_r* elem_im  = diag_im  + (d  )*rs_p + (d  )*cs_p; \
			ctype_r* elem_sum = diag_sum + (d  )*rs_p + (d  )*cs_p; \
\
			PASTEMAC(chr,add3s)( *elem_re, *elem_im, *elem_sum ); \
		} \
	} \
\
	/* Invert the packed diagonal on request. The real+imaginary sub-panel
	   is deliberately left alone: inverted diagonals are only needed by
	   trsm, which does not use that section of the panel. */ \
	if ( invdiag == TRUE ) \
	{ \
		for ( dim_t d = 0; d < diag_len; ++d ) \
		{ \
			ctype_r* elem_re = diag_re + (d  )*rs_p + (d  )*cs_p; \
			ctype_r* elem_im = diag_im + (d  )*rs_p + (d  )*cs_p; \
\
			PASTEMAC(ch,invertris)( *elem_re, *elem_im ); \
		} \
	} \
\
	/* Zero the region opposite the stored triangle. That region is
	   described by the toggled uplo with the diagonal offset shifted so
	   that it shrinks by one diagonal. trsm never reads it, but trmm uses
	   the gemm micro-kernel and therefore needs the zeros. */ \
	{ \
		ctype_r* restrict zero_r       = PASTEMAC(chr,0); \
		ctype_r*          sub_panel[3] = { re_panel, im_panel, sum_panel }; \
		uplo_t            opposite     = uploc; \
		doff_t            shrunk_off   = diagoffp; \
\
		bli_toggle_uplo( &opposite ); \
		bli_shift_diag_offset_to_shrink_uplo( opposite, &shrunk_off ); \
\
		for ( dim_t s = 0; s < 3; ++s ) \
		{ \
			PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
			( \
			  BLIS_NO_CONJUGATE, \
			  shrunk_off, \
			  BLIS_NONUNIT_DIAG, \
			  opposite, \
			  m_panel, \
			  n_panel, \
			  zero_r, \
			  sub_panel[s], rs_p, cs_p, \
			  cntx, \
			  NULL \
			); \
		} \
	} \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_3mis, packm_cxk_3mis )

View File

@@ -1,121 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef GENTPROTCO
/* Prototype template for the <ch>packm_struc_cxk_3mis functions, one per
complex datatype generated by INSERT_GENTPROTCO_BASIC0 below. The function
takes the structure/diagonal/uplo/conjugation properties of the source
matrix c, the pack schema, the actual and maximum micro-panel dimensions,
the scalar kappa, and the source (c) and destination (p) buffers with
their strides; is_p is an additional stride of p. */
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_3mis )
#undef GENTPROTCO
/* Prototype template for the <ch>packm_herm_cxk_3mis helper functions, one
per complex datatype generated by INSERT_GENTPROTCO_BASIC0 below. In
addition to the m/n panel dimensions it receives the panel dimension and
length (actual and maximum) together with the matching strides of c
(incc, ldc) and of p (ldp). */
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_3mis )
#undef GENTPROTCO
/* Prototype template for the <ch>packm_tri_cxk_3mis helper functions, one
   per complex datatype generated by INSERT_GENTPROTCO_BASIC0 below.

   The diagonal offset parameter is named diagoffp to agree with the
   function definition, which applies it to the packed panel p. */
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       doff_t          diagoffp, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           m_panel, \
       dim_t           n_panel, \
       dim_t           m_panel_max, \
       dim_t           n_panel_max, \
       dim_t           panel_dim, \
       dim_t           panel_dim_max, \
       dim_t           panel_len, \
       dim_t           panel_len_max, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                          inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                          inc_t is_p, inc_t ldp, \
       cntx_t*         cntx \
     );
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_3mis )

View File

@@ -1,757 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNCCO
/* Template for the <ch>packm_struc_cxk_4mi functions.

   Front-end that packs one micro-panel of c into p. It derives the panel
   dimension/length and the matching strides from the pack schema, then
   dispatches on the structure of c: general panels go straight to the packm
   kernel, Hermitian/symmetric panels to packm_herm_cxk_4mi, and everything
   else to packm_tri_cxk_4mi. For triangular matrices, a panel that is short
   in both dimensions additionally gets an identity written into its
   bottom-right padding. */
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       doff_t          diagoffc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           m_panel, \
       dim_t           n_panel, \
       dim_t           m_panel_max, \
       dim_t           n_panel_max, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                          inc_t is_p, \
       cntx_t*         cntx \
     ) \
{ \
	/* Column-packed schemas produce row-stored column panels (the n
	   dimension is the short one); otherwise we produce column-stored row
	   panels (the m dimension is the short one). */ \
	const bool  col_packed    = bli_is_col_packed( schema ); \
\
	const dim_t panel_dim     = ( col_packed ? n_panel     : m_panel     ); \
	const dim_t panel_dim_max = ( col_packed ? n_panel_max : m_panel_max ); \
	const dim_t panel_len     = ( col_packed ? m_panel     : n_panel     ); \
	const dim_t panel_len_max = ( col_packed ? m_panel_max : n_panel_max ); \
	const inc_t incc          = ( col_packed ? cs_c        : rs_c        ); \
	const inc_t ldc           = ( col_packed ? rs_c        : cs_c        ); \
	const inc_t ldp           = ( col_packed ? rs_p        : cs_p        ); \
\
	/* Dispatch on the structure of the matrix being packed. */ \
	if ( bli_is_general( strucc ) ) \
	{ \
		/* General panels need no special treatment: call the pack kernel
		   front-end directly. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p, is_p, ldp, \
		  cntx \
		); \
	} \
	else if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		/* Hermitian/symmetric panels are handled by a helper. */ \
		PASTEMAC(ch,packm_herm_cxk_4mi) \
		( \
		  strucc, \
		  diagoffc, \
		  uploc, \
		  conjc, \
		  schema, \
		  m_panel, \
		  n_panel, \
		  m_panel_max, \
		  n_panel_max, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, rs_c, cs_c, \
		     incc, ldc, \
		  p, rs_p, cs_p, \
		     is_p, ldp, \
		  cntx \
		); \
	} \
	else /* ( bli_is_triangular( strucc ) ) */ \
	{ \
		/* Triangular panels are handled by a helper. */ \
		PASTEMAC(ch,packm_tri_cxk_4mi) \
		( \
		  strucc, \
		  diagoffc, \
		  diagc, \
		  uploc, \
		  conjc, \
		  schema, \
		  invdiag, \
		  m_panel, \
		  n_panel, \
		  m_panel_max, \
		  n_panel_max, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, rs_c, cs_c, \
		     incc, ldc, \
		  p, rs_p, cs_p, \
		     is_p, ldp, \
		  cntx \
		); \
	} \
\
	/* NOTE: If m_panel < m_panel_max, or n_panel < n_panel_max, the edge
	   region (the bottom m_panel_max - m_panel rows or right-side
	   n_panel_max - n_panel columns) of the micropanel would normally be
	   filled with zeros here. That responsibility has been moved to the
	   packm microkernel, which allows experts to use custom kernels that
	   pack to custom packing formats when the problem size is not a nice
	   multiple of the register blocksize. */ \
\
	/* A triangular panel that is an edge case in both dimensions must be a
	   bottom-right corner case. Set the part of the diagonal that extends
	   into the zero-padded region to identity.
	   NOTE: This is actually only necessary when packing for trsm, as it
	   helps prevent NaNs and Infs from creeping into the computation. It is
	   done for trmm as well; those 1.0's end up getting multiplied by the
	   0.0's in the zero-padded region of the other matrix, so there is no
	   harm in this. */ \
	if ( bli_is_triangular( strucc ) && \
	     m_panel != m_panel_max && \
	     n_panel != n_panel_max ) \
	{ \
		ctype_r* restrict one_r     = PASTEMAC(chr,1); \
		ctype_r* restrict zero_r    = PASTEMAC(chr,0); \
		const dim_t       m_corner  = m_panel_max - m_panel; \
		const dim_t       n_corner  = n_panel_max - n_panel; \
		ctype_r*          corner_re = ( ctype_r* )p        + (m_panel  )*rs_p + (n_panel  )*cs_p; \
		ctype_r*          corner_im = ( ctype_r* )p + is_p + (m_panel  )*rs_p + (n_panel  )*cs_p; \
\
		/* Real part of the corner diagonal becomes 1 ... */ \
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  m_corner, \
		  n_corner, \
		  one_r, \
		  corner_re, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
		/* ... and the imaginary part becomes 0. */ \
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  m_corner, \
		  n_corner, \
		  zero_r, \
		  corner_im, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
	} \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_4mi, packm_cxk_4mi )
#undef GENTFUNCCO
/* Template for the <ch>packm_herm_cxk_4mi functions (instantiated below via
INSERT_GENTFUNCCO_BASIC with kernel packm_cxk_4mi).

Packs one micro-panel of a Hermitian or symmetric matrix c into p.

- If the micro-panel does not intersect the diagonal, it is packed with a
single call to the packm kernel; when the panel lies in the unstored
triangle, c/incc/ldc are first redirected to the reflected (stored)
location and, for Hermitian matrices, the conjugation flag is toggled.
- If the micro-panel intersects the diagonal, it is packed in pieces: two
kernel calls fill the sub-panels p10 and p12 (one of which also covers the
diagonal block, read from its reflected location), after which the stored
triangle of the diagonal block c11 is copied over p11 and scaled by kappa.

Within this function the packed panel is addressed as two real-valued
sub-panels spaced is_p elements (of ctype_r) apart: real and imaginary.

m_panel_max and n_panel_max are accepted but not referenced here. */
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc_abs; \
dim_t i, j; \
bool row_stored; \
bool col_stored; \
\
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
{ \
/* Move c to the reflection of this panel across the diagonal and
swap the two strides so the kernel reads the transpose. */ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, is_p, ldp, \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
{ \
ctype_r* restrict p_r = ( ctype_r* )p; \
\
ctype_r* restrict one_r = PASTEMAC(chr,1); \
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
\
/* Source pointer, destination pointer, dimensions, strides and
conjugation used for the first kernel call (p10). diagoffc10 is
only assigned and used in the second branch below. */ \
ctype* restrict c10; \
ctype_r* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
/* Same quantities for the second kernel call (p12). diagoffc12 is
only assigned and used in the first branch below. */ \
ctype* restrict c12; \
ctype_r* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
/* p10 spans the first diagoffc_abs columns of the panel and is
read from c with the caller's strides; p12 spans the remainder
(starting at the diagonal block) and is read with incc/ldc
swapped, i.e. transposed. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p_r; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* j equals diagoffc_abs here, so diagoffc12 evaluates to zero and
the pointer shift applied to c12 below is a no-op. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
/* p10 spans the columns up to and including the diagonal block
and is read from the reflected location with incc/ldc swapped;
p12 spans the remainder and is read with the caller's
strides. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p_r; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc10, \
p10_dim, \
panel_dim_max, \
p10_len, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
( ctype* )p10, is_p, ldp, \
cntx \
); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc12, \
p12_dim, \
panel_dim_max, \
p12_len, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
( ctype* )p12, is_p, ldp, \
cntx \
); \
\
/* Pack the stored triangle of c11 to p11. */ \
{ \
dim_t p11_m = panel_dim; \
dim_t p11_n = panel_dim; \
/* c11 is accessed below through ctype_r pointers, so the strides
of c (expressed in units of ctype) are doubled. */ \
inc_t rs_c11 = 2*rs_c; \
inc_t cs_c11 = 2*cs_c; \
dim_t j2 = diagoffc_abs; \
ctype* c11 = ( ctype* )c + (j2 )*ldc; \
ctype_r* p11 = ( ctype_r* )p_r + (j2 )*ldp; \
/* The imaginary component is taken to sit one ctype_r element
after the real component of each ctype element. */ \
ctype_r* c11_r = ( ctype_r* )c11; \
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
ctype_r* p11_r = ( ctype_r* )p11; \
ctype_r* p11_i = ( ctype_r* )p11 + is_p; \
ctype_r* alpha_r = one_r; \
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
PASTEMAC2(chr,scal2m,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_r, \
c11_r, rs_c11, cs_c11, \
p11_r, rs_p, cs_p, \
cntx, \
NULL \
); \
\
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
scaling by -1 if conjugation on c was requested. */ \
PASTEMAC2(chr,scal2m,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
BLIS_NO_TRANSPOSE, \
p11_m, \
p11_n, \
alpha_i, \
c11_i, rs_c11, cs_c11, \
p11_i, rs_p, cs_p, \
cntx, \
NULL \
); \
\
/* If source matrix c is Hermitian, we have to zero out the
imaginary components of the diagonal of p11 in case the
corresponding elements in c11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
for ( i = 0; i < p11_m; ++i ) \
{ \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
\
PASTEMAC(chr,set0s)( *pi11_i ); \
} \
} \
\
/* Apply kappa to the part of p11 that corresponds to the stored
part of c11 that was copied above. */ \
if ( bli_is_upper( uploc ) ) \
{ \
PASTEMAC(ch,scalris_mxn_u) \
( \
0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p, cs_p \
); \
} \
else \
{ \
PASTEMAC(ch,scalris_mxn_l) \
( \
0, \
p11_m, \
p11_n, \
&kappa_r, \
&kappa_i, \
p11_r, \
p11_i, rs_p, cs_p \
); \
} \
/* Debugging aid (disabled): prints the real and imaginary sub-panels
after the diagonal block has been copied.
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \
p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \
p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \
*/ \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_4mi, packm_cxk_4mi )
#undef GENTFUNCCO
/* Template for the <ch>packm_tri_cxk_4mi functions.

   Packs one micro-panel of a triangular matrix c into p with the packm
   kernel and then post-processes the packed panel in place:
     1. if the diagonal of c is implicitly unit, the packed diagonal is
        overwritten with kappa;
     2. if invdiag is TRUE, the first panel_dim packed diagonal elements are
        inverted;
     3. the region on the opposite side of the diagonal is set to zero.

   The real and imaginary sub-panels of p are addressed at offsets 0 and
   is_p from p. */
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       doff_t          diagoffp, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           m_panel, \
       dim_t           n_panel, \
       dim_t           m_panel_max, \
       dim_t           n_panel_max, \
       dim_t           panel_dim, \
       dim_t           panel_dim_max, \
       dim_t           panel_len, \
       dim_t           panel_len_max, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
                          inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
                          inc_t is_p, inc_t ldp, \
       cntx_t*         cntx \
     ) \
{ \
	/* First pack the panel as if it were dense. */ \
	PASTEMAC(ch,kername) \
	( \
	  conjc, \
	  panel_dim, \
	  panel_dim_max, \
	  panel_len, \
	  panel_len_max, \
	  kappa, \
	  c, incc, ldc, \
	  p, is_p, ldp, \
	  cntx \
	); \
\
	/* Base addresses of the real and imaginary sub-panels. */ \
	ctype_r* re_panel = ( ctype_r* )p; \
	ctype_r* im_panel = ( ctype_r* )p + is_p; \
\
	/* Start of the diagonal block within each sub-panel. */ \
	dim_t    diag_start = bli_abs( diagoffp ); \
	ctype_r* diag_re    = re_panel + (diag_start  )*ldp; \
	ctype_r* diag_im    = im_panel + (diag_start  )*ldp; \
\
	/* An implicit unit diagonal becomes an explicit diagonal of kappa. */ \
	if ( bli_is_unit_diag( diagc ) ) \
	{ \
		ctype_r kappa_re = PASTEMAC(ch,real)( *kappa ); \
		ctype_r kappa_im = PASTEMAC(ch,imag)( *kappa ); \
\
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  diagoffp, \
		  m_panel, \
		  n_panel, \
		  &kappa_re, \
		  re_panel, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
		PASTEMAC2(chr,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  diagoffp, \
		  m_panel, \
		  n_panel, \
		  &kappa_im, \
		  im_panel, rs_p, cs_p, \
		  cntx, \
		  NULL \
		); \
	} \
\
	/* Invert the packed diagonal on request. */ \
	if ( invdiag == TRUE ) \
	{ \
		for ( dim_t d = 0; d < panel_dim; ++d ) \
		{ \
			ctype_r* elem_re = diag_re + (d  )*rs_p + (d  )*cs_p; \
			ctype_r* elem_im = diag_im + (d  )*rs_p + (d  )*cs_p; \
\
			PASTEMAC(ch,invertris)( *elem_re, *elem_im ); \
		} \
	} \
\
	/* Zero the region opposite the stored triangle. That region is
	   described by the toggled uplo with the diagonal offset shifted so
	   that it shrinks by one diagonal. trsm never reads it, but trmm uses
	   the gemm micro-kernel and therefore needs the zeros. */ \
	{ \
		ctype_r* restrict zero_r       = PASTEMAC(chr,0); \
		ctype_r*          sub_panel[2] = { re_panel, im_panel }; \
		uplo_t            opposite     = uploc; \
		doff_t            shrunk_off   = diagoffp; \
\
		bli_toggle_uplo( &opposite ); \
		bli_shift_diag_offset_to_shrink_uplo( opposite, &shrunk_off ); \
\
		for ( dim_t s = 0; s < 2; ++s ) \
		{ \
			PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
			( \
			  BLIS_NO_CONJUGATE, \
			  shrunk_off, \
			  BLIS_NONUNIT_DIAG, \
			  opposite, \
			  m_panel, \
			  n_panel, \
			  zero_r, \
			  sub_panel[s], rs_p, cs_p, \
			  cntx, \
			  NULL \
			); \
		} \
	} \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_4mi, packm_cxk_4mi )

View File

@@ -1,121 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototype template for packm_struc_cxk_4mi.
   GENTPROTCO is (re)defined to declare PASTEMAC(ch,varname) with the
   parameter list below and is then expanded by INSERT_GENTPROTCO_BASIC0
   (presumably once per complex datatype -- confirm against that macro's
   definition). ctype is the element type of kappa, c and p; ctype_r and chr
   are accepted by the template but not used in the declaration itself. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_4mi )
/* Prototype template for packm_herm_cxk_4mi.
   Compared with the packm_struc_cxk_4mi prototype, this declaration has no
   diagc/invdiag parameters and additionally receives the already-derived
   panel_dim/panel_len values (and their _max counterparts) together with
   the incc/ldc and ldp strides. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_4mi )
/* Prototype template for packm_tri_cxk_4mi.
   Same parameter list as the packm_herm_cxk_4mi prototype, plus the diagc
   and invdiag parameters that describe how the diagonal of the triangular
   matrix is to be treated. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_4mi )

View File

@@ -1,625 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* packm_struc_cxk_rih: structure-aware front-end for packing one micro-panel
   of c into p.

   The pack schema selects which of (m_panel, n_panel) is the short panel
   dimension and which is the panel length, and which strides of c and p
   step along each of them. The micro-panel is then handed to:
     - the kername pack kernel, when c is a general matrix;
     - packm_herm_cxk_rih, when c is Hermitian or symmetric;
     - packm_tri_cxk_rih, otherwise (c is treated as triangular).

   Nothing further is done after the dispatch:
     - Zero-filling of the edge region (m_panel < m_panel_max or
       n_panel < n_panel_max) is the responsibility of the packm
       micro-kernel, which lets experts plug in kernels with custom
       packing formats.
     - Setting the diagonal inside the zero-padded bottom-right corner of a
       triangular panel to identity is only needed when packing for trsm,
       which this variant does not support.

   The is_p parameter is accepted but not used. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
	( \
	  struc_t strucc, \
	  doff_t diagoffc, \
	  diag_t diagc, \
	  uplo_t uploc, \
	  conj_t conjc, \
	  pack_t schema, \
	  bool invdiag, \
	  dim_t m_panel, \
	  dim_t n_panel, \
	  dim_t m_panel_max, \
	  dim_t n_panel_max, \
	  ctype* restrict kappa, \
	  ctype* restrict c, inc_t rs_c, inc_t cs_c, \
	  ctype* restrict p, inc_t rs_p, inc_t cs_p, \
	  inc_t is_p, \
	  cntx_t* cntx \
	) \
{ \
	/* A column-packed schema means we pack to a row-stored column panel; \
	   any other schema is handled as a column-stored row panel. */ \
	const bool col_packed = bli_is_col_packed( schema ); \
	\
	const dim_t panel_dim     = ( col_packed ? n_panel     : m_panel     ); \
	const dim_t panel_dim_max = ( col_packed ? n_panel_max : m_panel_max ); \
	const dim_t panel_len     = ( col_packed ? m_panel     : n_panel     ); \
	const dim_t panel_len_max = ( col_packed ? m_panel_max : n_panel_max ); \
	\
	const inc_t incc = ( col_packed ? cs_c : rs_c ); \
	const inc_t ldc  = ( col_packed ? rs_c : cs_c ); \
	const inc_t ldp  = ( col_packed ? rs_p : cs_p ); \
	\
	/* General matrices: invoke the pack kernel front-end directly. */ \
	if ( bli_is_general( strucc ) ) \
	{ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, \
		  schema, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p, ldp, \
		  cntx \
		); \
		return; \
	} \
	\
	/* Hermitian/symmetric matrices: defer to the dedicated helper. */ \
	if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		PASTEMAC(ch,packm_herm_cxk_rih) \
		( \
		  strucc, \
		  diagoffc, \
		  uploc, \
		  conjc, \
		  schema, \
		  m_panel, \
		  n_panel, \
		  m_panel_max, \
		  n_panel_max, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, rs_c, cs_c, \
		  incc, ldc, \
		  p, rs_p, cs_p, \
		  ldp, \
		  cntx \
		); \
		return; \
	} \
	\
	/* Everything else is packed as a triangular micro-panel. */ \
	PASTEMAC(ch,packm_tri_cxk_rih) \
	( \
	  strucc, \
	  diagoffc, \
	  diagc, \
	  uploc, \
	  conjc, \
	  schema, \
	  invdiag, \
	  m_panel, \
	  n_panel, \
	  m_panel_max, \
	  n_panel_max, \
	  panel_dim, \
	  panel_dim_max, \
	  panel_len, \
	  panel_len_max, \
	  kappa, \
	  c, rs_c, cs_c, \
	  incc, ldc, \
	  p, rs_p, cs_p, \
	  ldp, \
	  cntx \
	); \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_rih, packm_cxk_rih )
/* packm_herm_cxk_rih: packs one micro-panel of a Hermitian or symmetric
   matrix c into p using the kername pack kernel.

   - If the micro-panel does not intersect the diagonal, the whole panel is
     packed with a single kernel call. When the panel lies in the unstored
     region, c is first redirected to the mirrored (stored) location, incc
     and ldc are swapped, and, for Hermitian structure, conjc is toggled.

   - If the micro-panel intersects the diagonal, it is packed as two partial
     panels: p10 (the first p10_len positions along the panel length) and
     p12 (the remainder). Depending on uploc and the storage of the panel,
     one of the two is read from the mirrored location with swapped strides
     (and toggled conjugation for Hermitian structure). Afterwards the stored
     triangle of the diagonal block c11 is packed into p11 with
     scal2rihs_mxn_uplo and, for Hermitian structure, the diagonal of p11 is
     rewritten with scal2rihs_mxn_diag.

   m_panel_max, n_panel_max and panel_len_max are accepted but not used in
   this function body. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
) \
{ \
bool row_stored; \
bool col_stored; \
doff_t diagoffc_abs; \
dim_t j; \
\
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
{ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
{ \
ctype_r* restrict p_r = ( ctype_r* )p; \
\
ctype* restrict c10; \
ctype_r* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype_r* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be a whole multiple of the register blocksizes
was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
/* p10 is read from c as-is; p12 is read from the mirrored
location with incc/ldc swapped. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p_r; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* Since j == p10_len == diagoffc_abs, diagoffc12 evaluates to
zero here, so the diagonal-offset adjustment applied to c12
below does not move the pointer. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
/* p10 (which here extends through the diagonal block) is read
from the mirrored location with incc/ldc swapped; p12 is read
from c as-is. */ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p_r; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p_r + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc10, \
schema, \
p10_dim, \
panel_dim_max, \
p10_len, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
( ctype* )p10, ldp, \
cntx \
); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc12, \
schema, \
p12_dim, \
panel_dim_max, \
p12_len, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
( ctype* )p12, ldp, \
cntx \
); \
\
/* Pack the stored triangle of c11 to p11. */ \
{ \
dim_t j2 = diagoffc_abs; \
/*ctype_r* restrict p_r = ( ctype_r* )p;*/ \
ctype* restrict c11 = c + (j2 )*ldc; \
ctype_r* restrict p11_r = p_r + (j2 )*ldp; \
\
PASTEMAC(ch,scal2rihs_mxn_uplo) \
( \
schema, \
uploc, \
conjc, \
panel_dim, \
kappa, \
c11, rs_c, cs_c, \
p11_r, rs_p, cs_p \
); \
\
/* If we are packing a micro-panel with Hermitian structure,
we must take special care of the diagonal. Now, if kappa
were guaranteed to be unit, all we would need to do is
explicitly zero out the imaginary part of the diagonal of
p11, in case the diagonal of the source matrix contained
garbage (non-zero) imaginary values. HOWEVER, since kappa
can be non-unit, things become a little more complicated.
In general, we must re-apply the kappa scalar to ONLY the
real part of the diagonal of the source matrix and save
the result to the diagonal of p11. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
PASTEMAC3(ch,chr,ch,scal2rihs_mxn_diag) \
( \
schema, \
panel_dim, \
panel_dim, \
kappa, \
c11, rs_c, cs_c, \
p11_r, rs_p, cs_p \
); \
} \
\
/*
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \
p_r + 0*is_p, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \
p_r + 1*is_p, rs_p, cs_p, "%4.1f", "" ); \
*/ \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_rih, packm_cxk_rih )
/* packm_tri_cxk_rih: packs one micro-panel of a triangular matrix c into p.

   The panel is first packed in full by the kername pack kernel. Afterwards:
     - if c has an implicit unit diagonal, the diagonal of the packed panel
       is explicitly set to kappa;
     - the region on the opposite (unstored) side of the diagonal of p is
       set to zero.

   invdiag is accepted but ignored: inverting the packed diagonal is only
   needed when trsm is supported, which is not the case for this variant.
   strucc, m_panel_max and n_panel_max are likewise not used here. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
	( \
	  struc_t strucc, \
	  doff_t diagoffp, \
	  diag_t diagc, \
	  uplo_t uploc, \
	  conj_t conjc, \
	  pack_t schema, \
	  bool invdiag, \
	  dim_t m_panel, \
	  dim_t n_panel, \
	  dim_t m_panel_max, \
	  dim_t n_panel_max, \
	  dim_t panel_dim, \
	  dim_t panel_dim_max, \
	  dim_t panel_len, \
	  dim_t panel_len_max, \
	  ctype* restrict kappa, \
	  ctype* restrict c, inc_t rs_c, inc_t cs_c, \
	  inc_t incc, inc_t ldc, \
	  ctype* restrict p, inc_t rs_p, inc_t cs_p, \
	  inc_t ldp, \
	  cntx_t* cntx \
	) \
{ \
	/* Step 1: pack the entire micro-panel, ignoring its structure. */ \
	PASTEMAC(ch,kername) \
	( \
	  conjc, \
	  schema, \
	  panel_dim, \
	  panel_dim_max, \
	  panel_len, \
	  panel_len_max, \
	  kappa, \
	  c, incc, ldc, \
	  p, ldp, \
	  cntx \
	); \
	\
	/* Real-typed view of the packed panel and of the block of p where the \
	   diagonal begins. The position must be taken from diagoffp now, \
	   because diagoffp is modified further below. */ \
	ctype_r* panel_r = ( ctype_r* )p; \
	dim_t diag_pos = bli_abs( diagoffp ); \
	ctype_r* diag_blk_r = panel_r + (diag_pos )*ldp; \
	\
	/* Step 2: an implicitly unit diagonal is made explicit by writing \
	   kappa onto the diagonal of the packed panel. */ \
	if ( bli_is_unit_diag( diagc ) ) \
	{ \
		PASTEMAC(ch,setrihs_mxn_diag) \
		( \
		  schema, \
		  panel_dim, \
		  panel_dim, \
		  kappa, \
		  diag_blk_r, rs_p, cs_p \
		); \
	} \
	\
	/* Step 3: zero the region opposite the diagonal of p. That region is \
	   addressed by toggling uploc and then shifting the diagonal offset \
	   so that the newly referenced region shrinks by one diagonal (and \
	   therefore excludes the diagonal itself). */ \
	ctype_r* restrict zeros_r = PASTEMAC(chr,0); \
	uplo_t opposite_uplo = uploc; \
	\
	bli_toggle_uplo( &opposite_uplo ); \
	bli_shift_diag_offset_to_shrink_uplo( opposite_uplo, &diagoffp ); \
	\
	PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \
	( \
	  BLIS_NO_CONJUGATE, \
	  diagoffp, \
	  BLIS_NONUNIT_DIAG, \
	  opposite_uplo, \
	  m_panel, \
	  n_panel, \
	  zeros_r, \
	  panel_r, rs_p, cs_p, \
	  cntx, \
	  NULL \
	); \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_rih, packm_cxk_rih )

View File

@@ -1,121 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototype template for packm_struc_cxk_rih, the structure-aware
   micro-panel packing front-end.
   GENTPROTCO is (re)defined to declare PASTEMAC(ch,varname) with the
   parameter list below and is then expanded by INSERT_GENTPROTCO_BASIC0.
   Parameter names mirror the function definition (the diagonal offset is
   named diagoffc there), so the declaration and definition read the same. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_rih )
/* Prototype template for packm_herm_cxk_rih, the helper that packs
   micro-panels of Hermitian/symmetric matrices.
   In addition to the m/n panel sizes it receives the already-derived
   panel_dim/panel_len values (and their _max counterparts) and the
   incc/ldc and ldp strides. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_rih )
/* Prototype template for packm_tri_cxk_rih, the helper that packs
   micro-panels of triangular matrices.
   Parameter names mirror the function definition: the diagonal offset is
   named diagoffp there, because the function uses it (and adjusts it) as
   the diagonal offset of the packed panel p when zeroing the region
   opposite the diagonal. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_rih )

View File

@@ -46,24 +46,15 @@
#include "bli_l3_direct.h"
#include "bli_l3_prune.h"
#include "bli_l3_packm.h"
#include "bli_l3_schema.h"
// Prototype object APIs (expert and non-expert).
#include "bli_oapi_ex.h"
// Prototype object APIs (basic and expert).
#include "bli_l3_oapi.h"
#include "bli_xapi_undef.h"
#include "bli_l3_oapi_ex.h"
#include "bli_oapi_ba.h"
#include "bli_l3_oapi.h"
#include "bli_xapi_undef.h"
// Prototype typed APIs (expert and non-expert).
#include "bli_tapi_ex.h"
// Prototype typed APIs (basic and expert).
#include "bli_l3_tapi.h"
#include "bli_xapi_undef.h"
#include "bli_tapi_ba.h"
#include "bli_l3_tapi.h"
#include "bli_xapi_undef.h"
#include "bli_l3_tapi_ex.h"
// Define function types for small/unpacked handlers/kernels.
#include "bli_l3_sup_oft.h"

View File

@@ -99,7 +99,7 @@ void bli_hemm_check
{
err_t e_val;
// Perform checks common to hemm/symm.
// Perform checks common to hemm/symm/trmm/trsm.
bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );
@@ -249,7 +249,7 @@ void bli_syr2k_check
bli_check_error_code( e_val );
}
void bli_trmm_check
void bli_trmm3_check
(
side_t side,
obj_t* alpha,
@@ -262,7 +262,7 @@ void bli_trmm_check
{
err_t e_val;
// Perform checks common to hemm/symm.
// Perform checks common to hemm/symm/trmm/trsm.
bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );
@@ -272,22 +272,41 @@ void bli_trmm_check
bli_check_error_code( e_val );
}
// Validates the operands of trmm. This signature takes no separate beta or
// c operand, so the shared hemm-style check is invoked with &BLIS_ZERO in the
// beta position and b in the c position; a must additionally be a triangular
// object. Any failing check is reported through bli_check_error_code().
void bli_trmm_check
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
cntx_t* cntx
)
{
err_t e_val;
// Perform checks common to hemm/symm/trmm/trsm.
bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
// Check object structure.
e_val = bli_check_triangular_object( a );
bli_check_error_code( e_val );
}
void bli_trsm_check
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
)
{
err_t e_val;
// Perform checks common to hemm/symm.
// Perform checks common to hemm/symm/trmm/trsm.
bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );
bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
// Check object structure.

View File

@@ -72,8 +72,7 @@ void PASTEMAC(opname,_check) \
GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm )
GENPROT( trsm )
GENPROT( trmm3 )
#undef GENPROT
@@ -92,6 +91,22 @@ GENPROT( herk )
GENPROT( syrk )
// Prototype template for the _check functions of operations that take only
// side, alpha, a, b and cntx (no beta or c operand). Instantiated below for
// trmm and trsm.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
cntx_t* cntx \
);
GENPROT( trmm )
GENPROT( trsm )
// -----------------------------------------------------------------------------
void bli_gemm_basic_check

View File

@@ -35,23 +35,13 @@
#include "blis.h"
static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
// This array tracks whether a particular operation is implemented for each of
// the induced methods.
static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
{
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm gemmt*/
/* 3mh */ { bli_gemm3mh, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh,
bli_syrk3mh, bli_syr2k3mh, bli_trmm33mh, NULL, NULL , NULL },
/* 3m1 */ { bli_gemm3m1, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1,
bli_syrk3m1, bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 , NULL },
/* 4mh */ { bli_gemm4mh, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh,
bli_syrk4mh, bli_syr2k4mh, bli_trmm34mh, NULL, NULL , NULL },
/* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL , NULL },
/* 4m1 */ { bli_gemm4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1,
bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 , NULL },
/* 1m */ { bli_gemm1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m,
bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m , NULL },
/* nat */ { bli_gemmnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat,
bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat , bli_gemmtnat },
/* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */
/* 1m */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE },
/* nat */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE }
};
//
@@ -64,21 +54,11 @@ static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
static BLIS_THREAD_LOCAL
bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
{
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */
/* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */
/* c z */
/* 3mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 3m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 4mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 4mb */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE},
/* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE},
{TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} },
};
@@ -87,16 +67,14 @@ bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
#undef GENFUNC
#define GENFUNC( opname, optype ) \
\
void_fp PASTEMAC(opname,ind_get_avail)( num_t dt ) \
ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ) \
{ \
return bli_ind_oper_get_avail( optype, dt ); \
return bli_l3_ind_oper_find_avail( optype, dt ); \
}
/*
bool PASTEMAC(opname,ind_has_avail)( num_t dt )
{
return bli_ind_oper_has_avail( optype, dt );
}
*/
//bool PASTEMAC(opname,ind_has_avail)( num_t dt )
//{
// return bli_ind_oper_has_avail( optype, dt );
//}
GENFUNC( gemm, BLIS_GEMM )
GENFUNC( gemmt, BLIS_GEMMT )
@@ -115,16 +93,16 @@ GENFUNC( trsm, BLIS_TRSM )
#if 0
bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt )
{
void_fp func;
bool stat;
bool enabled;
bool stat;
// If the datatype is real, it is never available.
if ( !bli_is_complex( dt ) ) return FALSE;
func = bli_l3_ind_oper_get_func( oper, method );
stat = bli_l3_ind_oper_get_enable( oper, method, dt );
enabled = bli_l3_ind_oper_is_impl( oper, method );
stat = bli_l3_ind_oper_get_enable( oper, method, dt );
return ( func != NULL && stat == TRUE );
return ( enabled == TRUE && stat == TRUE );
}
#endif
@@ -147,11 +125,11 @@ ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt )
// current operation and datatype.
for ( im = 0; im < BLIS_NUM_IND_METHODS; ++im )
{
void_fp func = bli_l3_ind_oper_get_func( oper, im );
bool stat = bli_l3_ind_oper_get_enable( oper, im, dt );
bool enabled = bli_l3_ind_oper_is_impl( oper, im );
bool stat = bli_l3_ind_oper_get_enable( oper, im, dt );
if ( func != NULL &&
stat == TRUE ) return im;
if ( enabled == TRUE &&
stat == TRUE ) return im;
}
// This return statement should never execute since the native index
@@ -257,8 +235,7 @@ bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt )
// -----------------------------------------------------------------------------
void_fp bli_l3_ind_oper_get_func( opid_t oper, ind_t method )
bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method )
{
return bli_l3_ind_oper_fp[ method ][ oper ];
return bli_l3_ind_oper_impl[ method ][ oper ];
}

View File

@@ -41,7 +41,7 @@
#undef GENPROT
#define GENPROT( opname ) \
\
void_fp PASTEMAC(opname,ind_get_avail)( num_t dt );
ind_t PASTEMAC(opname,ind_find_avail)( num_t dt );
/*bool PASTEMAC(opname,ind_has_avail)( num_t dt ); */
GENPROT( gemm )
@@ -70,7 +70,7 @@ void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status );
void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status );
bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt );
void_fp bli_l3_ind_oper_get_func( opid_t oper, ind_t method );
bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method );
#endif

View File

@@ -53,11 +53,6 @@ void PASTEMAC(ch,opname) \
cntx_t* restrict cntx \
);
INSERT_GENTPROT_BASIC0( gemm3mh_ukr_name )
INSERT_GENTPROT_BASIC0( gemm3m1_ukr_name )
INSERT_GENTPROT_BASIC0( gemm4mh_ukr_name )
INSERT_GENTPROT_BASIC0( gemm4mb_ukr_name )
INSERT_GENTPROT_BASIC0( gemm4m1_ukr_name )
INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )
@@ -77,10 +72,6 @@ void PASTEMAC(ch,opname) \
cntx_t* restrict cntx \
);
INSERT_GENTPROT_BASIC0( gemmtrsm3m1_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm3m1_u_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm4m1_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm4m1_u_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm1m_u_ukr_name )
@@ -97,10 +88,6 @@ void PASTEMAC(ch,opname) \
cntx_t* restrict cntx \
);
INSERT_GENTPROT_BASIC0( trsm3m1_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm3m1_u_ukr_name )
INSERT_GENTPROT_BASIC0( trsm4m1_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm4m1_u_ukr_name )
INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm1m_u_ukr_name )

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Texas at Austin
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -33,217 +33,31 @@
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the object API macros.
#ifdef BLIS_ENABLE_OAPI
#include "blis.h"
//
// Define object-based interfaces.
// Define object-based interfaces (basic).
//
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* If C has a zero dimension, return early. */ \
if ( bli_obj_has_zero_dim( c ) ) {\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
}\
\
/* if alpha or A or B has a zero dimension, \
scale C by beta and return early. */ \
if ( bli_obj_equals( alpha, &BLIS_ZERO ) || \
bli_obj_has_zero_dim( a ) || \
bli_obj_has_zero_dim( b ) ) \
{\
bli_scalm( beta, c ); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return;\
}\
\
/* If the rntm is non-NULL, it may indicate that we should forgo sup
handling altogether. */ \
bool enable_sup = TRUE; \
if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
\
if ( enable_sup ) \
{ \
/* Execute the small/unpacked oapi handler. If it finds that the problem
does not fall within the thresholds that define "small", or for some
other reason decides not to use the small/unpacked implementation,
the function returns with BLIS_FAILURE, which causes execution to
proceed towards the conventional implementation. */ \
err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
if ( result == BLIS_SUCCESS ) {\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
} \
\
/* Only proceed with an induced method if each of the operands have a
complex storage datatype. NOTE: Allowing precisions to vary while
using 1m, which is what we do here, is unique to gemm; other level-3
operations use 1m only if all storage datatypes are equal (and they
ignore the computation precision). If any operands are real, skip the
induced method chooser function and proceed directly with native
execution. */ \
if ( bli_obj_is_complex( c ) && \
bli_obj_is_complex( a ) && \
bli_obj_is_complex( b ) ) \
{ \
/* Invoke the operation's "ind" function--its induced method front-end.
For complex problems, it calls the highest priority induced method
that is available (ie: implemented and enabled), and if none are
enabled, it calls native execution. (For real problems, it calls
the operation's native execution interface.) */ \
PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
} \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC(opname,_ex)( alpha, a, b, beta, c, NULL, NULL ); \
}
GENFRONT( gemm )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* If C has a zero dimension, return early. */ \
if ( bli_obj_has_zero_dim( c ) ) {\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
}\
\
/* if alpha or A or B has a zero dimension, \
scale C by beta and return early. */ \
if ( bli_obj_equals( alpha, &BLIS_ZERO ) || \
bli_obj_has_zero_dim( a ) || \
bli_obj_has_zero_dim( b ) ) \
{\
bli_scalm( beta, c ); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return;\
}\
\
/* If the rntm is non-NULL, it may indicate that we should forgo sup
handling altogether. */ \
bool enable_sup = TRUE; \
if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
\
if ( enable_sup ) \
{ \
/* Execute the small/unpacked oapi handler. If it finds that the problem
does not fall within the thresholds that define "small", or for some
other reason decides not to use the small/unpacked implementation,
the function returns with BLIS_FAILURE, which causes execution to
proceed towards the conventional implementation. */ \
err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
if ( result == BLIS_SUCCESS ) {\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
} \
\
/* Only proceed with an induced method if each of the operands have a
complex storage datatype. NOTE: Allowing precisions to vary while
using 1m, which is what we do here, is unique to gemm; other level-3
operations use 1m only if all storage datatypes are equal (and they
ignore the computation precision). If any operands are real, skip the
induced method chooser function and proceed directly with native
execution. */ \
if ( bli_obj_is_complex( c ) && \
bli_obj_is_complex( a ) && \
bli_obj_is_complex( b ) ) \
{ \
/* GEMMT Todo: Currently we support only native implementation
for complex datatypes.*/ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
} \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}
GENFRONT( gemmt )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* Only proceed with an induced method if each of the operands have a
complex storage datatype. NOTE: Allowing precisions to vary while
using 1m, which is what we do here, is unique to gemm; other level-3
operations use 1m only if all storage datatypes are equal (and they
ignore the computation precision). If any operands are real, skip the
induced method chooser function and proceed directly with native
execution. */ \
if ( bli_obj_is_complex( c ) && \
bli_obj_is_complex( a ) && \
bli_obj_is_complex( b ) ) \
{ \
/* Invoke the operation's "ind" function--its induced method front-end.
For complex problems, it calls the highest priority induced method
that is available (ie: implemented and enabled), and if none are
enabled, it calls native execution. (For real problems, it calls
the operation's native execution interface.) */ \
PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
} \
}
GENFRONT( her2k )
GENFRONT( syr2k )
@@ -251,7 +65,7 @@ GENFRONT( syr2k )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
void PASTEMAC0(opname) \
( \
side_t side, \
obj_t* alpha, \
@@ -259,34 +73,11 @@ void PASTEMAC(opname,EX_SUF) \
obj_t* b, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* Only proceed with an induced method if all operands have the same
(complex) datatype. If any datatypes differ, skip the induced method
chooser function and proceed directly with native execution, which is
where mixed datatype support will be implemented (if at all). */ \
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
bli_obj_dt( b ) == bli_obj_dt( c ) && \
bli_obj_is_complex( c ) ) \
{ \
/* Invoke the operation's "ind" function--its induced method front-end.
For complex problems, it calls the highest priority induced method
that is available (ie: implemented and enabled), and if none are
enabled, it calls native execution. (For real problems, it calls
the operation's native execution interface.) */ \
PASTEMAC(opname,ind)( side, alpha, a, b, beta, c, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( side, alpha, a, b, beta, c, cntx, rntm ); \
} \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)\
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC(opname,_ex)( side, alpha, a, b, beta, c, NULL, NULL ); \
}
GENFRONT( hemm )
@@ -297,157 +88,39 @@ GENFRONT( trmm3 )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* If C has a zero dimension, return early. */ \
if ( bli_obj_has_zero_dim( c ) ) {\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
\
/* If alpha or A or B has a zero dimension, \
* scale C by beta and return early. */ \
\
if( bli_obj_equals( alpha, &BLIS_ZERO ) || \
bli_obj_has_zero_dim( a ) ) \
{ \
bli_scalm( beta, c ); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
\
/* If the rntm is non-NULL, it may indicate that we should forgo SUP handling altogether. */ \
bool enable_sup = TRUE; \
if( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
\
if( enable_sup ) \
{ \
/* Execute the small/unpacked oapi handler.
* If it finds that the problem does not fall within the
* thresholds that define "small", or for some other reason
* decides not to use the small/unpacked implementation,
* the function returns with BLIS_FAILURE, which causes excution
* to proceed forward towards conventional implementation, */ \
\
err_t result = PASTEMAC(opname, sup) ( alpha, a, beta, c, cntx, rntm ); \
if( result == BLIS_SUCCESS ) { \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
return; \
} \
} \
\
/* Only proceed with an induced method if all operands have the same
(complex) datatype. If any datatypes differ, skip the induced method
chooser function and proceed directly with native execution, which is
where mixed datatype support will be implemented (if at all). */ \
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
bli_obj_is_complex( c ) ) \
{ \
/* Invoke the operation's "ind" function--its induced method front-end.
For complex problems, it calls the highest priority induced method
that is available (ie: implemented and enabled), and if none are
enabled, it calls native execution. (For real problems, it calls
the operation's native execution interface.) */ \
PASTEMAC(opname,ind)( alpha, a, beta, c, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( alpha, a, beta, c, cntx, rntm ); \
} \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC(opname,_ex)( alpha, a, beta, c, NULL, NULL ); \
}
GENFRONT( herk )
GENFRONT( syrk )
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* Only proceed with an induced method if all operands have the same
(complex) datatype. If any datatypes differ, skip the induced method
chooser function and proceed directly with native execution, which is
where mixed datatype support will be implemented (if at all). */ \
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \
bli_obj_is_complex( c ) ) \
{ \
/* Invoke the operation's "ind" function--its induced method front-end.
For complex problems, it calls the highest priority induced method
that is available (ie: implemented and enabled), and if none are
enabled, it calls native execution. (For real problems, it calls
the operation's native execution interface.) */ \
PASTEMAC(opname,ind)( alpha, a, beta, c, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( alpha, a, beta, c, cntx, rntm ); \
} \
}
GENFRONT(herk)
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
void PASTEMAC0(opname) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b \
BLIS_OAPI_EX_PARAMS \
) \
{ \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* Only proceed with an induced method if all operands have the same
(complex) datatype. If any datatypes differ, skip the induced method
chooser function and proceed directly with native execution, which is
where mixed datatype support will be implemented (if at all). */ \
if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \
bli_obj_is_complex( b ) ) \
{ \
/* Invoke the operation's "ind" function--its induced method front-end.
For complex problems, it calls the highest priority induced method
that is available (ie: implemented and enabled), and if none are
enabled, it calls native execution. (For real problems, it calls
the operation's native execution interface.) */ \
PASTEMAC(opname,ind)( side, alpha, a, b, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( side, alpha, a, b, cntx, rntm ); \
} \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2); \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC(opname,_ex)( side, alpha, a, b, NULL, NULL ); \
}
GENFRONT( trmm )
GENFRONT( trsm )
#endif

View File

@@ -35,20 +35,19 @@
//
// Prototype object-based interfaces.
// Prototype object-based interfaces (basic).
//
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( gemm )
@@ -60,7 +59,7 @@ GENPROT( syr2k )
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
side_t side, \
obj_t* alpha, \
@@ -68,7 +67,6 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
obj_t* b, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( hemm )
@@ -79,13 +77,12 @@ GENPROT( trmm3 )
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( herk )
@@ -95,13 +92,12 @@ GENPROT( syrk )
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( trmm )

View File

@@ -4,7 +4,8 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Texas at Austin
Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,13 +35,519 @@
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_oapi_ex.h"
//
// Define object-based interfaces (expert).
//
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Expert object API template for gemm. The generated function:
//   1. returns immediately when C has a zero dimension;
//   2. reduces to C := beta * C when alpha equals zero or A or B has a zero
//      dimension;
//   3. tries the small/unpacked (sup) handler unless the caller's rntm
//      disables it;
//   4. otherwise picks native or induced execution and calls the operation's
//      front-end.
// cntx and rntm may each be NULL.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
	bli_init_once(); \
\
	/* An empty C leaves nothing to compute. */ \
	if ( bli_obj_has_zero_dim( c ) ) \
	{ \
		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
		return; \
	} \
\
	/* When alpha is zero, or A or B is empty, only the scaling of C by
	   beta remains. */ \
	const bool scale_only = bli_obj_equals( alpha, &BLIS_ZERO ) || \
	                        bli_obj_has_zero_dim( a ) || \
	                        bli_obj_has_zero_dim( b ); \
	if ( scale_only ) \
	{ \
		bli_scalm( beta, c ); \
		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
		return; \
	} \
\
	/* sup handling is on by default; a non-NULL rntm gets to decide. */ \
	const bool try_sup = ( rntm == NULL ) ? TRUE : bli_rntm_l3_sup( rntm ); \
\
	if ( try_sup ) \
	{ \
		/* Hand the problem to the small/unpacked handler. Only a
		   BLIS_SUCCESS result means the operation was carried out; any
		   other result sends us on to the conventional implementation. */ \
		if ( PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ) == BLIS_SUCCESS ) \
		{ \
			AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
			return; \
		} \
	} \
\
	/* From here on use a private rntm_t: initialized from the global
	   settings if the caller passed NULL, otherwise a copy of the
	   caller's object. */ \
	rntm_t rntm_copy; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_copy ); } \
	else                { rntm_copy = *rntm; } \
	rntm = &rntm_copy; \
\
	/* Native execution unless A, B and C are all stored as complex, in
	   which case we ask for the highest-priority induced method that is
	   enabled and available (BLIS_NAT comes back if there is none).
	   NOTE: gemm is the only level-3 operation that lets precisions
	   differ here; the others require equal storage datatypes. */ \
	const num_t c_dt       = bli_obj_dt( c ); \
	const bool  all_cmplx  = bli_obj_is_complex( c ) && \
	                         bli_obj_is_complex( a ) && \
	                         bli_obj_is_complex( b ); \
	const ind_t ind_method = all_cmplx ? PASTEMAC(opname,ind_find_avail)( c_dt ) \
	                                   : BLIS_NAT; \
\
	/* Without a caller-supplied context, query the gks for one matching
	   the chosen method. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_ind_cntx( ind_method, c_dt ); \
	} \
\
	/* Validate the operands when error checking is enabled. */ \
	if ( bli_error_checking_is_enabled() ) \
	{ \
		PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \
	} \
\
	/* Run the front-end; NULL requests the default control tree. */ \
	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}
// Include the object API definitions here.
#include "bli_l3_oapi.c"
// If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be
// defined in the sandbox environment.
#ifndef BLIS_ENABLE_SANDBOX
GENFRONT( gemm )
#endif
// Expert object API template for gemmt. Same flow as the gemm template
// (empty-C return, beta-scaling shortcut, optional sup attempt, then the
// conventional front-end), except that an induced method is considered only
// when A, B and C share one complex storage datatype.
// cntx and rntm may each be NULL.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
	bli_init_once(); \
\
	/* An empty C leaves nothing to compute. */ \
	if ( bli_obj_has_zero_dim( c ) ) \
	{ \
		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
		return; \
	} \
\
	/* When alpha is zero, or A or B is empty, only the scaling of C by
	   beta remains. */ \
	const bool scale_only = bli_obj_equals( alpha, &BLIS_ZERO ) || \
	                        bli_obj_has_zero_dim( a ) || \
	                        bli_obj_has_zero_dim( b ); \
	if ( scale_only ) \
	{ \
		bli_scalm( beta, c ); \
		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
		return; \
	} \
\
	/* sup handling is on by default; a non-NULL rntm gets to decide. */ \
	const bool try_sup = ( rntm == NULL ) ? TRUE : bli_rntm_l3_sup( rntm ); \
\
	if ( try_sup ) \
	{ \
		/* Hand the problem to the small/unpacked handler. Only a
		   BLIS_SUCCESS result means the operation was carried out; any
		   other result sends us on to the conventional implementation. */ \
		if ( PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ) == BLIS_SUCCESS ) \
		{ \
			AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
			return; \
		} \
	} \
\
	/* From here on use a private rntm_t: initialized from the global
	   settings if the caller passed NULL, otherwise a copy of the
	   caller's object. */ \
	rntm_t rntm_copy; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_copy ); } \
	else                { rntm_copy = *rntm; } \
	rntm = &rntm_copy; \
\
	/* Native execution unless A and B have C's storage datatype and that
	   datatype is complex; then ask for the highest-priority induced
	   method that is enabled and available (BLIS_NAT if there is none). */ \
	const num_t c_dt       = bli_obj_dt( c ); \
	const bool  same_cmplx = bli_obj_dt( a ) == c_dt && \
	                         bli_obj_dt( b ) == c_dt && \
	                         bli_obj_is_complex( c ); \
	const ind_t ind_method = same_cmplx ? PASTEMAC(opname,ind_find_avail)( c_dt ) \
	                                    : BLIS_NAT; \
\
	/* Without a caller-supplied context, query the gks for one matching
	   the chosen method. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_ind_cntx( ind_method, c_dt ); \
	} \
\
	/* Validate the operands when error checking is enabled. */ \
	if ( bli_error_checking_is_enabled() ) \
	{ \
		PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \
	} \
\
	/* Run the front-end; NULL requests the default control tree. */ \
	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}

GENFRONT( gemmt )
// Expert object API template for her2k and syr2k. The generated function
// makes a private copy of the runtime, selects native or induced execution,
// obtains a context if none was given, optionally checks the operands, and
// calls the operation's front-end. cntx and rntm may each be NULL.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
	bli_init_once(); \
\
	/* Use a private rntm_t: initialized from the global settings if the
	   caller passed NULL, otherwise a copy of the caller's object. */ \
	rntm_t rntm_copy; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_copy ); } \
	else                { rntm_copy = *rntm; } \
	rntm = &rntm_copy; \
\
	/* Native execution unless A and B have C's storage datatype and that
	   datatype is complex; then ask for the highest-priority induced
	   method that is enabled and available (BLIS_NAT if there is none). */ \
	const num_t c_dt       = bli_obj_dt( c ); \
	const bool  same_cmplx = bli_obj_dt( a ) == c_dt && \
	                         bli_obj_dt( b ) == c_dt && \
	                         bli_obj_is_complex( c ); \
	const ind_t ind_method = same_cmplx ? PASTEMAC(opname,ind_find_avail)( c_dt ) \
	                                    : BLIS_NAT; \
\
	/* Without a caller-supplied context, query the gks for one matching
	   the chosen method. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_ind_cntx( ind_method, c_dt ); \
	} \
\
	/* Validate the operands when error checking is enabled. */ \
	if ( bli_error_checking_is_enabled() ) \
	{ \
		PASTEMAC(opname,_check)( alpha, a, b, beta, c, cntx ); \
	} \
\
	/* Run the front-end; NULL requests the default control tree. */ \
	PASTEMAC(opname,_front)( alpha, a, b, beta, c, cntx, rntm, NULL ); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}

GENFRONT( her2k )
GENFRONT( syr2k )
// Expert object API template for hemm, symm and trmm3 (operations that take
// a side argument together with A, B and C). The generated function makes a
// private copy of the runtime, selects native or induced execution, obtains
// a context if none was given, optionally checks the operands, and calls the
// operation's front-end. cntx and rntm may each be NULL.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
	bli_init_once(); \
\
	/* Use a private rntm_t: initialized from the global settings if the
	   caller passed NULL, otherwise a copy of the caller's object. */ \
	rntm_t rntm_copy; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_copy ); } \
	else                { rntm_copy = *rntm; } \
	rntm = &rntm_copy; \
\
	/* Native execution unless A and B have C's storage datatype and that
	   datatype is complex; then ask for the highest-priority induced
	   method that is enabled and available (BLIS_NAT if there is none). */ \
	const num_t c_dt       = bli_obj_dt( c ); \
	const bool  same_cmplx = bli_obj_dt( a ) == c_dt && \
	                         bli_obj_dt( b ) == c_dt && \
	                         bli_obj_is_complex( c ); \
	const ind_t ind_method = same_cmplx ? PASTEMAC(opname,ind_find_avail)( c_dt ) \
	                                    : BLIS_NAT; \
\
	/* Without a caller-supplied context, query the gks for one matching
	   the chosen method. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_ind_cntx( ind_method, c_dt ); \
	} \
\
	/* Validate the operands when error checking is enabled. */ \
	if ( bli_error_checking_is_enabled() ) \
	{ \
		PASTEMAC(opname,_check)( side, alpha, a, b, beta, c, cntx ); \
	} \
\
	/* Run the front-end; NULL requests the default control tree. */ \
	PASTEMAC(opname,_front)( side, alpha, a, b, beta, c, cntx, rntm, NULL ); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}

GENFRONT( hemm )
GENFRONT( symm )
GENFRONT( trmm3 )
// Expert object API template for syrk. The generated function:
//   1. returns immediately when C has a zero dimension;
//   2. reduces to C := beta * C when alpha equals zero or A has a zero
//      dimension;
//   3. tries the small/unpacked (sup) handler unless the caller's rntm
//      disables it;
//   4. otherwise picks native or induced execution and calls the operation's
//      front-end.
// cntx and rntm may each be NULL.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
	bli_init_once(); \
\
	/* An empty C leaves nothing to compute. */ \
	if ( bli_obj_has_zero_dim( c ) ) \
	{ \
		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
		return; \
	} \
\
	/* When alpha is zero or A is empty, only the scaling of C by beta
	   remains. */ \
	const bool scale_only = bli_obj_equals( alpha, &BLIS_ZERO ) || \
	                        bli_obj_has_zero_dim( a ); \
	if ( scale_only ) \
	{ \
		bli_scalm( beta, c ); \
		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
		return; \
	} \
\
	/* sup handling is on by default; a non-NULL rntm gets to decide. */ \
	const bool try_sup = ( rntm == NULL ) ? TRUE : bli_rntm_l3_sup( rntm ); \
\
	if ( try_sup ) \
	{ \
		/* Hand the problem to the small/unpacked handler. Only a
		   BLIS_SUCCESS result means the operation was carried out; any
		   other result sends execution on to the conventional
		   implementation. */ \
		if ( PASTEMAC(opname,sup)( alpha, a, beta, c, cntx, rntm ) == BLIS_SUCCESS ) \
		{ \
			AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
			return; \
		} \
	} \
\
	/* From here on use a private rntm_t: initialized from the global
	   settings if the caller passed NULL, otherwise a copy of the
	   caller's object. */ \
	rntm_t rntm_copy; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_copy ); } \
	else                { rntm_copy = *rntm; } \
	rntm = &rntm_copy; \
\
	/* Native execution unless A has C's storage datatype and that
	   datatype is complex; then ask for the highest-priority induced
	   method that is enabled and available (BLIS_NAT if there is none). */ \
	const num_t c_dt       = bli_obj_dt( c ); \
	const bool  same_cmplx = bli_obj_dt( a ) == c_dt && \
	                         bli_obj_is_complex( c ); \
	const ind_t ind_method = same_cmplx ? PASTEMAC(opname,ind_find_avail)( c_dt ) \
	                                    : BLIS_NAT; \
\
	/* Without a caller-supplied context, query the gks for one matching
	   the chosen method. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_ind_cntx( ind_method, c_dt ); \
	} \
\
	/* Validate the operands when error checking is enabled. */ \
	if ( bli_error_checking_is_enabled() ) \
	{ \
		PASTEMAC(opname,_check)( alpha, a, beta, c, cntx ); \
	} \
\
	/* Run the front-end; NULL requests the default control tree. */ \
	PASTEMAC(opname,_front)( alpha, a, beta, c, cntx, rntm, NULL ); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}

GENFRONT( syrk )
// Expert object API template for herk. Unlike the syrk template, there are
// no early returns and no sup attempt: the generated function makes a
// private copy of the runtime, selects native or induced execution, obtains
// a context if none was given, optionally checks the operands, and calls the
// operation's front-end. cntx and rntm may each be NULL.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
	bli_init_once(); \
\
	/* Use a private rntm_t: initialized from the global settings if the
	   caller passed NULL, otherwise a copy of the caller's object. */ \
	rntm_t rntm_copy; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_copy ); } \
	else                { rntm_copy = *rntm; } \
	rntm = &rntm_copy; \
\
	/* Native execution unless A has C's storage datatype and that
	   datatype is complex; then ask for the highest-priority induced
	   method that is enabled and available (BLIS_NAT if there is none). */ \
	const num_t c_dt       = bli_obj_dt( c ); \
	const bool  same_cmplx = bli_obj_dt( a ) == c_dt && \
	                         bli_obj_is_complex( c ); \
	const ind_t ind_method = same_cmplx ? PASTEMAC(opname,ind_find_avail)( c_dt ) \
	                                    : BLIS_NAT; \
\
	/* Without a caller-supplied context, query the gks for one matching
	   the chosen method. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_ind_cntx( ind_method, c_dt ); \
	} \
\
	/* Validate the operands when error checking is enabled. */ \
	if ( bli_error_checking_is_enabled() ) \
	{ \
		PASTEMAC(opname,_check)( alpha, a, beta, c, cntx ); \
	} \
\
	/* Run the front-end; NULL requests the default control tree. */ \
	PASTEMAC(opname,_front)( alpha, a, beta, c, cntx, rntm, NULL ); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}

GENFRONT( herk )
// Expert object API template for trmm and trsm (operations on A and B with a
// side argument and no separate C). The datatype that drives method and
// context selection is taken from B. The generated function makes a private
// copy of the runtime, selects native or induced execution, obtains a
// context if none was given, optionally checks the operands, and calls the
// operation's front-end. cntx and rntm may each be NULL.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
	bli_init_once(); \
\
	/* Use a private rntm_t: initialized from the global settings if the
	   caller passed NULL, otherwise a copy of the caller's object. */ \
	rntm_t rntm_copy; \
	if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_copy ); } \
	else                { rntm_copy = *rntm; } \
	rntm = &rntm_copy; \
\
	/* Native execution unless A has B's storage datatype and that
	   datatype is complex; then ask for the highest-priority induced
	   method that is enabled and available (BLIS_NAT if there is none). */ \
	const num_t b_dt       = bli_obj_dt( b ); \
	const bool  same_cmplx = bli_obj_dt( a ) == b_dt && \
	                         bli_obj_is_complex( b ); \
	const ind_t ind_method = same_cmplx ? PASTEMAC(opname,ind_find_avail)( b_dt ) \
	                                    : BLIS_NAT; \
\
	/* Without a caller-supplied context, query the gks for one matching
	   the chosen method. */ \
	if ( cntx == NULL ) \
	{ \
		cntx = bli_gks_query_ind_cntx( ind_method, b_dt ); \
	} \
\
	/* Validate the operands when error checking is enabled. */ \
	if ( bli_error_checking_is_enabled() ) \
	{ \
		PASTEMAC(opname,_check)( side, alpha, a, b, cntx ); \
	} \
\
	/* Run the front-end; NULL requests the default control tree. */ \
	PASTEMAC(opname,_front)( side, alpha, a, b, cntx, rntm, NULL ); \
\
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
}

GENFRONT( trmm )
GENFRONT( trsm )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -32,21 +33,15 @@
*/
// Given the current architecture of BLIS sandboxes, bli_gemmnat() is the
// entry point to any sandbox implementation.
// NOTE: This function is implemented identically to the function that it
// overrides in frame/ind/oapi/bli_l3_nat_oapi.c. This means that we are
// forgoing the option of customizing the implementations that underlie
// bli_gemm() and bli_?gemm(). Any new code defined in this sandbox
// directory, however, will be included in the BLIS.
//
// Prototype object-based interfaces (expert).
//
#include "blis.h"
#undef GENFRONT
#define GENFRONT( opname, cname, imeth ) \
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,imeth) \
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
@@ -55,34 +50,64 @@ void PASTEMAC(opname,imeth) \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm \
) \
{ \
\
/* A switch to easily toggle whether we use the sandbox implementation
of bls_gemm() as the implementation for bli_gemm(). (This allows for
easy testing of bls_gemm() via the testsuite.) */ \
if ( 1 ) \
{ \
bls_gemm_ex( alpha, a, b, beta, c, cntx, rntm ); \
return; \
} \
\
bli_init_once(); \
\
/* Obtain a valid (native) context from the gks if necessary. */ \
if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Invoke the operation's front end. */ \
PASTEMAC(opname,_front) \
( \
alpha, a, b, beta, c, cntx, rntm, NULL \
); \
}
);
GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )
// Expert-interface prototypes for hemm, symm and trmm3. Each generated
// declaration takes a side argument, the objects alpha, a, b, beta and c,
// and explicit cntx_t* and rntm_t* arguments. The function name is opname
// joined with BLIS_OAPI_EX_SUF by PASTEMAC.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm \
);
GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
// Expert-interface prototypes for herk and syrk. Each generated declaration
// takes the objects alpha, a, beta and c (there is no b operand), plus
// explicit cntx_t* and rntm_t* arguments.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm \
);
GENPROT( herk )
GENPROT( syrk )
// Expert-interface prototypes for trmm and trsm. Each generated declaration
// takes a side argument and the objects alpha, a and b (no beta or c), plus
// explicit cntx_t* and rntm_t* arguments.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
cntx_t* cntx, \
rntm_t* rntm \
);
GENPROT( trmm )
GENPROT( trsm )
GENFRONT( gemm, gemm, nat )

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,13 +34,47 @@
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_oapi_ba.h"
// Select the pack schemas for the A and B operands of a level-3 operation
// and store them in the objects a and b.
//
//   a, b  objects that receive schema_a and schema_b, respectively, via
//         bli_obj_set_pack_schema().
//   c     only read here: its domain and computation precision are combined
//         into the datatype used for the microkernel preference query.
//   cntx  context queried for the execution method and for the gemm
//         microkernel preference.
//
// The schemas start out as plain row panels (A) and column panels (B). They
// are replaced by a 1E/1R pair only when bli_cntx_method( cntx ) is BLIS_1M.
//
// NOTE(review): the BLIS_ENABLE_OAPI / bli_l3_oapi.c lines inside the body
// look like leftovers from the previous contents of this file -- confirm.
void bli_l3_set_schemas
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
)
{
// Begin with pack schemas for native execution.
pack_t schema_a = BLIS_PACKED_ROW_PANELS;
pack_t schema_b = BLIS_PACKED_COL_PANELS;
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// When executing the 1m method, choose the appropriate pack schemas based
// on the microkernel preference encoded within the current cntx_t (which
// was presumably returned by the gks).
if ( bli_cntx_method( cntx ) == BLIS_1M )
{
num_t dt = bli_obj_domain( c ) | bli_obj_comp_prec( c );
// Include the object API definitions here.
#include "bli_l3_oapi.c"
// Note that bli_cntx_l3_vir_ukr_prefers_cols_dt() will use the real
// projection of dt to query the preference of the corresponding native
// real-domain microkernel. This is what ultimately determines which
// variant of 1m is applicable.
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
{
schema_a = BLIS_PACKED_ROW_PANELS_1E;
schema_b = BLIS_PACKED_COL_PANELS_1R;
}
else
{
schema_a = BLIS_PACKED_ROW_PANELS_1R;
schema_b = BLIS_PACKED_COL_PANELS_1E;
}
}
// Embed the schemas into the objects for A and B. This is a sort of hack
// for communicating the desired pack schemas to bli_gemm_cntl_create()
// (via bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows
// us to subsequently access the schemas from the control tree, which
// hopefully reduces some confusion, particularly in bli_packm_init().
bli_obj_set_pack_schema( schema_a, a );
bli_obj_set_pack_schema( schema_b, b );
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -32,15 +32,10 @@
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_tapi_ba.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l3_tapi.c"
// Prototype for bli_l3_set_schemas(), which takes the three level-3 operand
// objects (a, b, c) and a context.
void bli_l3_set_schemas
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
);

View File

@@ -275,29 +275,6 @@ bli_thread_barrier( thread ); \
bli_thread_barrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2021, The University of Texas at Austin
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -33,18 +33,16 @@
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the typed API macros.
#ifdef BLIS_ENABLE_TAPI
#include "blis.h"
//
// Define BLAS-like interfaces with typed operands.
// Define BLAS-like interfaces with typed operands (basic).
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
trans_t transa, \
trans_t transb, \
@@ -56,56 +54,70 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t m_a, n_a; \
dim_t m_b, n_b; \
\
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
\
bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, beta, &betao ); \
\
bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \
\
bli_obj_set_conjtrans( transa, &ao ); \
bli_obj_set_conjtrans( transb, &bo ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
cntx, \
rntm \
transa, \
transb, \
m, n, k, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( gemm )
// Basic typed gemmt interface. Each instantiation is a thin wrapper that
// hands its arguments, in the same order, to the typed expert interface and
// passes NULL for both the cntx_t and the rntm_t argument.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
	/* The two trailing NULLs request default cntx_t and rntm_t objects. */ \
	PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
	( \
	  uploc, transa, transb, \
	  m, k, \
	  alpha, \
	  a, rs_a, cs_a, \
	  b, rs_b, cs_b, \
	  beta, \
	  c, rs_c, cs_c, \
	  NULL, NULL \
	); \
}

INSERT_GENTFUNC_BASIC0( gemmt )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, struca ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
@@ -118,50 +130,24 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t mn_a; \
dim_t m_b, n_b; \
\
bli_set_dim_with_side( side, m, n, &mn_a ); \
bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \
\
bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, beta, &betao ); \
\
bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( uploa, &ao ); \
bli_obj_set_conj( conja, &ao ); \
bli_obj_set_conjtrans( transb, &bo ); \
\
bli_obj_set_struc( struca, &ao ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side, \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
cntx, \
rntm \
uploa, \
conja, \
transb, \
m, n, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
@@ -172,7 +158,7 @@ INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -182,44 +168,21 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt_r = PASTEMAC(chr,type); \
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t m_a, n_a; \
\
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
\
bli_obj_init_finish_1x1( dt_r, alpha, &alphao ); \
bli_obj_init_finish_1x1( dt_r, beta, &betao ); \
\
bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( uploc, &co ); \
bli_obj_set_conjtrans( transa, &ao ); \
\
bli_obj_set_struc( BLIS_HERMITIAN, &co ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&betao, \
&co, \
cntx, \
rntm \
uploc, \
transa, \
m, k, \
alpha, \
a, rs_a, cs_a, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
@@ -229,7 +192,7 @@ INSERT_GENTFUNCR_BASIC0( herk )
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -241,50 +204,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt_r = PASTEMAC(chr,type); \
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t m_a, n_a; \
dim_t m_b, n_b; \
\
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \
\
bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
bli_obj_init_finish_1x1( dt_r, beta, &betao ); \
\
bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( uploc, &co ); \
bli_obj_set_conjtrans( transa, &ao ); \
bli_obj_set_conjtrans( transb, &bo ); \
\
bli_obj_set_struc( BLIS_HERMITIAN, &co ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
cntx, \
rntm \
uploc, \
transa, \
transb, \
m, k, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
@@ -294,7 +230,7 @@ INSERT_GENTFUNCR_BASIC0( her2k )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -304,43 +240,21 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t m_a, n_a; \
\
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
\
bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, beta, &betao ); \
\
bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( uploc, &co ); \
bli_obj_set_conjtrans( transa, &ao ); \
\
bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&betao, \
&co, \
cntx, \
rntm \
uploc, \
transa, \
m, k, \
alpha, \
a, rs_a, cs_a, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
@@ -350,7 +264,7 @@ INSERT_GENTFUNC_BASIC0( syrk )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -362,49 +276,23 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t m_a, n_a; \
dim_t m_b, n_b; \
\
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \
\
bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, beta, &betao ); \
\
bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( uploc, &co ); \
bli_obj_set_conjtrans( transa, &ao ); \
bli_obj_set_conjtrans( transb, &bo ); \
\
bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
cntx, \
rntm \
uploc, \
transa, \
transb, \
m, k, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
@@ -414,7 +302,7 @@ INSERT_GENTFUNC_BASIC0( syr2k )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
@@ -428,51 +316,25 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t mn_a; \
dim_t m_b, n_b; \
\
bli_set_dim_with_side( side, m, n, &mn_a ); \
bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \
\
bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, beta, &betao ); \
\
bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( uploa, &ao ); \
bli_obj_set_diag( diaga, &ao ); \
bli_obj_set_conjtrans( transa, &ao ); \
bli_obj_set_conjtrans( transb, &bo ); \
\
bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side, \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
cntx, \
rntm \
uploa, \
transa, \
diaga, \
transb, \
m, n, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
@@ -482,7 +344,7 @@ INSERT_GENTFUNC_BASIC0( trmm3 )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
@@ -493,48 +355,25 @@ void PASTEMAC2(ch,opname,EX_SUF) \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b \
BLIS_TAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_TAPI_EX_DECLS \
\
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
\
dim_t mn_a; \
\
bli_set_dim_with_side( side, m, n, &mn_a ); \
\
bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
\
bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m, n, b, rs_b, cs_b, &bo ); \
\
bli_obj_set_uplo( uploa, &ao ); \
bli_obj_set_diag( diaga, &ao ); \
bli_obj_set_conjtrans( transa, &ao ); \
\
bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \
\
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side, \
&alphao, \
&ao, \
&bo, \
cntx, \
rntm \
uploa, \
transa, \
diaga, \
m, n, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( trmm )
INSERT_GENTFUNC_BASIC0( trsm )
#endif

View File

@@ -35,13 +35,13 @@
//
// Prototype BLAS-like interfaces with typed operands.
// Prototype BLAS-like interfaces with typed operands (basic).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
trans_t transa, \
trans_t transb, \
@@ -53,16 +53,14 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( gemm )
INSERT_GENTPROT_BASIC0( gemmt )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
@@ -75,7 +73,6 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( hemm )
@@ -85,7 +82,7 @@ INSERT_GENTPROT_BASIC0( symm )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -95,7 +92,6 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROTR_BASIC0( herk )
@@ -104,7 +100,7 @@ INSERT_GENTPROTR_BASIC0( herk )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -116,7 +112,6 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROTR_BASIC0( her2k )
@@ -125,7 +120,7 @@ INSERT_GENTPROTR_BASIC0( her2k )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -135,7 +130,6 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( syrk )
@@ -144,7 +138,7 @@ INSERT_GENTPROT_BASIC0( syrk )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -156,16 +150,16 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
@@ -179,7 +173,6 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( trmm3 )
@@ -188,7 +181,7 @@ INSERT_GENTPROT_BASIC0( trmm3 )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
@@ -199,7 +192,6 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( trmm )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,13 +35,553 @@
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_tapi_ex.h"
//
// Define BLAS-like interfaces with typed operands (expert).
//
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Typed expert gemm interface. Each instantiation wraps the raw operands in
// local obj_t objects, records transa/transb on a and b, and forwards to the
// object-based expert interface together with the caller's cntx and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       m_a, n_a; \
	dim_t       m_b, n_b; \
\
	/* Derive the dimensions recorded for a and b from their trans values. */ \
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
	bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \
\
	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
	bli_obj_init_finish( dt, m,   n,   c, rs_c, cs_c, &co ); \
\
	bli_obj_set_conjtrans( transa, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}
// Include the typed API definitions here.
#include "bli_l3_tapi.c"
INSERT_GENTFUNC_BASIC0( gemm )
// Typed expert hemm/symm interface. Each instantiation wraps the raw
// operands in local obj_t objects, marks a with uploa, conja and the
// structure given by struca, and forwards to the object-based expert
// interface together with side and the caller's cntx and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, struca ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       mn_a; \
	dim_t       m_b, n_b; \
\
	/* a is mn_a x mn_a, with mn_a chosen from m and n according to side. */ \
	bli_set_dim_with_side(   side,   m, n, &mn_a ); \
	bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \
\
	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m_b,  n_b,  b, rs_b, cs_b, &bo ); \
	bli_obj_init_finish( dt, m,    n,    c, rs_c, cs_c, &co ); \
\
	bli_obj_set_uplo( uploa, &ao ); \
	bli_obj_set_conj( conja, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	bli_obj_set_struc( struca, &ao ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  side, \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC( hemm, BLIS_HERMITIAN )
INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )
// Typed expert herk interface. alpha and beta are of the real type ctype_r
// and are wrapped with datatype dt_r; a and c use dt. c is created as an
// m x m object, given uploc, and marked BLIS_HERMITIAN before everything is
// forwarded to the object-based expert interface with the caller's cntx and
// rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt_r = PASTEMAC(chr,type); \
	const num_t dt   = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       m_a, n_a; \
\
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
\
	/* Both scalars are real-typed for this operation. */ \
	bli_obj_init_finish_1x1( dt_r, alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt_r, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
\
	bli_obj_set_uplo( uploc, &co ); \
	bli_obj_set_conjtrans( transa, &ao ); \
\
	bli_obj_set_struc( BLIS_HERMITIAN, &co ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  &alphao, \
	  &ao, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNCR_BASIC0( herk )
// Typed expert her2k interface. alpha has type ctype (datatype dt) while
// beta has the real type ctype_r (datatype dt_r). a and b get their
// dimensions from m, k and their respective trans values; c is created as
// an m x m object, given uploc, and marked BLIS_HERMITIAN before everything
// is forwarded to the object-based expert interface with the caller's cntx
// and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt_r = PASTEMAC(chr,type); \
	const num_t dt   = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       m_a, n_a; \
	dim_t       m_b, n_b; \
\
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
	bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \
\
	/* Only beta is real-typed; alpha shares the datatype of a, b and c. */ \
	bli_obj_init_finish_1x1( dt,   alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt_r, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
\
	bli_obj_set_uplo( uploc, &co ); \
	bli_obj_set_conjtrans( transa, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	bli_obj_set_struc( BLIS_HERMITIAN, &co ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNCR_BASIC0( her2k )
// Typed expert syrk interface. Each instantiation wraps the raw operands in
// local obj_t objects, creates c as an m x m object with uploc, marks it
// BLIS_SYMMETRIC, and forwards to the object-based expert interface with
// the caller's cntx and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       m_a, n_a; \
\
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
\
	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
\
	bli_obj_set_uplo( uploc, &co ); \
	bli_obj_set_conjtrans( transa, &ao ); \
\
	bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  &alphao, \
	  &ao, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC0( syrk )
// Typed expert syr2k interface. a and b get their dimensions from m, k and
// their respective trans values; c is created as an m x m object with
// uploc and marked BLIS_SYMMETRIC before everything is forwarded to the
// object-based expert interface with the caller's cntx and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       m_a, n_a; \
	dim_t       m_b, n_b; \
\
	/* a and b are both sized from (m, k), each under its own trans value. */ \
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
	bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \
\
	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
\
	bli_obj_set_uplo( uploc, &co ); \
	bli_obj_set_conjtrans( transa, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC0( syr2k )
// Typed expert gemmt interface. a is sized from (m, k) and b from (k, m),
// each under its own trans value; c is created as an m x m object and given
// uploc. Unlike the herk/syrk-style templates in this file, no structure is
// set on c here. Everything is then forwarded to the object-based expert
// interface with the caller's cntx and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       m_a, n_a; \
	dim_t       m_b, n_b; \
\
	/* Note the swapped (k, m) order for b compared with (m, k) for a. */ \
	bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \
	bli_set_dims_with_trans( transb, k, m, &m_b, &n_b ); \
\
	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
	bli_obj_init_finish( dt, m,   m,   c, rs_c, cs_c, &co ); \
\
	bli_obj_set_uplo( uploc, &co ); \
	bli_obj_set_conjtrans( transa, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC0( gemmt )
// Typed expert trmm3 interface. a is created as an mn_a x mn_a object (mn_a
// chosen from m and n according to side), given uploa, diaga and transa,
// and marked BLIS_TRIANGULAR. b is sized from (m, n) under transb and c is
// m x n. Everything is then forwarded to the object-based expert interface
// with side and the caller's cntx and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
	obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       mn_a; \
	dim_t       m_b, n_b; \
\
	bli_set_dim_with_side(   side,   m, n, &mn_a ); \
	bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \
\
	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt, beta,  &betao  ); \
\
	bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m_b,  n_b,  b, rs_b, cs_b, &bo ); \
	bli_obj_init_finish( dt, m,    n,    c, rs_c, cs_c, &co ); \
\
	bli_obj_set_uplo( uploa, &ao ); \
	bli_obj_set_diag( diaga, &ao ); \
	bli_obj_set_conjtrans( transa, &ao ); \
	bli_obj_set_conjtrans( transb, &bo ); \
\
	bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  side, \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC0( trmm3 )
// Typed expert trmm/trsm interface. a is created as an mn_a x mn_a object
// (mn_a chosen from m and n according to side), given uploa, diaga and
// transa, and marked BLIS_TRIANGULAR; b is an m x n object. There is no beta
// and no separate c operand. Everything is then forwarded to the
// object-based expert interface with side and the caller's cntx and rntm.
//
// The function name is built with BLIS_TAPI_EX_SUF, the suffix used by the
// typed expert prototypes and by the basic typed wrappers that call it.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
	obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
\
	dim_t       mn_a; \
\
	bli_set_dim_with_side( side, m, n, &mn_a ); \
\
	bli_obj_init_finish_1x1( dt, alpha, &alphao ); \
\
	bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
	bli_obj_init_finish( dt, m,    n,    b, rs_b, cs_b, &bo ); \
\
	bli_obj_set_uplo( uploa, &ao ); \
	bli_obj_set_diag( diaga, &ao ); \
	bli_obj_set_conjtrans( transa, &ao ); \
\
	bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \
\
	/* Hand off to the object-based expert interface. */ \
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( \
	  side, \
	  &alphao, \
	  &ao, \
	  &bo, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC0( trmm )
INSERT_GENTFUNC_BASIC0( trsm )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -33,10 +34,14 @@
*/
//
// Prototype BLAS-like interfaces with typed operands (expert).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
trans_t transa, \
trans_t transb, \
@@ -52,18 +57,12 @@ void PASTEMAC(ch,opname) \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( gemm3mh )
INSERT_GENTPROT_BASIC0( gemm3m1 )
INSERT_GENTPROT_BASIC0( gemm4mh )
INSERT_GENTPROT_BASIC0( gemm4mb )
INSERT_GENTPROT_BASIC0( gemm4m1 )
INSERT_GENTPROT_BASIC0( gemm1m )
INSERT_GENTPROT_BASIC0( gemm )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side_t side, \
uplo_t uploa, \
@@ -80,17 +79,34 @@ void PASTEMAC(ch,opname) \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( hemm3mh )
INSERT_GENTPROT_BASIC0( hemm3m1 )
INSERT_GENTPROT_BASIC0( hemm4mh )
INSERT_GENTPROT_BASIC0( hemm4m1 )
INSERT_GENTPROT_BASIC0( hemm1m )
INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype_r* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROTR_BASIC0( herk )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -103,71 +119,36 @@ void PASTEMAC(ch,opname) \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntmx \
rntm_t* rntm \
);
INSERT_GENTPROTR_BASIC0( her2k3mh )
INSERT_GENTPROTR_BASIC0( her2k3m1 )
INSERT_GENTPROTR_BASIC0( her2k4mh )
INSERT_GENTPROTR_BASIC0( her2k4m1 )
INSERT_GENTPROTR_BASIC0( her2k1m )
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype_r* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntmx \
);
INSERT_GENTPROTR_BASIC0( herk3mh )
INSERT_GENTPROTR_BASIC0( herk3m1 )
INSERT_GENTPROTR_BASIC0( herk4mh )
INSERT_GENTPROTR_BASIC0( herk4m1 )
INSERT_GENTPROTR_BASIC0( herk1m )
INSERT_GENTPROTR_BASIC0( her2k )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side_t side, \
uplo_t uploa, \
conj_t conja, \
trans_t transb, \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( symm3mh )
INSERT_GENTPROT_BASIC0( symm3m1 )
INSERT_GENTPROT_BASIC0( symm4mh )
INSERT_GENTPROT_BASIC0( symm4m1 )
INSERT_GENTPROT_BASIC0( symm1m )
INSERT_GENTPROT_BASIC0( syrk )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uplo_t uploc, \
trans_t transa, \
@@ -183,41 +164,14 @@ void PASTEMAC(ch,opname) \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( syr2k3mh )
INSERT_GENTPROT_BASIC0( syr2k3m1 )
INSERT_GENTPROT_BASIC0( syr2k4mh )
INSERT_GENTPROT_BASIC0( syr2k4m1 )
INSERT_GENTPROT_BASIC0( syr2k1m )
INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( syrk3mh )
INSERT_GENTPROT_BASIC0( syrk3m1 )
INSERT_GENTPROT_BASIC0( syrk4mh )
INSERT_GENTPROT_BASIC0( syrk4m1 )
INSERT_GENTPROT_BASIC0( syrk1m )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side_t side, \
uplo_t uploa, \
@@ -235,17 +189,13 @@ void PASTEMAC(ch,opname) \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( trmm33mh )
INSERT_GENTPROT_BASIC0( trmm33m1 )
INSERT_GENTPROT_BASIC0( trmm34mh )
INSERT_GENTPROT_BASIC0( trmm34m1 )
INSERT_GENTPROT_BASIC0( trmm31m )
INSERT_GENTPROT_BASIC0( trmm3 )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side_t side, \
uplo_t uploa, \
@@ -260,30 +210,6 @@ void PASTEMAC(ch,opname) \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( trmm3m1 )
INSERT_GENTPROT_BASIC0( trmm4m1 )
INSERT_GENTPROT_BASIC0( trmm1m )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( trsm3m1 )
INSERT_GENTPROT_BASIC0( trsm4m1 )
INSERT_GENTPROT_BASIC0( trsm1m )
INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

View File

@@ -54,10 +54,6 @@ void bli_gemm_front
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( c ) )
{
@@ -79,6 +75,29 @@ void bli_gemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
#ifdef BLIS_ENABLE_GEMM_MD
// Don't perform the following optimization for ccr or crc cases, as
// those cases are sensitive to the ukernel storage preference (ie:
// transposing the operation would break them).
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
#endif
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
#ifdef BLIS_ENABLE_GEMM_MD
cntx_t cntx_local;
@@ -98,24 +117,8 @@ void bli_gemm_front
// is adjusted to point to cntx_local.)
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
}
//else // homogeneous datatypes
#endif
// Load the pack schemas from the context and embed them into the objects
// for A and B. (Native contexts are initialized with the correct pack
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
// have made a copy and modified the schemas, so reading them from the
// context should be a safe bet at this point.) This is a sort of hack for
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
// to subsequently access the schemas from the control tree, which
// hopefully reduces some confusion, particularly in bli_packm_init().
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Next, we handle the possibility of needing to typecast alpha to the
// computation datatype and/or beta to the storage datatype of C.

View File

@@ -89,6 +89,29 @@ void bli_gemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
#ifdef BLIS_ENABLE_GEMM_MD
// Don't perform the following optimization for ccr or crc cases, as
// those cases are sensitive to the ukernel storage preference (ie:
// transposing the operation would break them).
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
#endif
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
#ifdef BLIS_ENABLE_GEMM_MD
cntx_t cntx_local;
@@ -111,21 +134,6 @@ void bli_gemm_front
//else // homogeneous datatypes
#endif
// Load the pack schemas from the context and embed them into the objects
// for A and B. (Native contexts are initialized with the correct pack
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
// have made a copy and modified the schemas, so reading them from the
// context should be a safe bet at this point.) This is a sort of hack for
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
// to subsequently access the schemas from the control tree, which
// hopefully reduces some confusion, particularly in bli_packm_init().
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Next, we handle the possibility of needing to typecast alpha to the
// computation datatype and/or beta to the storage datatype of C.

View File

@@ -60,7 +60,8 @@ void bli_gemm_int
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( c ) ) {
if ( bli_obj_has_zero_dim( c ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return;
}
@@ -69,9 +70,9 @@ void bli_gemm_int
if ( bli_obj_has_zero_dim( a ) ||
bli_obj_has_zero_dim( b ) )
{
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return;
}
@@ -84,9 +85,9 @@ void bli_gemm_int
// This should never execute.
bli_abort();
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return;
}
@@ -100,14 +101,14 @@ void bli_gemm_int
// to B.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
// If beta is non-unit, typecast and apply it to the scalar attached
// to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Create the next node in the thrinfo_t structure.
@@ -116,17 +117,6 @@ void bli_gemm_int
// Extract the function pointer from the current control tree node.
f = bli_cntl_var_func( cntl );
// Somewhat hackish support for 4m1b method implementation.
{
ind_t im = bli_cntx_method( cntx );
if ( im != BLIS_NAT )
{
if ( im == BLIS_4M1B )
if ( f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2;
}
}
// Invoke the variant.
f
(
@@ -136,7 +126,7 @@ void bli_gemm_int
cntx,
rntm,
cntl,
thread
thread
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);

View File

@@ -224,7 +224,17 @@ void PASTEMAC(ch,varname) \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
function pointer type. Note that the virtual gemm ukernel is queried
instead of the native gemm ukernel. This is needed for certain
situations for the 1m method that require an extra layer of logic
to allow for handling (for example) complex values of beta. Also
note that under certain circumstances, the real-domain version of
this macrokernel will be called for 1m (NOT the complex version)
as an optimization. In these cases, the corresponding real-domain
slots within the cntx_t's virtual gemm ukernel func_t will contain
pointers to the *native* gemm ukernel, thanks to logic in the
context initialization function for the induced method (defined
in bli_cntx_ref.c). */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\

View File

@@ -368,8 +368,6 @@ void PASTEMAC2(chc,che,varname) \
then accumulate it into C via the xpbys_mxn macro. */ \
/*if ( 1 )*/ \
{ \
/*bli_auxinfo_set_dt_on_output( dte, &aux );*/ \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
@@ -392,48 +390,6 @@ void PASTEMAC2(chc,che,varname) \
c11, rs_c, cs_c \
); \
} \
/*
else if ( m_cur == MR && n_cur == NR ) \
{ \
bli_auxinfo_set_dt_on_output( dtc, &aux ); \
\
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
( ctype_e* )beta_cast, \
( ctype_e* )c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
bli_auxinfo_set_dt_on_output( dte, &aux ); \
\
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
PASTEMAC3(che,chc,chc,xpbys_mxn) \
( \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c \
); \
} \
*/ \
} \
} \
\

View File

@@ -187,6 +187,10 @@ mddm_t bli_gemm_md_ccr
bli_obj_induce_trans( b );
bli_obj_induce_trans( c );
// We must swap the pack schemas because the schemas were set before
// the objects were swapped.
bli_obj_swap_pack_schemas( a, b );
return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
}
@@ -230,7 +234,7 @@ mddm_t bli_gemm_md_ccr
bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc );
bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc );
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
@@ -288,6 +292,10 @@ mddm_t bli_gemm_md_crc
bli_obj_induce_trans( b );
bli_obj_induce_trans( c );
// We must swap the pack schemas because the schemas were set before
// the objects were swapped.
bli_obj_swap_pack_schemas( a, b );
return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
}
@@ -331,7 +339,7 @@ mddm_t bli_gemm_md_crc
bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc );
bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc );
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
@@ -405,8 +413,8 @@ mddm_t bli_gemm_md_rcc
// Use the 1r pack schema for both A and B with the conjugation
// of A or B toggled (to produce ar * br - ai * bi).
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, *cntx );
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, *cntx );
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a );
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b );
bli_obj_toggle_conj( b );
@@ -485,7 +493,7 @@ mddm_t bli_gemm_md_crr
}
#endif
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -523,7 +531,7 @@ mddm_t bli_gemm_md_rcr
// Overwrite the complex obj_t with its real-only alias.
*a = a_real;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -561,7 +569,7 @@ mddm_t bli_gemm_md_rrc
// Overwrite the complex obj_t with its real-only alias.
*b = b_real;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -591,7 +599,7 @@ mddm_t bli_gemm_md_rrr
doms.comp = BLIS_REAL;
doms.exec = BLIS_REAL;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
@@ -621,248 +629,10 @@ mddm_t bli_gemm_md_ccc
doms.comp = BLIS_COMPLEX;
doms.exec = BLIS_COMPLEX;
// Use the default pack schemas in the context.
// Use the default pack schemas in the objects.
// Return the computation and execution domains.
return doms;
}
// -----------------------------------------------------------------------------
#if 0
// Front-end for mixed-domain gemm: validates the operands, handles the
// alpha == 0 shortcut, optionally transposes the whole operation to suit the
// micro-kernel's storage preference, lets bli_gemm_md() adjust the objects
// and/or context, and finally launches bli_gemm_int via the thread decorator.
void bli_gemm_md_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	bli_init_once();

	// Validate the operands when error checking is enabled.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_check( alpha, a, b, beta, c, cntx );
	}

	// With alpha equal to zero there is no product to compute: scale C by
	// beta and finish.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Operate on aliases so that the transformations below never touch the
	// caller's objects.
	obj_t a_eff;
	obj_t b_eff;
	obj_t c_eff;

	bli_obj_alias_to( a, &a_eff );
	bli_obj_alias_to( b, &b_eff );
	bli_obj_alias_to( c, &c_eff );

	// If the micro-kernel dislikes the storage of C (row-stored C with a
	// column-preferring kernel, or vice versa), transpose the entire
	// operation: swap A and B and induce a transposition on all three.
	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_eff, BLIS_GEMM_UKR, cntx ) )
	{
		bli_obj_swap( &a_eff, &b_eff );

		bli_obj_induce_trans( &a_eff );
		bli_obj_induce_trans( &b_eff );
		bli_obj_induce_trans( &c_eff );
	}

	// Let bli_gemm_md() handle the mixed-domain case. It may modify the
	// objects or the context; if it modifies the context, cntx is adjusted
	// to point to the local copy.
	cntx_t cntx_md;

	bli_gemm_md( &a_eff, &b_eff, beta, &c_eff, &cntx_md, &cntx );

	// Problem dimensions as seen after the adjustments above.
	const dim_t m_eff = bli_obj_length( &c_eff );
	const dim_t n_eff = bli_obj_width( &c_eff );
	const dim_t k_eff = bli_obj_width( &a_eff );

	// Record the threading for each level within the runtime object.
	bli_rntm_set_ways_for_op
	(
	  BLIS_GEMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  m_eff,
	  n_eff,
	  k_eff,
	  rntm
	);

	// Invoke the internal back-end via the thread handler.
	bli_l3_thread_decorator
	(
	  bli_gemm_int,
	  BLIS_GEMM, // operation family id
	  alpha,
	  &a_eff,
	  &b_eff,
	  beta,
	  &c_eff,
	  cntx,
	  rntm,
	  cntl
	);
}
// -----------------------------------------------------------------------------
// Mixed-datatype gemm implemented by brute force: A, B and C are cast into
// temporary matrices of one common datatype, an ordinary gemm is run on the
// temporaries, and the result is cast back into the caller's C.
//
// The common datatype has the precision of C. It is complex unless A, B and
// C are all real, in which case it is projected to the real domain.
//
// Every exit path writes the temporary C back to the caller's C and releases
// the three temporaries. (Previously the alpha == 0 shortcut returned right
// after scaling the temporary, so the caller's C was never updated and the
// temporaries were leaked.)
void bli_gemm_md_zgemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	bli_init_once();

	obj_t a_local;
	obj_t b_local;
	obj_t c_local;

#if 1
	obj_t  am, bm, cm;
	obj_t* c_orig;

	{
		// Pick the common datatype: precision of C, complex domain unless
		// all three operands are real.
		num_t dt_c = ( bli_obj_is_single_prec( c ) ? BLIS_SCOMPLEX
		                                           : BLIS_DCOMPLEX );

		if ( bli_obj_is_real( a ) &&
		     bli_obj_is_real( b ) &&
		     bli_obj_is_real( c ) ) dt_c = bli_dt_proj_to_real( dt_c );

		dim_t m = bli_obj_length( c );
		dim_t n = bli_obj_width( c );
		dim_t k = bli_obj_width_after_trans( a );

		bli_obj_create( dt_c, m, k, 0, 0, &am );
		bli_obj_create( dt_c, k, n, 0, 0, &bm );
		bli_obj_create( dt_c, m, n, 0, 0, &cm );

		bli_castm( a, &am );
		bli_castm( b, &bm );
		bli_castm( c, &cm );

		// From here on a, b and c refer to the temporaries; c_orig
		// remembers where the result must eventually be written.
		c_orig = c;

		a = &am;
		b = &bm;
		c = &cm;
	}
#endif

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_check( alpha, a, b, beta, c, cntx );

	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		// If alpha is zero, only scale by beta. Do NOT return here: the
		// write-back and cleanup at the end of the function must still run.
		bli_scalm( beta, c );
	}
	else
	{
		// Alias A, B, and C in case we need to apply transformations.
		bli_obj_alias_to( a, &a_local );
		bli_obj_alias_to( b, &b_local );
		bli_obj_alias_to( c, &c_local );

		// An optimization: If C is stored by rows and the micro-kernel
		// prefers contiguous columns, or if C is stored by columns and the
		// micro-kernel prefers contiguous rows, transpose the entire
		// operation to allow the micro-kernel to access elements of C in
		// its preferred manner.
		if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
		{
			bli_obj_swap( &a_local, &b_local );

			bli_obj_induce_trans( &a_local );
			bli_obj_induce_trans( &b_local );
			bli_obj_induce_trans( &c_local );
		}

		{
			// A sort of hack for communicating the desired pack schemas for
			// A and B to bli_gemm_cntl_create() (via
			// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This
			// allows us to access the schemas from the control tree, which
			// hopefully reduces some confusion, particularly in
			// bli_packm_init().
			if ( bli_cntx_method( cntx ) == BLIS_NAT )
			{
				bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
				bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
			}
			else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
			{
				pack_t schema_a = bli_cntx_schema_a_block( cntx );
				pack_t schema_b = bli_cntx_schema_b_panel( cntx );

				bli_obj_set_pack_schema( schema_a, &a_local );
				bli_obj_set_pack_schema( schema_b, &b_local );
			}
		}

		// Parse and interpret the contents of the rntm_t object to properly
		// set the ways of parallelism for each loop, and then make any
		// additional modifications necessary for the current operation.
		bli_rntm_set_ways_for_op
		(
		  BLIS_GEMM,
		  BLIS_LEFT, // ignored for gemm/hemm/symm
		  bli_obj_length( &c_local ),
		  bli_obj_width( &c_local ),
		  bli_obj_width( &a_local ),
		  rntm
		);

		// Invoke the internal back-end via the thread handler.
		bli_l3_thread_decorator
		(
		  bli_gemm_int,
		  BLIS_GEMM, // operation family id
		  alpha,
		  &a_local,
		  &b_local,
		  beta,
		  &c_local,
		  cntx,
		  rntm,
		  cntl
		);
	}

#if 1
	{
		// Write the result back into the caller's C, then release the
		// temporaries. This runs on both the alpha == 0 path and the
		// regular path.
		bli_castm( &cm, c_orig );

		bli_obj_free( &am );
		bli_obj_free( &bm );
		bli_obj_free( &cm );
	}
#endif
}
#endif
#endif

View File

@@ -62,9 +62,6 @@ GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2 )
// Headers for induced algorithms:
GENPROT( gemm4mb_ker_var2 ) // 4m1b
//
// Prototype BLAS-like interfaces with void pointer operands.
@@ -94,6 +91,3 @@ void PASTEMAC(ch,varname) \
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
// Headers for induced algorithms:
INSERT_GENTPROT_BASIC0( gemm4mb_ker_var2 ) // 4m1b

View File

@@ -1,365 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp

// Signature of the type-specific 4m1b macrokernels held in ftypes below.
typedef void (*FUNCPTR_T)
     (
       pack_t     schema_a,
       pack_t     schema_b,
       dim_t      m,
       dim_t      n,
       dim_t      k,
       void*      alpha,
       void*      a, inc_t cs_a, inc_t is_a,
                  dim_t pd_a, inc_t ps_a,
       void*      b, inc_t rs_b, inc_t is_b,
                  dim_t pd_b, inc_t ps_b,
       void*      beta,
       void*      c, inc_t rs_c, inc_t cs_c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Table of typed implementations; indexed below by the execution datatype.
static FUNCPTR_T GENARRAY(ftypes,gemm4mb_ker_var2);

// Object-level entry point for the 4m1b macrokernel: unpacks everything the
// typed implementation needs from the (packed) A and B objects and from C,
// merges the scalars attached to A and B into a single alpha, and dispatches
// on the execution datatype of C.
void bli_gemm4mb_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The execution datatype of C selects the typed implementation.
	const num_t  dt      = bli_obj_exec_dt( c );

	// Pack schemas of A and B.
	const pack_t pschm_a = bli_obj_pack_schema( a );
	const pack_t pschm_b = bli_obj_pack_schema( b );

	// Problem dimensions.
	const dim_t  m       = bli_obj_length( c );
	const dim_t  n       = bli_obj_width( c );
	const dim_t  k       = bli_obj_width( a );

	// Buffer, strides and panel geometry of A.
	void*        a_buf   = bli_obj_buffer_at_off( a );
	const inc_t  a_cs    = bli_obj_col_stride( a );
	const inc_t  a_is    = bli_obj_imag_stride( a );
	const dim_t  a_pd    = bli_obj_panel_dim( a );
	const inc_t  a_ps    = bli_obj_panel_stride( a );

	// Buffer, strides and panel geometry of B.
	void*        b_buf   = bli_obj_buffer_at_off( b );
	const inc_t  b_rs    = bli_obj_row_stride( b );
	const inc_t  b_is    = bli_obj_imag_stride( b );
	const dim_t  b_pd    = bli_obj_panel_dim( b );
	const inc_t  b_ps    = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void*        c_buf   = bli_obj_buffer_at_off( c );
	const inc_t  c_rs    = bli_obj_row_stride( c );
	const inc_t  c_cs    = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together.
	// The buffer of the second operand of bli_mulsc() is what gets used as
	// alpha below.
	obj_t kappa_a;
	obj_t kappa_ab;

	bli_obj_scalar_detach( a, &kappa_a );
	bli_obj_scalar_detach( b, &kappa_ab );
	bli_mulsc( &kappa_a, &kappa_ab );

	// Internal scalar buffers: the merged alpha, and the beta attached to C.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &kappa_ab );
	void* beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation and invoke it.
	ftypes[dt]
	(
	  pschm_a,
	  pschm_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, a_cs, a_is,
	  a_pd, a_ps,
	  b_buf, b_rs, b_is,
	  b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Template for the type-specific 4m1b gemm macrokernels, instantiated by
// INSERT_GENTFUNC_BASIC0 at the bottom.
//
// The generated function walks C in MR x NR tiles: an outer (jr) loop over
// NR-wide column panels of B and an inner (ir) loop over MR-tall row panels
// of A, calling the virtual gemm micro-kernel queried from the context for
// each tile. For every jr iteration the ir loop is executed twice: first
// with the B schema in the auxinfo_t set to BLIS_PACKED_COL_PANELS_RO and
// the caller's beta, then with BLIS_PACKED_COL_PANELS_IO and beta replaced
// by one.
//
// Work is divided among threads by starting each loop at the thread's work
// id and striding by the number of threads at that level (jr from `thread`,
// ir from its sub-node).
//
// Partial tiles at the bottom/right edges are computed into the local MR x NR
// buffer ct and then folded into C with xpbys_mxn.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Its strides are chosen to match the
micro-kernel's storage preference (column- or row-contiguous). */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t ii; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial (edge) panel counts as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. (The schema
of B is overwritten inside the ii loop below.) */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* Thread info: `thread` drives the jr loop, its sub-node the ir loop. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
dim_t jr_inc = jr_num_threads; \
dim_t ir_inc = ir_num_threads; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* In the 4mb method, we execute the ir loop twice: once for b_r
and once for b_i. */ \
for ( ii = 0; ii < 2; ++ii ) \
{ \
ctype* restrict beta_use; \
\
/* First pass: B schema RO, caller's beta. Second pass: B schema IO,
beta of one, so C as left by the first pass is not rescaled. */ \
if ( ii == 0 ) \
{ \
bli_auxinfo_set_schema_b( BLIS_PACKED_COL_PANELS_RO, &aux ); \
beta_use = beta_cast; \
} \
else \
{ \
bli_auxinfo_set_schema_b( BLIS_PACKED_COL_PANELS_IO, &aux ); \
beta_use = one; \
} \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. On this
thread's last ir iteration the next A panel wraps to the start
of A and B advances (wrapping to the start of B on the last jr
iteration). */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var3 (4m1b): c before", 8, 6, c11, rs_c, cs_c, "%4.1f", "" );*/ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_use, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var3 (4m1b): c after", 8, 6, c11, rs_c, cs_c, "%4.1f", "" );*/ \
} \
else \
{ \
/* Invoke the gemm micro-kernel, writing a full MR x NR tile into
the temporary buffer ct (beta of zero). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_use, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
/*printf( "gemm_ker_var3 (4m1b): returning\n" );*/ \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var3: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var3: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( gemm4mb_ker_var2 )

View File

@@ -1,363 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Name of the function pointer type used to dispatch to the type-specific
// implementations of this macro-kernel.
#define FUNCPTR_T gemm_fp
// Signature shared by every type-specific gemm3m2_ker_var2 implementation.
// The argument list mirrors, in order, the values that
// bli_gemm3m2_ker_var2() extracts from its obj_t operands before dispatching.
typedef void (*FUNCPTR_T)(
// Pack schemas of A and B.
pack_t schema_a,
pack_t schema_b,
// Problem dimensions.
dim_t m,
dim_t n,
dim_t k,
// Scalar applied to the product A * B.
void* alpha,
// A: buffer, column stride, imaginary stride, panel dimension, panel stride.
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
// B: buffer, row stride, imaginary stride, panel dimension, panel stride.
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
// Scalar applied to C.
void* beta,
// C: buffer, row stride, column stride.
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
thrinfo_t* thread
);
// Table of type-specific implementations; bli_gemm3m2_ker_var2() indexes it
// with the execution datatype of C (ftypes[dt_exec]).
static FUNCPTR_T GENARRAY(ftypes,gemm3m2_ker_var2);
// Object-based entry point for the 3m2 gemm macro-kernel.
//
// Pulls the pack schemas, dimensions, buffers, and strides out of the A, B,
// and C objects, merges the scalars attached to A and B into a single alpha,
// and forwards everything to the type-specific implementation selected by
// the execution datatype of C. The cntl argument is not referenced here.
void bli_gemm3m2_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // The execution datatype of C selects the type-specific kernel below.
    const num_t exec_dt = bli_obj_exec_dt( c );

    // Pack schemas of the two input operands.
    const pack_t a_schema = bli_obj_pack_schema( a );
    const pack_t b_schema = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n, and k is the width of A.
    const dim_t m_dim = bli_obj_length( c );
    const dim_t n_dim = bli_obj_width( c );
    const dim_t k_dim = bli_obj_width( a );

    // Buffer, strides, and panel geometry of A.
    void* const a_buf       = bli_obj_buffer_at_off( a );
    const inc_t a_col_str   = bli_obj_col_stride( a );
    const inc_t a_imag_str  = bli_obj_imag_stride( a );
    const dim_t a_panel_dim = bli_obj_panel_dim( a );
    const inc_t a_panel_str = bli_obj_panel_stride( a );

    // Buffer, strides, and panel geometry of B.
    void* const b_buf       = bli_obj_buffer_at_off( b );
    const inc_t b_row_str   = bli_obj_row_stride( b );
    const inc_t b_imag_str  = bli_obj_imag_stride( b );
    const dim_t b_panel_dim = bli_obj_panel_dim( b );
    const inc_t b_panel_str = bli_obj_panel_stride( b );

    // Buffer and strides of the output matrix C.
    void* const c_buf     = bli_obj_buffer_at_off( c );
    const inc_t c_row_str = bli_obj_row_stride( c );
    const inc_t c_col_str = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and multiply them together.
    // The merged value is read back from the second operand of bli_mulsc().
    obj_t a_scalar;
    obj_t ab_scalar;

    bli_obj_scalar_detach( a, &a_scalar );
    bli_obj_scalar_detach( b, &ab_scalar );
    bli_mulsc( &a_scalar, &ab_scalar );

    // Internal scalar buffers: alpha is the merged A/B scalar, beta is the
    // scalar attached to C.
    void* const alpha_buf = bli_obj_internal_scalar_buffer( &ab_scalar );
    void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Look up the implementation for the execution datatype and call it.
    const FUNCPTR_T ker_fp = ftypes[exec_dt];

    ker_fp
    (
      a_schema,
      b_schema,
      m_dim,
      n_dim,
      k_dim,
      alpha_buf,
      a_buf, a_col_str, a_imag_str,
      a_panel_dim, a_panel_str,
      b_buf, b_row_str, b_imag_str,
      b_panel_dim, b_panel_str,
      beta_buf,
      c_buf, c_row_str, c_col_str,
      cntx,
      thread
    );
}
/*
   Type-templated body of the 3m2 gemm macro-kernel. For each datatype that
   INSERT_GENTFUNC_BASIC0 instantiates, this defines PASTEMAC(ch,varname)
   with the FUNCPTR_T signature.

   The function walks C in NR-column by MR-row tiles. For every column panel
   (jr loop) it runs the row loop (ir loop) three times, once per pack schema
   pair (RO, IO, RPI), invoking the gemm micro-kernel obtained from the
   context on each tile. Only the first of the three passes applies beta; the
   other two use a beta of one. Tiles at the bottom/right edge are computed
   into the temporary buffer ct and then merged into C via xpbys_mxn.

   Notes on arguments, as used by this body:
   - pd_a and pd_b are taken as MR and NR, respectively.
   - ps_a and ps_b are the steps between consecutive micro-panels of A and B.
   - cs_a and rs_b are only referenced by the commented-out PACKMR/PACKNR
     aliases.
   - schema_a and schema_b are stored in the auxinfo_t up front, but both
     fields are overwritten at the start of each of the three passes.
   - thread supplies the jr-loop partitioning; its sub-node supplies the
     ir-loop partitioning.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
/* Strides of ct follow the micro-kernel's storage preference: unit row
stride when it prefers columns, unit column stride otherwise. */ \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t ii; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial edge panel counts as one additional iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The jr loop is partitioned using 'thread'; the ir loop is partitioned
using its sub-node ('caucus'). */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* In the 3m2 method, we execute the ir loop thrice: once for
a_r[ir] * b_r, once for a_i[ir] * b_i, and once for
a_{r+i}[ir] * b_{r+i}. */ \
for ( ii = 0; ii < 3; ++ii ) \
{ \
ctype* restrict beta_use; \
\
/* Select the pack schemas for this pass. Only the first pass applies
the caller's beta; the later passes use a beta of one. */ \
if ( ii == 0 ) \
{ \
bli_auxinfo_set_schema_a( BLIS_PACKED_ROW_PANELS_RO, &aux ); \
bli_auxinfo_set_schema_b( BLIS_PACKED_COL_PANELS_RO, &aux ); \
beta_use = beta_cast; \
} \
else if ( ii == 1 ) \
{ \
bli_auxinfo_set_schema_a( BLIS_PACKED_ROW_PANELS_IO, &aux ); \
bli_auxinfo_set_schema_b( BLIS_PACKED_COL_PANELS_IO, &aux ); \
beta_use = one; \
} \
else \
{ \
bli_auxinfo_set_schema_a( BLIS_PACKED_ROW_PANELS_RPI, &aux ); \
bli_auxinfo_set_schema_b( BLIS_PACKED_COL_PANELS_RPI, &aux ); \
beta_use = one; \
} \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
/* NOTE(review): the gemm4mb_ker_var2 kernel calls these helpers as
bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ) and
bli_is_last_iter_rr( ... ); confirm which signatures are current. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_use, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_use, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( gemm3m2_ker_var2 )

View File

@@ -54,10 +54,21 @@ void bli_gemmt_front
obj_t b_local;
obj_t c_local;
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( c ) )
{
return;
}
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
// and return early.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
bli_obj_has_zero_dim( a ) ||
bli_obj_has_zero_dim( b ) )
{
bli_scalm( beta, c );
return;
}
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( a, &a_local );
@@ -86,20 +97,8 @@ void bli_gemmt_front
//else // homogeneous datatypes
#endif
// Load the pack schemas from the context and embed them into the objects
// for A and B. (Native contexts are initialized with the correct pack
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
// have made a copy and modified the schemas, so reading them from the
// context should be a safe bet at this point.) This is a sort of hack for
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
// to subsequently access the schemas from the control tree, which
// hopefully reduces some confusion, particularly in bli_packm_init().
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Set the pack schemas within the objects, as appropriate.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Next, we handle the possibility of needing to typecast alpha to the
// computation datatype and/or beta to the storage datatype of C.

View File

@@ -54,10 +54,6 @@ void bli_hemm_front
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_hemm_check( side, alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
@@ -131,6 +127,9 @@ void bli_hemm_front
}
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -151,17 +150,6 @@ void bli_hemm_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -56,10 +56,6 @@ void bli_her2k_front
obj_t b_local;
obj_t ah_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_her2k_check( alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta, zero the imaginary components of
// the diagonal elements, and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
@@ -84,12 +80,6 @@ void bli_her2k_front
bli_obj_induce_trans( &ah_local );
bli_obj_toggle_conj( &ah_local );
// Initialize a conjugated copy of alpha.
bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
BLIS_CONJUGATE,
alpha,
&alpha_conj );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
@@ -107,6 +97,16 @@ void bli_her2k_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &bh_local, &c_local, cntx );
bli_l3_set_schemas( &b_local, &ah_local, &c_local, cntx );
// Initialize a conjugated copy of alpha.
bli_obj_scalar_init_detached_copy_of( bli_obj_dt( a ),
BLIS_CONJUGATE,
alpha,
&alpha_conj );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -120,19 +120,6 @@ void bli_her2k_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &bh_local );
bli_obj_set_pack_schema( schema_a, &b_local );
bli_obj_set_pack_schema( schema_b, &ah_local );
// Invoke herk twice, using beta only the first time.
// Invoke the internal back-end.

View File

@@ -52,10 +52,6 @@ void bli_herk_front
obj_t ah_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_herk_check( alpha, a, beta, c, cntx );
// If alpha is zero, scale by beta, zero the imaginary components of
// the diagonal elements, and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
@@ -87,6 +83,9 @@ void bli_herk_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &ah_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -100,17 +99,6 @@ void bli_herk_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &ah_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -279,9 +279,6 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the

View File

@@ -281,9 +281,6 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the

View File

@@ -54,10 +54,6 @@ void bli_symm_front
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_symm_check( side, alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
@@ -130,6 +126,9 @@ void bli_symm_front
}
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -150,17 +149,6 @@ void bli_symm_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -55,10 +55,6 @@ void bli_syr2k_front
obj_t b_local;
obj_t at_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_syr2k_check( alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
@@ -88,6 +84,10 @@ void bli_syr2k_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &bt_local, &c_local, cntx );
bli_l3_set_schemas( &b_local, &at_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -101,19 +101,6 @@ void bli_syr2k_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &bt_local );
bli_obj_set_pack_schema( schema_a, &b_local );
bli_obj_set_pack_schema( schema_b, &at_local );
// Invoke herk twice, using beta only the first time.
// Invoke the internal back-end.

View File

@@ -61,9 +61,13 @@ void bli_syrk_front
bli_obj_alias_to( a, &at_local );
bli_obj_induce_trans( &at_local );
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_syrk_check( alpha, a, beta, c, cntx );
#if 0
#ifdef BLIS_ENABLE_SMALL_MATRIX
gint_t status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local,
cntx, cntl );
if ( status == BLIS_SUCCESS ) return;
#endif
#endif
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
@@ -81,6 +85,9 @@ void bli_syrk_front
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &at_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -94,17 +101,6 @@ void bli_syrk_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &at_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -52,10 +52,6 @@ void bli_trmm_front
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trmm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
@@ -148,6 +144,9 @@ void bli_trmm_front
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -168,17 +167,6 @@ void bli_trmm_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -54,7 +54,7 @@ void bli_trmm_front
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trmm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
bli_trmm_check( side, alpha, a, b, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
@@ -148,6 +148,9 @@ void bli_trmm_front
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -177,17 +180,6 @@ void bli_trmm_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -203,9 +203,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
@@ -243,30 +240,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
@@ -317,9 +290,6 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
@@ -387,12 +357,12 @@ void PASTEMAC(ch,varname) \
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
ps_a_cur = is_a_cur; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
b1_i = b1 + off_a1011 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
@@ -408,10 +378,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -479,10 +445,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \

View File

@@ -203,9 +203,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
@@ -243,30 +240,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
@@ -278,7 +251,7 @@ void PASTEMAC(ch,varname) \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
b_cast = b_cast + i * PACKNR; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
@@ -324,9 +297,6 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
@@ -394,12 +364,12 @@ void PASTEMAC(ch,varname) \
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
ps_a_cur = is_a_cur; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
b1_i = b1 + off_a1112 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
@@ -415,10 +385,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -486,10 +452,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \

View File

@@ -203,9 +203,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
@@ -243,30 +240,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
@@ -278,7 +251,7 @@ void PASTEMAC(ch,varname) \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
a_cast = a_cast + j * PACKMR; \
} \
\
/* If there is a zero region to the right of where the diagonal
@@ -324,9 +297,6 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
@@ -387,10 +357,6 @@ void PASTEMAC(ch,varname) \
b2 = b1; \
\
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
@@ -504,13 +470,9 @@ void PASTEMAC(ch,varname) \
intersecting micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
ps_b_cur = is_b_cur; \
\
if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -522,7 +484,7 @@ void PASTEMAC(ch,varname) \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
a1_i = a1 + off_b1121 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \

View File

@@ -203,9 +203,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
@@ -243,30 +240,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
@@ -325,9 +298,6 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
@@ -409,13 +379,9 @@ void PASTEMAC(ch,varname) \
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
ps_b_cur = is_b_cur; \
\
if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -427,7 +393,7 @@ void PASTEMAC(ch,varname) \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
a1_i = a1 + off_b0111 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
@@ -542,10 +508,6 @@ void PASTEMAC(ch,varname) \
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \

View File

@@ -54,10 +54,6 @@ void bli_trmm3_front
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trmm_check( side, alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
@@ -141,6 +137,9 @@ void bli_trmm3_front
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -161,17 +160,6 @@ void bli_trmm3_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -56,9 +56,12 @@ void bli_trsm_front
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_trsm_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );
#if 0
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl );
if ( status == BLIS_SUCCESS ) return;
#endif
#endif
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
@@ -120,6 +123,9 @@ void bli_trsm_front
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
@@ -145,25 +151,6 @@ void bli_trsm_front
rntm
);
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
// in bli_packm_init().
if ( bli_cntx_method( cntx ) == BLIS_NAT )
{
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
}
else // if ( bli_cntx_method( cntx_trsm ) != BLIS_NAT )
{
pack_t schema_a = bli_cntx_schema_a_block( cntx );
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
bli_obj_set_pack_schema( schema_a, &a_local );
bli_obj_set_pack_schema( schema_b, &b_local );
}
// Invoke the internal back-end.
bli_l3_thread_decorator
(

View File

@@ -217,9 +217,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
@@ -265,29 +262,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
@@ -355,9 +329,6 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
@@ -427,18 +398,18 @@ void PASTEMAC(ch,varname) \
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
ps_a_cur = is_a_cur; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
a11 = a1 + k_a10 * PACKMR; \
/*a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );*/ \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b01 = b1 + off_a10 * PACKNR; \
b11 = b1 + off_a11 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
@@ -454,10 +425,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -518,10 +485,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -569,44 +532,11 @@ void PASTEMAC(ch,varname) \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \

View File

@@ -218,9 +218,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
@@ -266,29 +263,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
@@ -300,7 +274,7 @@ void PASTEMAC(ch,varname) \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
b_cast = b_cast + i * PACKNR; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
@@ -363,9 +337,6 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
@@ -437,18 +408,18 @@ void PASTEMAC(ch,varname) \
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
ps_a_cur = is_a_cur; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
a12 = a1 + k_a11 * PACKMR; \
/*a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );*/ \
\
/* Compute the addresses of the block B11 and the panel
B21. */ \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
b11 = b1 + off_a11 * PACKNR; \
b21 = b1 + off_a12 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
@@ -464,10 +435,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
@@ -528,10 +495,6 @@ void PASTEMAC(ch,varname) \
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \

View File

@@ -224,9 +224,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
@@ -280,29 +277,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
@@ -314,7 +288,7 @@ void PASTEMAC(ch,varname) \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
a_cast = a_cast + j * PACKMR; \
} \
\
/* If there is a zero region to the right of where the diagonal
@@ -386,9 +360,6 @@ void PASTEMAC(ch,varname) \
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
b1 = b_cast; \
c1 = c_cast; \
@@ -430,20 +401,14 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the triangular block B11 and the
panel B21. */ \
b11 = b1; \
/* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \
b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \
b11 = b1; \
b21 = b1 + k_b11 * PACKNR; \
/*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
ps_b_cur = is_b_cur; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -457,8 +422,8 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A11 block and A12 panel. */ \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
a11 = a1 + off_b11 * PACKMR; \
a12 = a1 + off_b21 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
@@ -525,12 +490,6 @@ void PASTEMAC(ch,varname) \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \

View File

@@ -222,9 +222,6 @@ void PASTEMAC(ch,varname) \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
@@ -278,29 +275,6 @@ void PASTEMAC(ch,varname) \
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
@@ -380,9 +354,6 @@ void PASTEMAC(ch,varname) \
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
b1 = b_cast; \
c1 = c_cast; \
@@ -422,20 +393,14 @@ void PASTEMAC(ch,varname) \
\
/* Compute the addresses of the panel B01 and the triangular
block B11. */ \
b01 = b1; \
/* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \
b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \
b01 = b1; \
b11 = b1 + k_b01 * PACKNR; \
/*b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
ps_b_cur = is_b_cur; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -449,8 +414,8 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A10 panel and A11 block. */ \
a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
a10 = a1 + off_b01 * PACKMR; \
a11 = a1 + off_b11 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
@@ -517,12 +482,6 @@ void PASTEMAC(ch,varname) \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \

View File

@@ -74,13 +74,6 @@ BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
return ai->ps_b;
}
#if 0
// Return the dt_on_output field stored in the given auxinfo_t.
// NOTE(review): the return type here is inc_t, whereas the matching setter
// bli_auxinfo_set_dt_on_output() takes a num_t -- confirm which is intended.
BLIS_INLINE inc_t bli_auxinfo_dt_on_output( auxinfo_t* ai )
{
return ai->dt_on_output;
}
#endif
// auxinfo_t field modification
@@ -125,12 +118,5 @@ BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
ai->ps_b = ps;
}
#if 0
// Store the given datatype value in the dt_on_output field of the auxinfo_t
// pointed to by ai.
BLIS_INLINE void bli_auxinfo_set_dt_on_output( num_t dt_on_output, auxinfo_t* ai )
{
ai->dt_on_output = dt_on_output;
}
#endif
#endif

View File

@@ -224,12 +224,6 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
double msclr = msclrs[ i ];
blksz_t* blksz = blkszs[ i ];
// NOTE: This is a bug! We need to grab the actual blocksize
// multiple, which is not at blkszs[i], but rather somewhere else
// in the array. In order to fix this, you probably need to store
// the contents of blkszs (and all the other arrays) by bs_id
// rather than i in the first loop.
blksz_t* bmult = blkszs[ i ];
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
@@ -248,20 +242,6 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// blocksize object.
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
// Perform rounding to ensure the newly scaled values are still
// multiples of their register blocksize multiples. But only
// perform this rounding when the blocksize id is not equal to
// the blocksize multiple id (ie: we don't round down scaled
// register blocksizes since they are their own multiples).
// Also, we skip the rounding for 1m since it should never need
// such rounding.
if ( bs_id != bm_id && method != BLIS_1M )
{
// Round the newly-scaled blocksizes down to their multiple.
bli_blksz_reduce_def_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_def_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
}
}
// Similarly, if the maximum blocksize scalar is non-unit, we need
@@ -272,20 +252,6 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// blocksize object.
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
// Perform rounding to ensure the newly scaled values are still
// multiples of their register blocksize multiples. But only
// perform this rounding when the blocksize id is not equal to
// the blocksize multiple id (ie: we don't round down scaled
// register blocksizes since they are their own multiples).
// Also, we skip the rounding for 1m since it should never need
// such rounding.
if ( bs_id != bm_id && method != BLIS_1M )
{
// Round the newly-scaled blocksizes down to their multiple.
bli_blksz_reduce_max_to( BLIS_FLOAT, bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_max_to( BLIS_DOUBLE, bmult, BLIS_DCOMPLEX, cntx_blksz );
}
}
// Copy the blocksize multiple id into the context.
@@ -323,13 +289,14 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
// -----------------------------------------------------------------------------
void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... )
{
/* Example prototypes:
void bli_gks_cntx_set_ind_blkszs
(
ind_t method != BLIS_NAT,
num_t dt,
dim_t n_bs,
bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0,
bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1,
@@ -346,6 +313,9 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
dim_t i;
err_t r_val;
// Project the given datatype to the real domain. This will be used later on.
num_t dt_real = bli_dt_proj_to_real( dt );
// Return early if called with BLIS_NAT.
if ( method == BLIS_NAT ) return;
@@ -418,77 +388,35 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... )
//blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
// Query the blocksize multiple's blocksize id.
bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx );
// Query the context for the blksz_t object associated with the
// current blocksize id, and also query the object corresponding
// to the blocksize multiple.
blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx );
blksz_t* cntx_bmult = bli_cntx_get_bmult( bs_id, cntx );
blksz_t* cntx_trsm_blksz = bli_cntx_get_trsm_blksz( bs_id, cntx );
// Copy the real domain values of the blksz_t object into the
// the complex domain slots of the same object.
bli_blksz_copy_dt( BLIS_FLOAT, cntx_blksz, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_copy_dt( BLIS_DOUBLE, cntx_blksz, BLIS_DCOMPLEX, cntx_blksz );
bli_blksz_copy_dt( BLIS_FLOAT, cntx_blksz, BLIS_SCOMPLEX, cntx_trsm_blksz);
bli_blksz_copy_dt( BLIS_DOUBLE, cntx_blksz, BLIS_DCOMPLEX, cntx_trsm_blksz);
// Copy the real domain value of the blksz_t object into the
// corresponding complex domain slot of the same object.
bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz );
bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_trsm_blksz );
// If the default blocksize scalar is non-unit, we need to scale
// the complex domain default blocksizes.
if ( dsclr != 1.0 )
{
// Scale the complex domain default blocksize values in the
// blocksize object.
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_trsm_blksz);
bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_trsm_blksz);
// Perform rounding to ensure the newly scaled values are still
// multiples of their register blocksize multiples. But only
// perform this rounding when the blocksize id is not equal to
// the blocksize multiple id (ie: we don't round down scaled
// register blocksizes since they are their own multiples).
// Also, we skip the rounding for 1m since it should never need
// such rounding.
if ( bs_id != bm_id && method != BLIS_1M )
{
// Round the newly-scaled blocksizes down to their multiple.
bli_blksz_reduce_def_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_def_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz );
bli_blksz_reduce_def_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_trsm_blksz );
bli_blksz_reduce_def_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_trsm_blksz );
}
// Scale the default blocksize value corresponding to the given
// datatype.
bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz );
bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_trsm_blksz );
}
// Similarly, if the maximum blocksize scalar is non-unit, we need
// to scale the complex domain maximum blocksizes.
if ( msclr != 1.0 )
{
// Scale the complex domain maximum blocksize values in the
// blocksize object.
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_trsm_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_trsm_blksz );
// Perform rounding to ensure the newly scaled values are still
// multiples of their register blocksize multiples. But only
// perform this rounding when the blocksize id is not equal to
// the blocksize multiple id (ie: we don't round down scaled
// register blocksizes since they are their own multiples).
// Also, we skip the rounding for 1m since it should never need
// such rounding.
if ( bs_id != bm_id && method != BLIS_1M )
{
// Round the newly-scaled blocksizes down to their multiple.
bli_blksz_reduce_max_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_blksz );
bli_blksz_reduce_max_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_blksz );
bli_blksz_reduce_max_to( BLIS_FLOAT, cntx_bmult, BLIS_SCOMPLEX, cntx_trsm_blksz );
bli_blksz_reduce_max_to( BLIS_DOUBLE, cntx_bmult, BLIS_DCOMPLEX, cntx_trsm_blksz );
}
// Scale the maximum blocksize value corresponding to the given
// datatype.
bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz );
bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_trsm_blksz );
}
}
}

View File

@@ -63,9 +63,6 @@ typedef struct cntx_s
func_t* unpackm_kers;
ind_t method;
pack_t schema_a;
pack_t schema_b;
pack_t schema_c;
} cntx_t;
*/
@@ -156,18 +153,6 @@ BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
return cntx->method;
}
BLIS_INLINE pack_t bli_cntx_schema_a_block( cntx_t* cntx )
{
return cntx->schema_a_block;
}
BLIS_INLINE pack_t bli_cntx_schema_b_panel( cntx_t* cntx )
{
return cntx->schema_b_panel;
}
BLIS_INLINE pack_t bli_cntx_schema_c_panel( cntx_t* cntx )
{
return cntx->schema_c_panel;
}
// -----------------------------------------------------------------------------
@@ -179,23 +164,6 @@ BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx )
{
cntx->method = method;
}
BLIS_INLINE void bli_cntx_set_schema_a_block( pack_t schema, cntx_t* cntx )
{
cntx->schema_a_block = schema;
}
BLIS_INLINE void bli_cntx_set_schema_b_panel( pack_t schema, cntx_t* cntx )
{
cntx->schema_b_panel = schema;
}
BLIS_INLINE void bli_cntx_set_schema_c_panel( pack_t schema, cntx_t* cntx )
{
cntx->schema_c_panel = schema;
}
BLIS_INLINE void bli_cntx_set_schema_ab_blockpanel( pack_t sa, pack_t sb, cntx_t* cntx )
{
bli_cntx_set_schema_a_block( sa, cntx );
bli_cntx_set_schema_b_panel( sb, cntx );
}
// -----------------------------------------------------------------------------
@@ -942,7 +910,7 @@ BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... );

View File

@@ -50,7 +50,7 @@ static void_fp cntx_ref_init[ BLIS_NUM_ARCHS ];
// Define a function pointer type for context initialization functions.
typedef void (*nat_cntx_init_ft)( cntx_t* cntx );
typedef void (*ref_cntx_init_ft)( cntx_t* cntx );
typedef void (*ind_cntx_init_ft)( ind_t method, num_t dt, cntx_t* cntx );
typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx );
// -----------------------------------------------------------------------------
@@ -637,7 +637,7 @@ cntx_t* bli_gks_query_ind_cntx
// function for the current induced method. (That function assumes
// that the context is pre-initialized with values for native
// execution.)
f( ind, dt, gks_id_ind );
f( ind, gks_id_ind );
}
}
// END CRITICAL SECTION

View File

@@ -36,11 +36,6 @@
static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] =
{
/* 3mh */ "3mh",
/* 3m1 */ "3m1",
/* 4mh */ "4mh",
/* 4m1b */ "4m1b",
/* 4m1a */ "4m1a",
/* 1m */ "1m",
/* nat */ "native",
};
@@ -147,8 +142,9 @@ bool bli_ind_oper_is_impl( opid_t oper, ind_t method )
if ( bli_opid_is_level3( oper ) )
{
// Look up whether its func_t pointer in the table is NULL.
is_impl = ( bli_l3_ind_oper_get_func( oper, method ) != NULL );
// Look up whether the operation is implemented for the given induced
// method id.
is_impl = bli_l3_ind_oper_is_impl( oper, method );
}
else
{
@@ -162,39 +158,6 @@ bool bli_ind_oper_is_impl( opid_t oper, ind_t method )
return is_impl;
}
#if 0
bool bli_ind_oper_has_avail( opid_t oper, num_t dt )
{
ind_t method = bli_ind_oper_find_avail( oper, dt );
if ( method == BLIS_NAT ) return FALSE;
else return TRUE;
}
#endif
void_fp bli_ind_oper_get_avail( opid_t oper, num_t dt )
{
void_fp func_p;
if ( bli_opid_is_level3( oper ) )
{
ind_t method = bli_ind_oper_find_avail( oper, dt );
func_p = bli_l3_ind_oper_get_func( oper, method );
}
else
{
// Currently, any operation that is not level-3 does not
// have induced method implementations. (This should actually
// assign the pointer to be the native front-end, but for
// now there are no calls to bli_ind_oper_get_avail() in the
// context of level-2 operations.
func_p = NULL;
}
return func_p;
}
ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt )
{
ind_t method;

View File

@@ -38,16 +38,6 @@
// level-3 induced method management
#include "bli_l3_ind.h"
// level-3 object APIs
#include "bli_l3_ind_oapi.h"
// level-3 typed APIs
#include "bli_l3_ind_tapi.h"
// level-3 cntx initialization
#include "bli_cntx_ind_stage.h"
void bli_ind_init( void );
void bli_ind_finalize( void );
@@ -62,8 +52,6 @@ BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt );
BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );
BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method );
//bool bli_ind_oper_has_avail( opid_t oper, num_t dt );
BLIS_EXPORT_BLIS void_fp bli_ind_oper_get_avail( opid_t oper, num_t dt );
BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt );
BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt );

View File

@@ -538,8 +538,23 @@ void dzgemm_blis_impl
bli_obj_set_conjtrans( blis_transb, &bo );
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
//bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* The operands mix real and complex datatypes, so we assume no
induced method is available and keep native execution. */
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(gemm,_front)( &alphao, &ao, &bo, &betao, &co, cntx, &rntm_l, NULL );
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -58,79 +58,95 @@ void PASTEF77S(ch,blasname) \
ftype* c, const f77_int* ldc \
) \
{ \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
inc_t rs_a, cs_a; \
inc_t rs_b, cs_b; \
inc_t rs_c, cs_c; \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
inc_t rs_a, cs_a; \
inc_t rs_b, cs_b; \
inc_t rs_c, cs_c; \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
/* Initialize BLIS. */ \
bli_init_auto(); \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Quick return if possible. */ \
if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \
&& PASTEMAC(ch,eq1)( *beta ) )) \
{ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
return; \
} \
/* Quick return if possible. */ \
if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \
&& PASTEMAC(ch,eq1)( *beta ) )) \
{ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
return; \
} \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
rs_a = 1; \
cs_a = *lda; \
rs_b = 1; \
cs_b = *ldb; \
rs_c = 1; \
cs_c = *ldc; \
/* Set the row and column strides of the matrix operands. */ \
rs_a = 1; \
cs_a = *lda; \
rs_b = 1; \
cs_b = *ldb; \
rs_c = 1; \
cs_c = *ldc; \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_transa, \
blis_transb, \
m0, \
n0, \
k0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
(ftype*)b, rs_b, cs_b, \
(ftype*)beta, \
(ftype*)c, rs_c, cs_c, \
NULL, \
NULL \
); \
/* As a placeholder, invoke 1m since BLIS no longer contains an
official 3m implementation. Note that we do this by inlining an
abbreviated version of bli_gemm_ex() so that we can bypass
consideration of sup, which doesn't make sense in this context. */ \
{ \
cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
rntm_t rntm_l; \
rntm_t* rntm = &rntm_l; \
bli_rntm_init_from_global( rntm ); \
\
/* Note that we MUST disable sup handling since it could redirect
execution for some problem sizes to a non-3m implementation. */ \
bli_rntm_disable_l3_sup( rntm ); \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_transa, \
blis_transb, \
m0, \
n0, \
k0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
(ftype*)b, rs_b, cs_b, \
(ftype*)beta, \
(ftype*)c, rs_c, cs_c, \
cntx, \
rntm \
); \
} \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
} \
IF_BLIS_ENABLE_BLAS(\
void PASTEF77(ch,blasname) \
@@ -147,7 +163,7 @@ void PASTEF77(ch,blasname) \
ftype* c, const f77_int* ldc \
) \
{ \
PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
} \
)
@@ -170,94 +186,114 @@ void PASTEF77S(ch,blasname) \
ftype* c, const f77_int* ldc \
) \
{ \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
transb, \
m, \
n, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Quick return if possible. */ \
if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \
&& PASTEMAC(ch,eq1)( *beta ) )) \
{ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
return; \
} \
/* Quick return if possible. */ \
if ( *m == 0 || *n == 0 || (( PASTEMAC(ch,eq0)( *alpha ) || *k == 0) \
&& PASTEMAC(ch,eq1)( *beta ) )) \
{ \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
return; \
} \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
const inc_t rs_b = 1; \
const inc_t cs_b = *ldb; \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
/* Set the row and column strides of the matrix operands. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
const inc_t rs_b = 1; \
const inc_t cs_b = *ldb; \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
\
const num_t dt = PASTEMAC(ch,type); \
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t m0_a, n0_a; \
dim_t m0_b, n0_b; \
dim_t m0_a, n0_a; \
dim_t m0_b, n0_b; \
\
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
\
bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \
bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \
\
bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \
bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \
\
bli_obj_set_conjtrans( blis_transa, &ao ); \
bli_obj_set_conjtrans( blis_transb, &bo ); \
bli_obj_set_conjtrans( blis_transa, &ao ); \
bli_obj_set_conjtrans( blis_transb, &bo ); \
\
PASTEMAC(blisname,ind) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
NULL, \
NULL \
); \
/* As a placeholder, invoke 1m since BLIS no longer contains an
official 3m implementation. Note that we do this by inlining an
abbreviated version of bli_gemm_ex() so that we can bypass
consideration of sup, which doesn't make sense in this context. */ \
{ \
cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
rntm_t rntm_l; \
rntm_t* rntm = &rntm_l; \
bli_rntm_init_from_global( &rntm_l ); \
\
/* This is probably not needed given that we performed BLAS-style
parameter checking above, but bli_gemm_check() is ordinarily invoked
as part of bli_gemm_ex(). */ \
if ( bli_error_checking_is_enabled() ) \
bli_gemm_check( &alphao, &ao, &bo, &betao, &co, cntx ); \
\
PASTEMAC(blisname,_front) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
cntx, \
rntm, \
NULL \
); \
} \
\
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
} \
IF_BLIS_ENABLE_BLAS(\
void PASTEF77(ch,blasname) \
@@ -274,7 +310,7 @@ void PASTEF77(ch,blasname) \
ftype* c, const f77_int* ldc \
) \
{ \
PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
} \
)

View File

@@ -839,8 +839,20 @@ void dgemm_blis_impl
}
// fall back on native path when dgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
//bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(gemm,_front)( &alphao, &ao, &bo, &betao, &co, cntx, &rntm_l, NULL );
/* PASTEMAC(gemm, BLIS_OAPI_EX_SUF) */
/* ( */
@@ -1212,7 +1224,32 @@ void zgemm_blis_impl
}
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
//bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* As each matrix operand has a complex storage datatype, try to get an
induced method (if one is available and enabled). NOTE: Allowing
precisions to vary while using 1m, which is what we do here, is unique
to gemm; other level-3 operations use 1m only if all storage datatypes
are equal (and they ignore the computation precision). */
/* Find the highest priority induced method that is both enabled and
available for the current operation. (If an induced method is
available but not enabled, or simply unavailable, BLIS_NAT will
be returned here.) */
im = bli_gemmind_find_avail( dt );
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(gemm,_front)( &alphao, &ao, &bo, &betao, &co, cntx, &rntm_l, NULL );
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
@@ -1367,7 +1404,23 @@ void dzgemm_blis_impl
bli_obj_set_conjtrans( blis_transb, &bo );
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
//bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* The operands mix real and complex datatypes, so we assume no
induced method is available and keep native execution. */
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(gemm,_front)( &alphao, &ao, &bo, &betao, &co, cntx, &rntm_l, NULL );
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);

View File

@@ -782,15 +782,28 @@ void strsm_blis_impl
}
#endif
bli_trsmnat
(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL
);
//bli_trsmnat
//(
// blis_side,
// &alphao,
// &ao,
// &bo,
// NULL,
// NULL
//);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(trsm,_front)( blis_side, &alphao, &ao, &bo, cntx, &rntm_l, NULL ); \
AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), *side, *m, *n);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
@@ -1176,15 +1189,29 @@ void dtrsm_blis_impl
} // bli_cpuid_is_avx2fma3_supported
#endif// END of BLIS_ENABLE_SMALL_MATRIX_TRSM
bli_trsmnat
(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL
);
//bli_trsmnat
//(
// blis_side,
// &alphao,
// &ao,
// &bo,
// NULL,
// NULL
//);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(trsm,_front)( blis_side, &alphao, &ao, &bo, cntx, &rntm_l, NULL ); \
AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *side, *m, *n);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
/* Finalize BLIS. */
@@ -1626,15 +1653,28 @@ void ztrsm_blis_impl
} // bli_cpuid_is_avx2fma3_supported
#endif// END of BLIS_ENABLE_SMALL_MATRIX_TRSM
bli_trsmnat
(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL
);
//bli_trsmnat
//(
// blis_side,
// &alphao,
// &ao,
// &bo,
// NULL,
// NULL
//);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(trsm,_front)( blis_side, &alphao, &ao, &bo, cntx, &rntm_l, NULL ); \
AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *side, *m, *n);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
@@ -2010,15 +2050,28 @@ void ctrsm_blis_impl
} // bli_cpuid_is_avx2fma3_supported
#endif
bli_trsmnat
(
blis_side,
&alphao,
&ao,
&bo,
NULL,
NULL
);
//bli_trsmnat
//(
// blis_side,
// &alphao,
// &ao,
// &bo,
// NULL,
// NULL
//);
/* Default to using native execution. */
ind_t im = BLIS_NAT;
/* Obtain a valid context from the gks using the induced
method id determined above. */
cntx_t* cntx = bli_gks_query_ind_cntx( im, dt );
rntm_t rntm_l;
bli_rntm_init_from_global( &rntm_l );
/* Invoke the operation's front-end and request the default control tree. */
PASTEMAC(trsm,_front)( blis_side, &alphao, &ao, &bo, cntx, &rntm_l, NULL ); \
AOCL_DTL_LOG_TRSM_STATS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(c), *side, *m, *n);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)

View File

@@ -69,7 +69,6 @@ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
( \
ind_t method, \
num_t dt, \
cntx_t* cntx \
);

View File

@@ -1000,50 +1000,6 @@ BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
( schema & BLIS_PACK_PANEL_BIT );
}
BLIS_INLINE bool bli_is_4mi_packed( pack_t schema )
{
return ( bool )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_4MI );
}
BLIS_INLINE bool bli_is_3mi_packed( pack_t schema )
{
return ( bool )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3MI );
}
BLIS_INLINE bool bli_is_3ms_packed( pack_t schema )
{
return ( bool )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_3MS );
}
BLIS_INLINE bool bli_is_ro_packed( pack_t schema )
{
return ( bool )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RO );
}
BLIS_INLINE bool bli_is_io_packed( pack_t schema )
{
return ( bool )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_IO );
}
BLIS_INLINE bool bli_is_rpi_packed( pack_t schema )
{
return ( bool )
( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_RPI );
}
BLIS_INLINE bool bli_is_rih_packed( pack_t schema )
{
return ( bool )
( bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) );
}
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
return ( bool )
@@ -1082,20 +1038,6 @@ BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
}
// pointer-related
// Increment a pointer by an integer fraction:
// p0 + (num/dem)
// where p0 is a pointer to a datatype of size sizeof_p0.
BLIS_INLINE void_fp bli_ptr_inc_by_frac( void_fp p0, siz_t sizeof_p0, dim_t num, dim_t den )
{
return ( void_fp )
( ( char* )p0 + ( ( num * ( dim_t )sizeof_p0 ) / den ) );
}
// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.

View File

@@ -206,37 +206,6 @@
#include "bli_set0bbs_mxn.h"
// -- 3m-specific scalar macros --
#include "bli_copyri3s.h"
#include "bli_copyjri3s.h"
#include "bli_scal2ri3s.h"
#include "bli_scal2jri3s.h"
#include "bli_scal2ri3s_mxn.h"
// -- 4mh/3mh-specific scalar macros --
// ro
#include "bli_scal2ros.h"
#include "bli_scal2jros.h"
// io
#include "bli_scal2ios.h"
#include "bli_scal2jios.h"
// rpi
#include "bli_scal2rpis.h"
#include "bli_scal2jrpis.h"
#include "bli_scal2rihs_mxn.h"
#include "bli_scal2rihs_mxn_diag.h"
#include "bli_scal2rihs_mxn_uplo.h"
#include "bli_setrihs_mxn_diag.h"
// -- 1m-specific scalar macros --
// 1e

View File

@@ -258,24 +258,10 @@ typedef void (*free_ft) ( void* p );
- 1 0000 01: packed by columns
- 1 0000 10: packed by row panels
- 1 0000 11: packed by column panels
- 1 0001 10: packed by 4m interleaved row panels
- 1 0001 11: packed by 4m interleaved column panels
- 1 0010 10: packed by 3m interleaved row panels
- 1 0010 11: packed by 3m interleaved column panels
- 1 0011 10: packed by 4m separated row panels (not used)
- 1 0011 11: packed by 4m separated column panels (not used)
- 1 0100 10: packed by 3m separated row panels
- 1 0100 11: packed by 3m separated column panels
- 1 0101 10: packed real-only row panels
- 1 0101 11: packed real-only column panels
- 1 0110 10: packed imag-only row panels
- 1 0110 11: packed imag-only column panels
- 1 0111 10: packed real+imag row panels
- 1 0111 11: packed real+imag column panels
- 1 1000 10: packed by 1m expanded row panels
- 1 1000 11: packed by 1m expanded column panels
- 1 1001 10: packed by 1m reordered row panels
- 1 1001 11: packed by 1m reordered column panels
- 1 0001 10: packed by 1m expanded row panels
- 1 0001 11: packed by 1m expanded column panels
- 1 0010 10: packed by 1m reordered row panels
- 1 0010 11: packed by 1m reordered column panels
23 Packed panel order if upper-stored
- 0 == forward order if upper
- 1 == reverse order if upper
@@ -413,34 +399,13 @@ typedef void (*free_ft) ( void* p );
#define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED 0x0
#define BLIS_BITVAL_4MI ( 0x1 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_3MI ( 0x2 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_4MS ( 0x3 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_3MS ( 0x4 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT )
#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT )
#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
@@ -553,20 +518,6 @@ typedef enum
BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS,
BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS,
BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS,
BLIS_PACKED_ROW_PANELS_4MI = BLIS_BITVAL_PACKED_ROW_PANELS_4MI,
BLIS_PACKED_COL_PANELS_4MI = BLIS_BITVAL_PACKED_COL_PANELS_4MI,
BLIS_PACKED_ROW_PANELS_3MI = BLIS_BITVAL_PACKED_ROW_PANELS_3MI,
BLIS_PACKED_COL_PANELS_3MI = BLIS_BITVAL_PACKED_COL_PANELS_3MI,
BLIS_PACKED_ROW_PANELS_4MS = BLIS_BITVAL_PACKED_ROW_PANELS_4MS,
BLIS_PACKED_COL_PANELS_4MS = BLIS_BITVAL_PACKED_COL_PANELS_4MS,
BLIS_PACKED_ROW_PANELS_3MS = BLIS_BITVAL_PACKED_ROW_PANELS_3MS,
BLIS_PACKED_COL_PANELS_3MS = BLIS_BITVAL_PACKED_COL_PANELS_3MS,
BLIS_PACKED_ROW_PANELS_RO = BLIS_BITVAL_PACKED_ROW_PANELS_RO,
BLIS_PACKED_COL_PANELS_RO = BLIS_BITVAL_PACKED_COL_PANELS_RO,
BLIS_PACKED_ROW_PANELS_IO = BLIS_BITVAL_PACKED_ROW_PANELS_IO,
BLIS_PACKED_COL_PANELS_IO = BLIS_BITVAL_PACKED_COL_PANELS_IO,
BLIS_PACKED_ROW_PANELS_RPI = BLIS_BITVAL_PACKED_ROW_PANELS_RPI,
BLIS_PACKED_COL_PANELS_RPI = BLIS_BITVAL_PACKED_COL_PANELS_RPI,
BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E,
BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
@@ -574,10 +525,8 @@ typedef enum
} pack_t;
// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. We also count the
// schema pair for "4ms" (4m separated), because its bit value has
// been reserved, even though we don't use it.
#define BLIS_NUM_PACK_SCHEMA_TYPES 10
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3
// -- Pack order type --
@@ -670,12 +619,7 @@ typedef enum
typedef enum
{
BLIS_3MH = 0,
BLIS_3M1,
BLIS_4MH,
BLIS_4M1B,
BLIS_4M1A,
BLIS_1M,
BLIS_1M = 0,
BLIS_NAT,
BLIS_IND_FIRST = 0,
BLIS_IND_LAST = BLIS_NAT
@@ -683,13 +627,8 @@ typedef enum
#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)
// These are used in bli_*_oapi.c to construct the ind_t values from
// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_3mh BLIS_3MH
#define bli_3m1 BLIS_3M1
#define bli_4mh BLIS_4MH
#define bli_4mb BLIS_4M1B
#define bli_4m1 BLIS_4M1A
#define bli_1m BLIS_1M
#define bli_nat BLIS_NAT
@@ -1255,9 +1194,6 @@ typedef struct
inc_t ps_a;
inc_t ps_b;
// The type to convert to on output.
//num_t dt_on_output;
} auxinfo_t;
@@ -1580,9 +1516,6 @@ typedef struct cntx_s
func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ];
ind_t method;
pack_t schema_a_block;
pack_t schema_b_panel;
pack_t schema_c_panel;
} cntx_t;

Some files were not shown because too many files have changed in this diff Show More