Replaced the broken ref99 sandbox with a simpler version.

Details:
- The 'ref99' sandbox was broken by multiple refactorings and internal
  API changes over the last two years. Rather than try to fix it, I've
  replaced it with a much simpler version based on var2 of gemmsup.
  Why not fix the previous implementation? It occurred to me that the
  old implementation was trying to be a lightly simplified duplication
  of what exists in the framework. Duplication aside, this sandbox
  would have worked fine if it had been completely independent of the
  framework code. The problem was that it was only partially
  independent, with many function calls calling a function in BLIS
  rather than a duplicated/simplified version within the sandbox. (And
  the reason I didn't make it fully independent to begin with was that
  it seemed unnecessarily duplicative at the time.) Maintaining two
  versions of the same implementation is problematic for obvious
  reasons, especially when it wasn't even done properly to begin with.
  This explains the reimplementation in this commit. The only catch is
  that the newer implementation is single-threaded only and does not
  perform any packing on either input matrix (A or B). Basically, it's
  only meant to be a simple placeholder that shows how you could plug
  in your own implementation. Thanks to Francisco Igual for reporting
  this brokenness.
- Updated the three reference gemmsup kernels (defined in
  ref_kernels/3/bli_gemmsup_ref.c) so that they properly handle
  conjugation of conja and/or conjb. The general storage kernel, which
  is currently identical to the column-storage kernel, is used in the
  new ref99 sandbox to provide basic support for all datatypes
  (including scomplex and dcomplex).
- Minor updates to docs/Sandboxes.md, including adding the threading
  and packing limitations to the Caveats section.
- Fixed a comment typo in bli_l3_sup_var1n2m.c (upon which the new
  sandbox implementation is based).
This commit is contained in:
Field G. Van Zee
2020-07-20 19:21:07 -05:00
committed by Dipal M Zambare
parent 004946ed06
commit 1d8d5cd9cf
33 changed files with 948 additions and 110 deletions

View File

@@ -52,10 +52,8 @@ configure: sandbox/ref99
And when you build BLIS, the last files to be compiled will be the source
code in the specified sandbox:
```
Compiling obj/haswell/sandbox/ref99/blx_gemm_front.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/blx_gemm_int.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/base/blx_blksz.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/cntl/blx_gemm_cntl.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/blx_gemm_ref_var2.o ('haswell' CFLAGS for sandboxes)
Compiling obj/haswell/sandbox/ref99/oapi/bli_gemmnat.o ('haswell' CFLAGS for sandboxes)
...
```
That's it! After the BLIS library is built, it will contain your chosen
@@ -197,6 +195,12 @@ there's no way for it to confirm at runtime that an implementation was written
to support mixing datatypes. Note that even the `ref99` sandbox included with
BLIS does not support mixed-datatype computation.
* **Multithreading in ref99.** The current reference sandbox, `ref99`, does not
currently implement multithreading.
* **Packing matrices in ref99.** The current reference sandbox, `ref99`, does not
currently implement packing of matrices A or B.
## Conclusion
If you encounter any problems, or are really bummed-out that `gemm` is the

View File

@@ -1144,7 +1144,7 @@ void PASTEMAC(ch,varname) \
thread_pb \
); \
\
/* Alias a_use so that it's clear this is our current block of
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\

View File

@@ -60,43 +60,178 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
for ( dim_t j = 0; j < n; ++j ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by rows. */ \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict ci = &c[ i*rs_c ]; \
ctype* restrict ai = &a[ i*rs_a ]; \
\
for ( dim_t j = 0; j < n; ++j ) \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
ctype* restrict cij = &ci[ j*cs_c ]; \
ctype* restrict bj = &b [ j*cs_b ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
@@ -130,43 +265,178 @@ void PASTEMAC3(ch,opname,arch,suf) \
/* NOTE: This microkernel can actually handle arbitrarily large
values of m, n, and k. */ \
\
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \
else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
} \
} \
else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \
} \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
else \
} \
} \
else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \
{ \
/* Traverse c by columns. */ \
for ( dim_t j = 0; j < n; ++j ) \
{ \
ctype* restrict cj = &c[ j*cs_c ]; \
ctype* restrict bj = &b[ j*cs_b ]; \
\
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
ctype* restrict cij = &cj[ i*rs_c ]; \
ctype* restrict ai = &a [ i*rs_a ]; \
ctype ab; \
\
PASTEMAC(ch,set0s)( ab ); \
\
/* Perform a dot product to update the (i,j) element of c. */ \
for ( dim_t l = 0; l < k; ++l ) \
{ \
ctype* restrict aij = &ai[ l*cs_a ]; \
ctype* restrict bij = &bj[ l*rs_b ]; \
\
PASTEMAC(ch,dots)( *aij, *bij, ab ); \
} \
\
/* Conjugate the result to simulate conj(a^T) * conj(b). */ \
PASTEMAC(ch,conjs)( ab ); \
\
/* If beta is one, add ab into c. If beta is zero, overwrite c
with the result in ab. Otherwise, scale by beta and accumulate
ab to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \
} \
else \
{ \
PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \
} \
} \
} \
} \

View File

@@ -56,14 +56,19 @@ void bli_gemmnat
{
bli_init_once();
// Obtain a valid native context from the gks if necessary.
// Obtain a valid (native) context from the gks if necessary.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
// Initialize a local runtime object if necessary.
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_l;
if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); }
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
else { rntm_l = *rntm; rntm = &rntm_l; }
// Invoke the operation's front end.
blx_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
//blx_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
blx_gemm_ref_var2( BLIS_NO_TRANSPOSE,
alpha, a, b, beta, c,
BLIS_XXX, cntx, rntm, NULL );
}

View File

@@ -39,7 +39,7 @@
// we #include any headers that would define prototypes or types that are
// needed by the ref99 sandbox source code.
#include "blx_gemm.h"
#include "blx_gemm_ref_var2.h"
#endif

View File

@@ -0,0 +1,361 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "blix.h"
#define FUNCPTR_T gemmsup_fp
typedef void (*FUNCPTR_T)
(
bool_t packa,
bool_t packb,
conj_t conja,
conj_t conjb,
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
stor3_t eff_id,
cntx_t* restrict cntx,
rntm_t* restrict rntm,
thrinfo_t* restrict thread
);
//
// -- var2 ---------------------------------------------------------------------
//
static FUNCPTR_T GENARRAY(ftypes_var2,gemm_ref_var2);
void blx_gemm_ref_var2
(
trans_t trans,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
stor3_t eff_id,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
)
{
const num_t dt = bli_obj_dt( c );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
dim_t k;
void* restrict buf_a = bli_obj_buffer_at_off( a );
inc_t rs_a;
inc_t cs_a;
void* restrict buf_b = bli_obj_buffer_at_off( b );
inc_t rs_b;
inc_t cs_b;
if ( bli_obj_has_notrans( a ) )
{
k = bli_obj_width( a );
rs_a = bli_obj_row_stride( a );
cs_a = bli_obj_col_stride( a );
}
else // if ( bli_obj_has_trans( a ) )
{
// Assign the variables with an implicit transposition.
k = bli_obj_length( a );
rs_a = bli_obj_col_stride( a );
cs_a = bli_obj_row_stride( a );
}
if ( bli_obj_has_notrans( b ) )
{
rs_b = bli_obj_row_stride( b );
cs_b = bli_obj_col_stride( b );
}
else // if ( bli_obj_has_trans( b ) )
{
// Assign the variables with an implicit transposition.
rs_b = bli_obj_col_stride( b );
cs_b = bli_obj_row_stride( b );
}
void* restrict buf_c = bli_obj_buffer_at_off( c );
const inc_t rs_c = bli_obj_row_stride( c );
const inc_t cs_c = bli_obj_col_stride( c );
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
// Index into the type combination array to extract the correct
// function pointer.
FUNCPTR_T f = ftypes_var2[dt];
if ( bli_is_notrans( trans ) )
{
// Invoke the function.
f
(
packa,
packb,
conja,
conjb,
m,
n,
k,
buf_alpha,
buf_a, rs_a, cs_a,
buf_b, rs_b, cs_b,
buf_beta,
buf_c, rs_c, cs_c,
eff_id,
cntx,
rntm,
thread
);
}
else
{
bli_abort();
}
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
bool_t packa, \
bool_t packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, scale by beta and return. */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
/* Query the context for various blocksizes. NOTE: We query the
regular blocksizes since the sup blocksizes are not guaranteed
to have default values. */ \
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
const inc_t jrstep_b = cs_b * NR; \
\
const inc_t irstep_c = rs_c * MR; \
const inc_t irstep_a = rs_a * MR; \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of beta and one scalars to prevent any unnecessary
sharing of cache lines between the cores' caches. */ \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
\
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n + NC - 1 ) / NC;*/ \
const dim_t jc_left = n % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
for ( dim_t jj = 0; jj < n; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= n - jj ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k + KC - 1 ) / KC;*/ \
const dim_t pc_left = k % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < k; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= k - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m + MC - 1 ) / MC;*/ \
const dim_t ic_left = m % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = 0; ii < m; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= m - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = 0; j < jr_iter; j += 1 ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc + j * jrstep_b; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Compute number of primary and leftover components of the IR loop. */ \
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
const dim_t ir_left = mc_cur % MR; \
\
/* Loop over the m dimension (MR columns at a time). */ \
for ( dim_t i = 0; i < ir_iter; i += 1 ) \
{ \
const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict a_ir = a_ic + i * irstep_a; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
/*
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
*/ \
\
/* Invoke the kernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
mr_cur, \
nr_cur, \
kc_cur, \
alpha_cast, \
a_ir, rs_a, cs_a, \
b_jr, rs_b, cs_b, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ref_var2 )

View File

@@ -0,0 +1,73 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
void blx_gemm_ref_var2
(
trans_t trans,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
stor3_t eff_id,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
bool_t packa, \
bool_t packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( gemm_ref_var2 )

View File

@@ -38,21 +38,23 @@
cntl_t* blx_gemm_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
)
{
return blx_gemmbp_cntl_create( family, schema_a, schema_b );
return blx_gemmbp_cntl_create( rntm, family, schema_a, schema_b );
}
// -----------------------------------------------------------------------------
cntl_t* blx_gemmbp_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
)
{
void_fp macro_kernel_fp;
@@ -67,6 +69,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node
(
rntm, // the thread's runtime structure
family, // the operation family
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
@@ -75,6 +78,7 @@ cntl_t* blx_gemmbp_cntl_create
cntl_t* gemm_cntl_bp_bu = blx_gemm_cntl_create_node
(
rntm, // the thread's runtime structure
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_fp,
@@ -84,6 +88,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for packing matrix A.
cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node
(
rntm,
blx_gemm_packa, // pack the left-hand operand
packa_fp,
BLIS_MR,
@@ -99,6 +104,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for partitioning the m dimension by MC.
cntl_t* gemm_cntl_op_bp = blx_gemm_cntl_create_node
(
rntm,
family,
BLIS_MC,
blx_gemm_blk_var1,
@@ -108,6 +114,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for packing matrix B.
cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node
(
rntm,
blx_gemm_packb, // pack the right-hand operand
packb_fp,
BLIS_KR,
@@ -123,6 +130,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = blx_gemm_cntl_create_node
(
rntm,
family,
BLIS_KC,
blx_gemm_blk_var3,
@@ -132,6 +140,7 @@ cntl_t* blx_gemmbp_cntl_create
// Create a node for partitioning the n dimension by NC.
cntl_t* gemm_cntl_vl_mm = blx_gemm_cntl_create_node
(
rntm,
family,
BLIS_NC,
blx_gemm_blk_var2,
@@ -145,23 +154,25 @@ cntl_t* blx_gemmbp_cntl_create
void blx_gemm_cntl_free
(
cntl_t* cntl,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bli_cntl_free( cntl, thread );
bli_cntl_free( rntm, cntl, thread );
}
// -----------------------------------------------------------------------------
cntl_t* blx_gemm_cntl_create_node
(
rntm_t* rntm,
opid_t family,
bszid_t bszid,
void_fp var_func,
cntl_t* sub_node
)
{
return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node );
return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node );
}

View File

@@ -34,25 +34,28 @@
cntl_t* blx_gemm_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
);
// -----------------------------------------------------------------------------
cntl_t* blx_gemmbp_cntl_create
(
opid_t family,
pack_t schema_a,
pack_t schema_b
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b
);
// -----------------------------------------------------------------------------
void blx_gemm_cntl_free
(
cntl_t* cntl,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
@@ -60,6 +63,7 @@ void blx_gemm_cntl_free
cntl_t* blx_gemm_cntl_create_node
(
rntm_t* rntm,
opid_t family,
bszid_t bszid,
void_fp var_func,

View File

@@ -39,30 +39,16 @@
void blx_l3_cntl_create_if
(
opid_t family,
pack_t schema_a,
pack_t schema_b,
obj_t* a,
obj_t* b,
obj_t* c,
rntm_t* rntm,
cntl_t* cntl_orig,
cntl_t** cntl_use
)
{
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects. Notice that we do this even if the
// caller passed in a custom control tree; that's because we still need
// to reset the pack schema of a and b, which were modified by the
// operation's _front() function. However, in order for this to work,
// the level-3 thread entry function (or omp parallel region) must
// alias thread-local copies of objects a and b.
pack_t schema_a = bli_obj_pack_schema( a );
pack_t schema_b = bli_obj_pack_schema( b );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// If the control tree pointer is NULL, we construct a default
// tree as a function of the operation family.
if ( cntl_orig == NULL )
@@ -74,7 +60,7 @@ void blx_l3_cntl_create_if
// If the user provided a control tree, create a copy and use it
// instead (so that threads can use its local tree as a place to
// cache things like pack mem_t entries).
*cntl_use = bli_cntl_copy( cntl_orig );
*cntl_use = bli_cntl_copy( rntm, cntl_orig );
// Recursively set the family fields of the newly copied control tree
// nodes.
@@ -82,13 +68,10 @@ void blx_l3_cntl_create_if
}
}
void blx_l3_cntl_free_if
void blx_l3_cntl_free
(
obj_t* a,
obj_t* b,
obj_t* c,
cntl_t* cntl_orig,
cntl_t* cntl_use,
rntm_t rntm,
cntl_t* cntl_use,
thrinfo_t* thread
)
{
@@ -96,13 +79,13 @@ void blx_l3_cntl_free_if
// been created, so we now must free it.
if ( cntl_orig == NULL )
{
blx_gemm_cntl_free( cntl_use, thread );
blx_gemm_cntl_free( rntm, cntl_use, thread );
}
else
{
// If the user provided a control tree, free the copy of it that
// was created.
bli_cntl_free( cntl_use, thread );
bli_cntl_free( rntm, cntl_use );
}
}

View File

@@ -35,20 +35,19 @@
void blx_l3_cntl_create_if
(
opid_t family,
pack_t schema_a,
pack_t schema_b,
obj_t* a,
obj_t* b,
obj_t* c,
rntm_t* rntm,
cntl_t* cntl_orig,
cntl_t** cntl_use
);
void blx_l3_cntl_free_if
void blx_l3_cntl_free
(
obj_t* a,
obj_t* b,
obj_t* c,
cntl_t* cntl_orig,
cntl_t* cntl_use,
rntm_t rntm,
cntl_t* cntl_use,
thrinfo_t* thread
);

View File

@@ -36,6 +36,7 @@
cntl_t* blx_packm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
void_fp packm_var_func,
bszid_t bmid_m,

View File

@@ -34,6 +34,7 @@
cntl_t* blx_packm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
void_fp packm_var_func,
bszid_t bmid_m,

View File

@@ -38,6 +38,7 @@
// This code is enabled only when multithreading is enabled via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
#if 0
void blx_gemm_thread
(
gemmint_t func,
@@ -101,6 +102,129 @@ void blx_gemm_thread
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
}
#endif
// Sandbox-local OpenMP thread decorator for gemm. Spawns a parallel region
// with the number of threads requested via the rntm_t and invokes func --
// the internal gemm variant -- once from every thread. Each thread gets
// thread-local aliases of A, B, and C, a private copy of the rntm_t, a
// (possibly default) control tree, and a root thrinfo_t node.
//
// Parameters:
//   func   - the internal gemm variant to execute (gemmint_t).
//   family - the level-3 operation family id.
//   alpha, a, b, beta, c - operands of the gemm operation.
//   cntx   - the context to pass through to func.
//   rntm   - runtime object; queried for the thread count and updated with
//            the sba pool and packing block allocator before being copied
//            per-thread.
//   cntl   - optional caller-supplied control tree; if NULL, a default tree
//            is created per-thread (and freed afterwards).
void blx_gemm_thread
(
gemmint_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects.
pack_t schema_a = bli_obj_pack_schema( a );
pack_t schema_b = bli_obj_pack_schema( b );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_membrk_rntm_set_membrk( rntm );
// Allocate a global communicator for the root thrinfo_t structures.
// This must happen before the parallel region so all threads share it.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// (Currently disabled in this sandbox.)
//bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
obj_t a_t, b_t, c_t;
cntl_t* cntl_use;
thrinfo_t* thread;
// Alias thread-local copies of A, B, and C. These will be the objects
// we pass down the algorithmic function stack. Making thread-local
// aliases is highly recommended in case a thread needs to change any
// of the properties of an object without affecting other threads'
// objects.
bli_obj_alias_to( a, &a_t );
bli_obj_alias_to( b, &b_t );
bli_obj_alias_to( c, &c_t );
// Create a default control tree for the operation, if needed (i.e. if
// the caller passed cntl == NULL); otherwise a copy of cntl is used.
blx_l3_cntl_create_if( family, schema_a, schema_b,
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
blx_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
// Execute the internal gemm variant from this thread.
func
(
alpha,
&a_t,
&b_t,
beta,
&c_t,
cntx,
rntm_p,
cntl_use,
thread
);
// Free the thread's local control tree.
blx_l3_cntl_free( rntm_p, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
}
#endif

View File

@@ -35,8 +35,10 @@
// gemm internal function type
typedef void (*gemmint_t)
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,