BLAS Extension API - ?gemm_compute()

- Added support for 2 new APIs:
	1. sgemm_compute()
	2. dgemm_compute()
  These are dependent on the ?gemm_pack_get_size() and ?gemm_pack()
  APIs.
- ?gemm_compute() takes the packed matrix buffer (represented by the
  packed matrix identifier) and performs the GEMM operation:
  C := A * B + beta * C.
- Whenever the kernel storage preference does not match the matrix
  storage scheme, and the matrix being loaded has not been pre-packed
  either, on-the-go packing is enabled so that the matrix gets packed
  during the compute call.
- Note: If both the matrices are packed using the ?gemm_pack() API,
  it is the responsibility of the user to pack only one matrix with
  alpha scalar and the other with a unit scalar.
- Note: Support is presently limited to single-threaded execution only.
  Both the pack and compute APIs are forced to use n_threads=1.

AMD-Internal: [CPUPL-3560]
Change-Id: I825d98a0a5038d31668d2a4b84b3ccc204e6c158
This commit is contained in:
Arnav Sharma
2023-07-17 12:44:42 +05:30
committed by Arnav Sharma
parent 81161066e5
commit c8f14edcf5
32 changed files with 3623 additions and 20 deletions

View File

@@ -6,7 +6,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -193,7 +193,8 @@ blis: \
bench_amaxv_blis.x \
bench_copyv_blis.x \
bench_swapv_blis.x \
bench_axpbyv_blis.x
bench_axpbyv_blis.x \
bench_gemm_pack_compute_blis.x
openblas: \
bench_gemm_openblas.x \
@@ -240,7 +241,8 @@ mkl: \
bench_amaxv_mkl.x \
bench_copyv_mkl.x \
bench_swapv_mkl.x \
bench_axpbyv_mkl.x
bench_axpbyv_mkl.x \
bench_gemm_pack_compute_mkl.x
# --Object file rules --

930
bench/bench_gemm_pack_compute.c Executable file
View File

@@ -0,0 +1,930 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// Benchmark application to process aocl logs generated by BLIS library.
#ifndef DT
#define DT BLIS_DOUBLE
#endif
#ifndef IND
#define IND BLIS_NAT
#endif
#ifndef N_REPEAT
//#define N_REPEAT 100
#endif
#define AOCL_MATRIX_INITIALISATION
#define BUFFER_SIZE 256
/* For BLIS since logs are collected at BLAS interfaces
* we disable cblas interfaces for this benchmark application
*/
#ifdef BLIS_ENABLE_CBLAS
// #define CBLAS
#endif
// #define PRINT
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta, alpha_one;
dim_t m, n, k;
dim_t p_inc = 0; // to keep track of number of inputs
num_t dt;
// ind_t ind;
char dt_ch;
int r, n_repeats;
trans_t transa;
trans_t transb;
double dtime;
double dtime_save;
double gflops;
int packA, packB;
FILE* fin = NULL;
FILE* fout = NULL;
n_repeats = N_REPEAT; // This macro will get from Makefile.
dt = DT;
if (argc < 3)
{
printf("Usage: ./test_gemm_pack_compute_XX.x input.csv output.csv\n");
exit(1);
}
fin = fopen(argv[1], "r");
if (fin == NULL)
{
printf("Error opening the file %s\n", argv[1]);
exit(1);
}
fout = fopen(argv[2], "w");
if (fout == NULL)
{
printf("Error opening output file %s\n", argv[2]);
exit(1);
}
if (argc > 3)
{
n_repeats = atoi(argv[3]);
}
fprintf(fout, "Dt transa transb identifier m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n");
// Following variables are needed for scanf to read inputs properly
// however they are not used in bench.
char api_name[BUFFER_SIZE]; // to store function name, line no present in logs
char dummy_buffer[BUFFER_SIZE];
// Variables extracted from the logs which are used by bench
char stor_scheme, transA_c, transB_c, packA_c, packB_c;
double alpha_r, beta_r, alpha_i, beta_i;
dim_t m_trans, n_trans;
inc_t lda, ldb, ldc;
stor_scheme = 'C'; // By default set it to Column Major
//{S, D, C, Z} transa, transb, packA, packB, m, n, k, alpha_real,
// alpha_imag, lda ldb, beta_real, beta_imag, ldc,
//
// number of threads, execution time, gflops ---> ignored by bench
while (fscanf(fin, "%s %c %c %c %c %c " INT_FS INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]",
api_name, &dt_ch, &transA_c, &transB_c, &packA_c, &packB_c, &m, &n, &k, &alpha_r, &alpha_i,
&lda, &ldb, &beta_r, &beta_i, &ldc) == 16)
{
// Discard any extra data on current line in the input file.
fgets(dummy_buffer, BUFFER_SIZE, fin );
// At BLAS level only column major order is supported.
stor_scheme = 'C';
if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT;
else
{
printf("Invalid data type %c\n", dt_ch);
continue;
}
if ( transA_c == 'n' || transA_c == 'N' ) transa = BLIS_NO_TRANSPOSE;
else if ( transA_c == 't' || transA_c == 'T' ) transa = BLIS_TRANSPOSE;
else if ( transA_c == 'c' || transA_c == 'C' ) transa = BLIS_CONJ_TRANSPOSE;
else
{
printf("Invalid option for transA \n");
continue;
}
if ( transB_c == 'n' || transB_c == 'N' ) transb = BLIS_NO_TRANSPOSE;
else if ( transB_c == 't' || transB_c == 'T' ) transb = BLIS_TRANSPOSE;
else if ( transB_c == 'c' || transB_c == 'C' ) transb = BLIS_CONJ_TRANSPOSE;
else
{
printf("Invalid option for transB \n");
continue;
}
if ( packA_c == 'p' || packA_c == 'P' ) packA = TRUE;
else if ( packA_c == 'u' || packA_c == 'U' ) packA = FALSE;
else
{
printf("Invalid option for packA \n");
continue;
}
if ( packB_c == 'p' || packB_c == 'P') packB = TRUE;
else if ( packB_c == 'u' || packB_c == 'U') packB = FALSE;
else
{
printf("Invalid option for packB \n");
continue;
}
bli_obj_create( dt, 1, 1, 0, 0, &alpha);
bli_obj_create( dt, 1, 1, 0, 0, &beta );
bli_obj_create( dt, 1, 1, 0, 0, &alpha_one);
if( (stor_scheme == 'C') || (stor_scheme == 'c') )
{
// leading dimension should be greater than number of rows
// if ((m > lda) || (k > ldb) || (m > ldc)) continue;
// Since this bench app is run on logs generated by AOCL trace logs
// - we have relaxed the checks on the input parameters.
// if A is transpose - A(lda x m), lda >= max(1,k)
// if A is non-transpose - A (lda x k), lda >= max(1,m)
// if B is transpose - B (ldb x k), ldb >= max(1,n)
// if B is non-transpose - B (ldb x n), ldb >= max(1,k)
// C is ldc x n - ldc >= max(1, m)
//if(transa) lda = k; // We will end up overwriting lda
bli_set_dims_with_trans( transa, m, k, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, 1, lda, &a);
//if(transb) ldb = n; // we will end up overwriting ldb, ldb >= n
bli_set_dims_with_trans( transb, k, n, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, 1, ldb, &b);
bli_obj_create( dt, m, n, 1, ldc, &c);
bli_obj_create( dt, m, n, 1, ldc, &c_save );
}
else if( (stor_scheme == 'r') || (stor_scheme == 'R') )
{
//leading dimension should be greater than number of columns
//if ((k > lda) || (n > ldb) || (n > ldc)) continue;
// Since this bench app is run on logs generated by AOCL trace logs
// - we have relaxed the checks on the input parameters.
// if A is transpose - A(k x lda), lda >= max(1,m)
// if A is non-transpose - A (m x lda), lda >= max(1,k)
// if B is transpose - B (n x ldb), ldb >= max(1,k)
// if B is non-transpose - B (k x ldb ), ldb >= max(1,n)
// C is m x ldc - ldc >= max(1, n)
//if(transa) lda = m; // this will overwrite lda
bli_set_dims_with_trans(transa, m, k, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, lda, 1, &a);
//if(transb) ldb = k; // this will overwrite ldb
bli_set_dims_with_trans(transb, k, n, &m_trans, &n_trans);
bli_obj_create( dt, m_trans, n_trans, ldb, 1, &b);
bli_obj_create( dt, m, n, ldc, 1, &c);
bli_obj_create( dt, m, n, ldc, 1, &c_save );
}
else
{
printf("Invalid storage scheme\n");
continue;
}
#ifndef BLIS // Incase if we are using blis interface we don't have to check for col-storage.
#ifndef CBLAS
if( ( stor_scheme == 'R' ) || ( stor_scheme == 'r' ) )
{
printf("BLAS APIs doesn't support row-storage: Enable CBLAS\n");
continue;
}
#endif
#endif
#ifdef AOCL_MATRIX_INITIALISATION
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
#endif
bli_copym( &c, &c_save );
bli_obj_set_conjtrans( transa, &a);
bli_obj_set_conjtrans( transb, &b);
bli_setsc( 1.0, 1.0, &alpha_one );
bli_setsc( alpha_r, alpha_i, &alpha );
bli_setsc( beta_r, beta_i, &beta );
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &c_save, &c );
#ifdef PRINT
bli_printm( "a", &a, "%4.6f", "" );
bli_printm( "b", &b, "%4.6f", "" );
bli_printm( "c", &c, "%4.6f", "" );
#endif
dtime = bli_clock();
#ifdef BLIS
printf( "BLAS Extension APIs don't have a BLIS interface."
"Enable CBLAS or BLAS interface!\n" );
#else
#ifdef CBLAS
enum CBLAS_ORDER cblas_order;
enum CBLAS_TRANSPOSE cblas_transa;
enum CBLAS_TRANSPOSE cblas_transb;
enum CBLAS_IDENTIFIER cblas_identifierA;
enum CBLAS_IDENTIFIER cblas_identifierB;
size_t bufSizeA;
size_t bufSizeB;
if ( ( stor_scheme == 'C' ) || ( stor_scheme == 'c' ) )
cblas_order = CblasColMajor;
else
cblas_order = CblasRowMajor;
if( bli_is_trans( transa ) )
cblas_transa = CblasTrans;
else if( bli_is_conjtrans( transa ) )
cblas_transa = CblasConjTrans;
else
cblas_transa = CblasNoTrans;
if( bli_is_trans( transb ) )
cblas_transb = CblasTrans;
else if( bli_is_conjtrans( transb ) )
cblas_transb = CblasConjTrans;
else
cblas_transb = CblasNoTrans;
if ( packA )
cblas_identifierA = CblasAMatrix;
if ( packB )
cblas_identifierB = CblasBMatrix;
#else
f77_char f77_transa;
f77_char f77_transb;
f77_char f77_identifierA;
f77_char f77_identifierB;
f77_int f77_bufSizeA;
f77_int f77_bufSizeB;
f77_char f77_packed = 'P';
f77_identifierA = 'A';
f77_identifierB = 'B';
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
err_t err = BLIS_SUCCESS;
#endif
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
float* alphaonep = bli_obj_buffer( &alpha_one );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
#ifdef CBLAS
float* aBuffer;
float* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
mm,
nn,
kk );
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
cblas_sgemm_pack( cblas_order,
CblasAMatrix,
cblas_transa,
mm,
nn,
kk,
*alphap,
ap, lda,
aBuffer );
cblas_sgemm_compute( cblas_order,
CblasPacked,
cblas_transb,
mm,
nn,
kk,
aBuffer, lda,
bp, ldb,
*betap,
cp, ldc );
bli_free_user(aBuffer);
}
else if ( !packA && packB )
{
// Only B is pre-packed.
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
mm,
nn,
kk );
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
cblas_sgemm_pack( cblas_order,
CblasBMatrix,
cblas_transb,
mm,
nn,
kk,
*alphap,
bp, ldb,
bBuffer );
cblas_sgemm_compute( cblas_order,
cblas_transa,
CblasPacked,
mm,
nn,
kk,
ap, lda,
bBuffer, ldb,
*betap,
cp, ldc );
bli_free_user(bBuffer);
}
else if ( packA && packB )
{
// Both A & B are pre-packed.
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
mm,
nn,
kk );
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
mm,
nn,
kk );
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
cblas_sgemm_pack( cblas_order,
CblasAMatrix,
cblas_transa,
mm,
nn,
kk,
*alphap,
ap, lda,
aBuffer );
cblas_sgemm_pack( cblas_order,
CblasBMatrix,
cblas_transb,
mm,
nn,
kk,
*alphaonep,
bp, ldb,
bBuffer );
cblas_sgemm_compute( cblas_order,
CblasPacked,
CblasPacked,
mm,
nn,
kk,
aBuffer, lda,
bBuffer, ldb,
*betap,
cp, ldc );
bli_free_user(aBuffer);
bli_free_user(bBuffer);
}
else
{
// Neither A nor B is pre-packed.
cblas_sgemm_compute( cblas_order,
cblas_transa,
cblas_transb,
mm,
nn,
kk,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
}
#else // -- BLAS API --
float* aBuffer;
float* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
sgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
sgemm_compute_( &f77_packed,
&f77_transb,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
bli_free_user( aBuffer );
}
else if ( !packA && packB )
{
// Only B is pre-packed.
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
sgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
bp,
(f77_int*)&ldb,
bBuffer );
sgemm_compute_( &f77_transa,
&f77_packed,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
bli_free_user( bBuffer );
}
else if ( packA && packB )
{
// Both A & B are pre-packed.
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
sgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
sgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphaonep,
bp,
(f77_int*)&ldb,
bBuffer );
sgemm_compute_( &f77_packed,
&f77_packed,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
bli_free_user(aBuffer);
bli_free_user(bBuffer);
}
else
{
// Neither A nor B is reordered.
sgemm_compute_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
}
#endif
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
double* alphap = bli_obj_buffer( &alpha );
double* alphaonep = bli_obj_buffer( &alpha_one );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
#ifdef CBLAS
double* aBuffer;
double* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
mm,
nn,
kk );
aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
cblas_dgemm_pack( cblas_order,
CblasAMatrix,
cblas_transa,
mm,
nn,
kk,
*alphap,
ap, lda,
aBuffer );
cblas_dgemm_compute( cblas_order,
CblasPacked,
cblas_transb,
mm,
nn,
kk,
aBuffer, lda,
bp, ldb,
*betap,
cp, ldc );
bli_free_user(aBuffer);
}
else if ( !packA && packB )
{
// Only B is pre-packed.
bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
mm,
nn,
kk );
cblas_dgemm_pack( cblas_order,
CblasBMatrix,
cblas_transb,
mm,
nn,
kk,
*alphap,
bp, ldb,
bBuffer );
cblas_dgemm_compute( cblas_order,
cblas_transa,
CblasPacked,
mm,
nn,
kk,
ap, lda,
bBuffer, ldb,
*betap,
cp, ldc );
bli_free_user(bBuffer);
}
else if ( packA && packB )
{
// Both A & B are pre-packed.
bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
mm,
nn,
kk );
aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
mm,
nn,
kk );
bBuffer = (double*) bli_malloc_user( bufSizeB, &err );
cblas_dgemm_pack( cblas_order,
CblasAMatrix,
cblas_transa,
mm,
nn,
kk,
*alphap,
ap, lda,
aBuffer );
cblas_dgemm_pack( cblas_order,
CblasBMatrix,
cblas_transb,
mm,
nn,
kk,
*alphap,
bp, ldb,
bBuffer );
cblas_dgemm_compute( cblas_order,
CblasPacked,
CblasPacked,
mm,
nn,
kk,
aBuffer, lda,
bBuffer, ldb,
*betap,
cp, ldc );
bli_free_user(aBuffer);
bli_free_user(bBuffer);
}
else
{
// Neither A nor B is pre-packed.
cblas_dgemm_compute( cblas_order,
cblas_transa,
cblas_transb,
mm,
nn,
kk,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
}
#else // -- BLAS API --
double* aBuffer;
double* bBuffer;
if ( packA && !packB )
{
// Only A is pre-packed.
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
dgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
dgemm_compute_( &f77_packed,
&f77_transb,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
bli_free_user( aBuffer );
}
else if ( !packA && packB )
{
// Only B is pre-packed.
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
dgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
bp,
(f77_int*)&ldb,
bBuffer );
dgemm_compute_( &f77_transa,
&f77_packed,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
bli_free_user( bBuffer );
}
else if ( packA && packB )
{
// Both A & B are pre-packed.
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
&mm,
&nn,
&kk );
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
&mm,
&nn,
&kk );
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
dgemm_pack_( &f77_identifierA,
&f77_transa,
&mm,
&nn,
&kk,
alphap,
ap,
(f77_int*)&lda,
aBuffer );
dgemm_pack_( &f77_identifierB,
&f77_transb,
&mm,
&nn,
&kk,
alphaonep,
bp,
(f77_int*)&ldb,
bBuffer );
dgemm_compute_( &f77_packed,
&f77_packed,
&mm,
&nn,
&kk,
aBuffer, (f77_int*)&lda,
bBuffer, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
bli_free_user(aBuffer);
bli_free_user(bBuffer);
}
else
{
// Neither A nor B is reordered.
dgemm_compute_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
ap, (f77_int*)&lda,
bp, (f77_int*)&ldb,
betap,
cp, (f77_int*)&ldc );
}
#endif
}
#endif
#ifdef PRINT
bli_printm( "c compute", &c, "%4.6f", "" );
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
printf( "data_%cgemm_%s", dt_ch, BLAS );
p_inc++;
printf("( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
(unsigned long)(p_inc),
(unsigned long)m,
(unsigned long)n,
(unsigned long)k, gflops);
fprintf (fout, "%c %c %c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld %6.3f\n", \
dt_ch, transA_c, transB_c, packA_c, packB_c, m, n, k, alpha_r, alpha_i, lda, ldb, beta_r, beta_i, ldc, gflops);
fflush(fout);
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
fclose(fin);
fclose(fout);
return 0;
}

View File

@@ -0,0 +1,92 @@
sgemm_ S N N P U 1 1 1 1 0 1 1 1 0 1
sgemm_ S N N P U 2 2 2 1 0 2 2 1 0 2
sgemm_ S N N P U 3 3 3 1 0 3 3 1 0 3
sgemm_ S N N P U 4 4 4 1 0 4 4 1 0 4
sgemm_ S N N P U 5 5 5 1 0 5 5 1 0 5
sgemm_ S N N P U 6 6 6 1 0 6 6 1 0 6
sgemm_ S N N P U 7 7 7 1 0 7 7 1 0 7
sgemm_ S N N P U 8 8 8 1 0 8 8 1 0 8
sgemm_ S N N P U 9 9 9 1 0 9 9 1 0 9
sgemm_ S N N P U 10 10 10 1 0 10 10 1 0 10
sgemm_ S N N P U 20 20 20 1 0 20 20 1 0 20
sgemm_ S N N P U 30 30 30 1 0 30 30 1 0 30
sgemm_ S N N P U 40 40 40 1 0 40 40 1 0 40
sgemm_ S N N P U 50 50 50 1 0 50 50 1 0 50
sgemm_ S N N P U 60 60 60 1 0 60 60 1 0 60
sgemm_ S N N P U 70 70 70 1 0 70 70 1 0 70
sgemm_ S N N P U 80 80 80 1 0 80 80 1 0 80
sgemm_ S N N P U 90 90 90 1 0 90 90 1 0 90
sgemm_ S N N P U 100 100 100 1 0 100 100 1 0 100
sgemm_ S N N P U 200 200 200 1 0 200 200 1 0 200
sgemm_ S N N P U 300 300 300 1 0 300 300 1 0 300
sgemm_ S N N P U 400 400 400 1 0 400 400 1 0 400
sgemm_ S N N P U 500 500 500 1 0 500 500 1 0 500
dgemm_ D N N P U 1 1 1 1 0 1 1 1 0 1
dgemm_ D N N P U 2 2 2 1 0 2 2 1 0 2
dgemm_ D N N P U 3 3 3 1 0 3 3 1 0 3
dgemm_ D N N P U 4 4 4 1 0 4 4 1 0 4
dgemm_ D N N P U 5 5 5 1 0 5 5 1 0 5
dgemm_ D N N P U 6 6 6 1 0 6 6 1 0 6
dgemm_ D N N P U 7 7 7 1 0 7 7 1 0 7
dgemm_ D N N P U 8 8 8 1 0 8 8 1 0 8
dgemm_ D N N P U 9 9 9 1 0 9 9 1 0 9
dgemm_ D N N P U 10 10 10 1 0 10 10 1 0 10
dgemm_ D N N P U 20 20 20 1 0 20 20 1 0 20
dgemm_ D N N P U 30 30 30 1 0 30 30 1 0 30
dgemm_ D N N P U 40 40 40 1 0 40 40 1 0 40
dgemm_ D N N P U 50 50 50 1 0 50 50 1 0 50
dgemm_ D N N P U 60 60 60 1 0 60 60 1 0 60
dgemm_ D N N P U 70 70 70 1 0 70 70 1 0 70
dgemm_ D N N P U 80 80 80 1 0 80 80 1 0 80
dgemm_ D N N P U 90 90 90 1 0 90 90 1 0 90
dgemm_ D N N P U 100 100 100 1 0 100 100 1 0 100
dgemm_ D N N P U 200 200 200 1 0 200 200 1 0 200
dgemm_ D N N P U 300 300 300 1 0 300 300 1 0 300
dgemm_ D N N P U 400 400 400 1 0 400 400 1 0 400
dgemm_ D N N P U 500 500 500 1 0 500 500 1 0 500
sgemm_ S N N U P 1 1 1 1 0 1 1 1 0 1
sgemm_ S N N U P 2 2 2 1 0 2 2 1 0 2
sgemm_ S N N U P 3 3 3 1 0 3 3 1 0 3
sgemm_ S N N U P 4 4 4 1 0 4 4 1 0 4
sgemm_ S N N U P 5 5 5 1 0 5 5 1 0 5
sgemm_ S N N U P 6 6 6 1 0 6 6 1 0 6
sgemm_ S N N U P 7 7 7 1 0 7 7 1 0 7
sgemm_ S N N U P 8 8 8 1 0 8 8 1 0 8
sgemm_ S N N U P 9 9 9 1 0 9 9 1 0 9
sgemm_ S N N U P 10 10 10 1 0 10 10 1 0 10
sgemm_ S N N U P 20 20 20 1 0 20 20 1 0 20
sgemm_ S N N U P 30 30 30 1 0 30 30 1 0 30
sgemm_ S N N U P 40 40 40 1 0 40 40 1 0 40
sgemm_ S N N U P 50 50 50 1 0 50 50 1 0 50
sgemm_ S N N U P 60 60 60 1 0 60 60 1 0 60
sgemm_ S N N U P 70 70 70 1 0 70 70 1 0 70
sgemm_ S N N U P 80 80 80 1 0 80 80 1 0 80
sgemm_ S N N U P 90 90 90 1 0 90 90 1 0 90
sgemm_ S N N U P 100 100 100 1 0 100 100 1 0 100
sgemm_ S N N U P 200 200 200 1 0 200 200 1 0 200
sgemm_ S N N U P 300 300 300 1 0 300 300 1 0 300
sgemm_ S N N U P 400 400 400 1 0 400 400 1 0 400
sgemm_ S N N U P 500 500 500 1 0 500 500 1 0 500
dgemm_ D N N U P 1 1 1 1 0 1 1 1 0 1
dgemm_ D N N U P 2 2 2 1 0 2 2 1 0 2
dgemm_ D N N U P 3 3 3 1 0 3 3 1 0 3
dgemm_ D N N U P 4 4 4 1 0 4 4 1 0 4
dgemm_ D N N U P 5 5 5 1 0 5 5 1 0 5
dgemm_ D N N U P 6 6 6 1 0 6 6 1 0 6
dgemm_ D N N U P 7 7 7 1 0 7 7 1 0 7
dgemm_ D N N U P 8 8 8 1 0 8 8 1 0 8
dgemm_ D N N U P 9 9 9 1 0 9 9 1 0 9
dgemm_ D N N U P 10 10 10 1 0 10 10 1 0 10
dgemm_ D N N U P 20 20 20 1 0 20 20 1 0 20
dgemm_ D N N U P 30 30 30 1 0 30 30 1 0 30
dgemm_ D N N U P 40 40 40 1 0 40 40 1 0 40
dgemm_ D N N U P 50 50 50 1 0 50 50 1 0 50
dgemm_ D N N U P 60 60 60 1 0 60 60 1 0 60
dgemm_ D N N U P 70 70 70 1 0 70 70 1 0 70
dgemm_ D N N U P 80 80 80 1 0 80 80 1 0 80
dgemm_ D N N U P 90 90 90 1 0 90 90 1 0 90
dgemm_ D N N U P 100 100 100 1 0 100 100 1 0 100
dgemm_ D N N U P 200 200 200 1 0 200 200 1 0 200
dgemm_ D N N U P 300 300 300 1 0 300 300 1 0 300
dgemm_ D N N U P 400 400 400 1 0 400 400 1 0 400
dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500

View File

@@ -1,4 +1,4 @@
##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
##Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.##
target_sources("${PROJECT_NAME}"
PRIVATE
@@ -26,12 +26,13 @@ target_sources("${PROJECT_NAME}"
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_oapi.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_tapi.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_smart_threading.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute.c
)
# Select AMD specific sources for AMD configurations.
if(${TARGET_ARCH} STREQUAL zen OR
${TARGET_ARCH} STREQUAL zen2 OR
if(${TARGET_ARCH} STREQUAL zen OR
${TARGET_ARCH} STREQUAL zen2 OR
${TARGET_ARCH} STREQUAL zen3 OR
${TARGET_ARCH} STREQUAL zen4 OR
${TARGET_ARCH} STREQUAL zen4 OR
${TARGET_ARCH} STREQUAL amdzen)
target_sources("${PROJECT_NAME}"
PRIVATE

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-22, Advanced Micro Devices, Inc.
Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -105,3 +105,6 @@
// Smart Threading API's.
#include "bli_l3_smart_threading.h"
// BLAS Extension API - Compute
#include "bli_l3_compute.h"

637
frame/3/bli_l3_compute.c Normal file
View File

@@ -0,0 +1,637 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Object-level front end for the GEMM compute operation.
 *
 * Resolves the context and runtime that will be used, forces the runtime
 * to a single thread (only single-threaded execution is supported here),
 * and then hands bli_gemm_compute() together with the operands to
 * bli_l3_compute_thread_decorator().
 *
 *   a, b  - input matrix objects.
 *   beta  - scalar object applied to c.
 *   c     - matrix object that receives the result.
 *   cntx  - context to use; when NULL, bli_gks_query_cntx() supplies one.
 *   rntm  - runtime to use; when NULL, a local runtime is initialised from
 *           the global settings. A non-NULL runtime is copied locally, so
 *           the caller's object is never modified.
 */
void bli_gemm_compute_init
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    if ( bli_error_checking_is_enabled() )
    {
        // @todo: Add call to error checking function here
    }

    // Fall back to the globally registered context when none was given.
    if ( cntx == NULL ) cntx = bli_gks_query_cntx();

    // Always operate on a private runtime: either a copy of the one that
    // was passed in, or a fresh one seeded from the global settings.
    rntm_t rntm_l;
    if ( rntm != NULL ) rntm_l = *rntm;
    else                bli_rntm_init_from_global( &rntm_l );
    rntm = &rntm_l;

    // @todo: AOCL Dynamic yet to be implemented for pack-compute APIs.
#ifdef AOCL_DYNAMIC
    // If dynamic-threading is enabled, calculate optimum number
    // of threads.
    // rntm will be updated with optimum number of threads.
    // bli_nthreads_optimum(a, b, c, BLIS_GEMM, rntm );
#endif

    // Only single-threaded execution is supported, so the thread count in
    // the local runtime is overridden before the ways are derived from it.
    const dim_t single_thread = 1;
    bli_rntm_set_num_threads( single_thread, rntm );

    bli_rntm_set_ways_from_rntm_sup
    (
      bli_obj_length( c ),
      bli_obj_width( c ),
      bli_obj_width( a ),
      rntm
    );

    bli_l3_compute_thread_decorator
    (
      bli_gemm_compute,
      BLIS_GEMM,
      a,
      b,
      beta,
      c,
      cntx,
      rntm
    );
}
/*
 * Unpacks the matrix objects into raw buffers, dimensions and strides,
 * decides whether A or B has to be packed on the go, and dispatches to the
 * typed gemm_compute implementation for float or double.
 *
 *   a, b    - input matrix objects; either may already be packed.
 *   beta    - scalar object applied to c.
 *   c       - result matrix object; its datatype selects the typed variant.
 *   cntx    - context queried for the kernel storage preference.
 *   rntm    - runtime; its pack_a/pack_b flags are read and then rewritten
 *             with the final on-the-go packing decision.
 *   thread  - thread info forwarded to the typed implementation.
 *
 * Always returns BLIS_SUCCESS. For a datatype other than float or double
 * no typed implementation is invoked.
 */
err_t bli_gemm_compute
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
    const num_t dt = bli_obj_dt( c );

    // Problem dimensions taken from C.
    const dim_t m = bli_obj_length( c );
    const dim_t n = bli_obj_width( c );

    void* restrict buf_a = bli_obj_buffer_at_off( a );
    void* restrict buf_b = bli_obj_buffer_at_off( b );

    // The storage combination of C, A and B is only used to query which
    // storage the kernel prefers.
    stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
    const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

    // Whether each input has already been packed (reordered) by the caller.
    const bool packeda = bli_obj_is_packed( a );
    const bool packedb = bli_obj_is_packed( b );

    const bool transa = bli_obj_has_trans( a );
    const bool transb = bli_obj_has_trans( b );

    // TRUE only when A is column-stored and carries no transposition.
    const bool is_col_stored_a = bli_obj_is_col_stored( a ) && !transa;

    // TRUE only when B is row-stored and carries no transposition.
    const bool is_row_stored_b = bli_obj_is_row_stored( b ) && !transb;

    // Start from the packing requests recorded in the runtime, then force
    // on-the-go packing where storage and kernel preference disagree:
    //  - row-preferring kernel: pack B unless it is pre-packed or row-stored;
    //  - column-preferring kernel: pack A unless it is pre-packed or
    //    column-stored.
    bool packa = bli_rntm_pack_a( rntm );
    bool packb = bli_rntm_pack_b( rntm );

    if ( row_pref )
    {
        if ( !( packedb || is_row_stored_b ) ) packb = TRUE;
    }
    else
    {
        if ( !( packeda || is_col_stored_a ) ) packa = TRUE;
    }

    // Strides (and k) as seen after applying any implicit transposition:
    // a transposed operand simply has its row and column strides swapped.
    const bool a_notrans = bli_obj_has_notrans( a );
    const bool b_notrans = bli_obj_has_notrans( b );

    dim_t k = a_notrans ? bli_obj_width( a ) : bli_obj_length( a );

    const inc_t rs_a = a_notrans ? bli_obj_row_stride( a ) : bli_obj_col_stride( a );
    const inc_t cs_a = a_notrans ? bli_obj_col_stride( a ) : bli_obj_row_stride( a );

    const inc_t rs_b = b_notrans ? bli_obj_row_stride( b ) : bli_obj_col_stride( b );
    const inc_t cs_b = b_notrans ? bli_obj_col_stride( b ) : bli_obj_row_stride( b );

    void* restrict buf_c = bli_obj_buffer_at_off( c );
    const inc_t rs_c = bli_obj_row_stride( c );
    const inc_t cs_c = bli_obj_col_stride( c );

    void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );

    // Record the final packing decision in the runtime.
    bli_rntm_set_pack_a( packa ? 1 : 0, rntm );
    bli_rntm_set_pack_b( packb ? 1 : 0, rntm );

    if ( bli_is_float( dt ) )
    {
        PASTEMAC( s, gemm_compute )
        (
          packa, packb,
          packeda, packedb,
          m, n, k,
          buf_a, rs_a, cs_a,
          buf_b, rs_b, cs_b,
          buf_beta,
          buf_c, rs_c, cs_c,
          BLIS_RRR, // Using BLIS_RRR since we want to redirect to m kernels.
          cntx,
          rntm,
          thread
        );
    }
    else if ( bli_is_double( dt ) )
    {
        PASTEMAC( d, gemm_compute )
        (
          packa, packb,
          packeda, packedb,
          m, n, k,
          buf_a, rs_a, cs_a,
          buf_b, rs_b, cs_b,
          buf_beta,
          buf_c, rs_c, cs_c,
          BLIS_RRR, // Using BLIS_RRR since we want to redirect to m kernels.
          cntx,
          rntm,
          thread
        );
    }

    return BLIS_SUCCESS;
}
/*
   Typed ?gemm_compute variant (instantiated below for the s and d datatypes).

   Walks the jc (n, NC) -> pc (k, KC) -> ic (m, MC) -> jr (NR) loop nest and
   invokes the gemmsup millikernel queried from the context for stor_id. The
   kernel is always called with a unit alpha (one_local); beta is applied only
   on the first pc iteration and one_local is used on later ones.

   packa / packb     : forwarded to packm_sup_a / packm_sup_b and used to pick
                       the bszid_t array and thrinfo_t nodes.
   packeda / packedb : the operand buffer is used as-is; packm_sup is skipped
                       and the buffer is indexed with MR/NR-based strides.
   m, n, k           : problem dimensions.
   a, b, c           : operand buffers with their row/column strides.
   beta              : pointer to the scalar applied to C.
   stor_id           : storage combination used to query the sup kernel.
   cntx, rntm, thread: context, runtime and thread info forwarded to the
                       blocksize queries, packing routines and kernel.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC( ch, varname ) \
( \
bool packa, \
bool packb, \
bool packeda, \
bool packedb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC( ch, type ); \
\
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* @todo Add early return for k < 1 or alpha = 0 here. */ \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* @note: Modifications of KC are just a part of optimizations.
Such optimizations have been removed for simplicity and will be a part
of the optimizations patch. */ \
dim_t KC; \
KC = KC0; \
\
/* Query the maximum blocksize for NR, which implies a maximum blocksize
extension for the final iteration. */ \
const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
const dim_t NRE = NRM - NR; \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
/* Per-column step used to offset into b when it is consumed as an
already-packed buffer (see b_jc_use and the packedb branch below). */ \
const inc_t jcstep_b_use = k; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
/* Per-k step used to offset into a when it is consumed as an
already-packed buffer: m rounded up to a multiple of MR (see a_pc_use
and the packeda branch below). */ \
const inc_t pcstep_a_use = ( ( m + MR - 1 ) / MR ) * MR; \
\
const inc_t jrstep_c = cs_c * NR; \
\
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of beta and one scalars to prevent any unnecessary
sharing of cache lines between the cores' caches. */ \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
\
auxinfo_t aux; \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree. */ \
/* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t* restrict bszids; \
\
/* Set the bszids pointer to the correct bszids array above based on which
matrices (if any) are being packed. */ \
\
if ( packa ) { if ( packb ) bszids = bszids_packab; \
else bszids = bszids_packa; } \
else { if ( packb ) bszids = bszids_packb; \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jc = bszids; \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
const dim_t n_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
/* Per-k step within an already-packed B for this jc block: nc_cur
rounded up to a multiple of NR. */ \
const inc_t pcstep_b_use = ( ( nc_cur + NR - 1 ) / NR ) * NR; \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict b_jc_use = b_00 + jj * jcstep_b_use; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_pc = &bszids_jc[1]; \
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
/* Compute the PC loop thread range for the current thread. */ \
const dim_t pc_start = 0, pc_end = k; \
const dim_t k_local = k; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
const dim_t pc_left = k_local % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
/*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
/* Per-row step within an already-packed A for this pc block. */ \
const inc_t icstep_a_use = kc_cur; \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
ctype* restrict b_pc_use; \
ctype* restrict a_pc_use = a_00 + pp * pcstep_a_use; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing B. If we won't be packing B, we alias to
the _pc variables so that code further down can unconditionally
reference the _pb variables. Note that *if* we will be packing
B, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pb; \
if ( packb ) { bszids_pb = &bszids_pc[1]; \
thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
else { bszids_pb = &bszids_pc[0]; \
thread_pb = thread_pc; } \
\
/* Determine the packing buffer and related parameters for matrix
B. (If B will not be packed, then b_use will be set to point to
b and the _b_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
\
/* packedb == TRUE indicates that B is already packed (reordered), so
only the strides and pointers describing the packed buffer are set.
Else, call the packm routine, which packs B on-the-go when packb is
set. */ \
if ( packedb ) \
{ \
rs_b_use = NR; \
cs_b_use = 1; \
ps_b_use = kc_cur * NR; \
b_pc_use = b_jc_use + pp * pcstep_b_use; \
} else \
{ \
PASTEMAC(ch,packm_sup_b) \
( \
packb, \
BLIS_BUFFER_FOR_B_PANEL, \
stor_id, \
BLIS_NO_TRANSPOSE, \
KC, NC, \
kc_cur, nc_cur, NR, \
&one_local, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
b_pc_use = b_use; \
} \
\
/* Embed the panel stride of B within the auxinfo_t object. Note that
this variant also iterates through B itself in the jr loop below
(b_jr advances by ps_b_use), within the macrokernel rather than
within the millikernel. */ \
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_ic = &bszids_pb[1]; \
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
const dim_t m_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
ctype* restrict a_ic_use; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing A. If we won't be packing A, we alias to
the _ic variables so that code further down can unconditionally
reference the _pa variables. Note that *if* we will be packing
A, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pa; \
if ( packa ) { bszids_pa = &bszids_ic[1]; \
thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
else { bszids_pa = &bszids_ic[0]; \
thread_pa = thread_ic; } \
\
/* Determine the packing buffer and related parameters for matrix
A. (If A will not be packed, then a_use will be set to point to
a and the _a_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
/* packeda == TRUE indicates that A is already packed (reordered), so
only the strides and pointers describing the packed buffer are set.
Else, call the packm routine, which packs A on-the-go when packa is
set. */ \
if ( packeda ) \
{ \
rs_a_use = 1; \
cs_a_use = MR; \
ps_a_use = MR * kc_cur; \
a_ic_use = a_pc_use + ii * icstep_a_use; \
} \
else \
{ \
PASTEMAC(ch,packm_sup_a) \
( \
packa, \
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
stor_id, /* a "block of A." */ \
BLIS_NO_TRANSPOSE, \
MC, KC, /* This "block of A" is (at most) MC x KC. */ \
mc_cur, kc_cur, MR, \
&one_local, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
a_ic_use = a_use; \
} \
\
/* Embed the panel stride of A within the auxinfo_t object. The
millikernel will query and use this to iterate through
micropanels of A (if needed). */ \
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jr = &bszids_pa[1]; \
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* An optimization: allow the last jr iteration to contain up to NRE
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
these cases.) Note that this prevents us from declaring jr_iter and
jr_left as const. NOTE: We forgo this optimization when packing B
since packing an extended edge case is not yet supported; the
condition below also skips it when more than one thread is in use. */ \
if ( !packb && !is_mt ) \
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
{ \
jr_iter--; jr_left += NR; \
} \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
{ \
/* Invoke the gemmsup millikernel. The alpha argument is always
one_local; beta_use is beta only on the first pc iteration. */ \
gemmsup_ker \
( \
BLIS_NO_CONJUGATE, \
BLIS_NO_CONJUGATE, \
mc_cur, \
nr_cur, \
kc_cur, \
&one_local, \
a_ic_use, rs_a_use, cs_a_use, \
b_jr, rs_b_use, cs_b_use, \
beta_use, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* NOTE: This barrier is only needed if we are packing B (since
that matrix is packed within the pc loop of this variant). */ \
if ( packb ) bli_thread_barrier( thread_pb ); \
} \
} \
\
/* Release any memory that was acquired for packing matrices A and B. */ \
/* NOTE(review): thread_pa and thread_pb are only assigned inside the
pc/ic loops above. If those loops never execute (e.g. k == 0) they are
still NULL here; confirm the finalize routines tolerate a NULL
thrinfo_t. */ \
PASTEMAC(ch,packm_sup_finalize_mem_a) \
( \
packa, \
rntm, \
&mem_a, \
thread_pa \
); \
PASTEMAC(ch,packm_sup_finalize_mem_b) \
( \
packb, \
rntm, \
&mem_b, \
thread_pb \
); \
}
INSERT_GENTFUNC_BASIC0_SD( gemm_compute )

80
frame/3/bli_l3_compute.h Normal file
View File

@@ -0,0 +1,80 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point for the gemm_compute operation.
// The BLAS-style wrappers (?gemm_compute_blis_impl) call this with the
// A, B, beta and C objects and pass NULL for both cntx and rntm.
// NOTE(review): presumably resolves the context/runtime and then invokes
// bli_gemm_compute(); confirm against the definition in bli_l3_compute.c.
void bli_gemm_compute_init
(
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
// Object-based gemm_compute driver.
// Records the packing status for A and B in rntm, then dispatches on the
// datatype to the typed s/d gemm_compute variant using BLIS_RRR as the
// storage id. Returns BLIS_SUCCESS.
// NOTE(review): for datatypes other than float/double no typed variant is
// invoked and BLIS_SUCCESS is still returned; confirm this is intended.
err_t bli_gemm_compute
(
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Prototype BLAS-like interfaces with void pointer operands.
// Each generated prototype is the typed ?gemm_compute variant:
//   packa/packb     - whether A/B should be packed on the fly,
//   packeda/packedb - whether A/B are supplied as already-packed buffers,
//   m, n, k         - problem dimensions,
//   a, b, c         - operand buffers with their row/column strides,
//   beta            - pointer to the scalar applied to C,
//   stor_id         - storage combination used to select the sup kernel,
//   cntx/rntm/thread- context, runtime and thread info.
// NOTE(review): the definitions are instantiated with
// INSERT_GENTFUNC_BASIC0_SD (s and d only); confirm that the set of
// prototypes generated by INSERT_GENTPROT_BASIC0 below is the intended one.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC( ch, varname ) \
( \
bool packa, \
bool packb, \
bool packeda, \
bool packedb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( gemm_compute )

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -84,6 +84,7 @@ BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_t
if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE;
else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE;
else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE;
else if ( trans == 'p' || trans == 'P' ) *blis_trans = BLIS_PACKED;
else
{
// See comment for bli_param_map_netlib_to_blis_side() above.

View File

@@ -30,11 +30,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_imatcopy.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy2.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_omatadd.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute.c
)
# Select AMD specific sources for AMD configurations.
if(${TARGET_ARCH} STREQUAL zen OR
${TARGET_ARCH} STREQUAL zen2 OR
if(${TARGET_ARCH} STREQUAL zen OR
${TARGET_ARCH} STREQUAL zen2 OR
${TARGET_ARCH} STREQUAL zen3 OR
${TARGET_ARCH} STREQUAL zen4 OR
${TARGET_ARCH} STREQUAL amdzen)
@@ -49,8 +52,6 @@ ${TARGET_ARCH} STREQUAL amdzen)
${CMAKE_CURRENT_SOURCE_DIR}/bla_scal_amd.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_swap_amd.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_amd.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c
)
else()
target_sources("${PROJECT_NAME}"

View File

@@ -0,0 +1,285 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// BLAS Extension APIs
/* ?gemm_compute.h */
/* BLAS interface to compute matrix-matrix product */
/* Datatype : s & d (single and double precision only supported) */
/* BLAS Extensions */
/* output is the gemm result */
#include "blis.h"
/*
 * sgemm_compute_blis_impl
 *
 * Single-precision gemm_compute entry point taking explicit row and column
 * strides for A, B and C. Validates the arguments, wraps the operands in
 * BLIS objects and hands them to bli_gemm_compute_init() with a NULL
 * context and a NULL runtime.
 *
 *   transa, transb : BLAS trans characters for A and B ('N', 'T', 'C', or
 *                    'P' for an already-packed operand).
 *   m, n, k        : problem dimensions.
 *   a, b, c        : operand buffers with their row/column strides.
 *   beta           : pointer to the scalar applied to C.
 */
void sgemm_compute_blis_impl
     (
       const f77_char* transa,
       const f77_char* transb,
       const f77_int*  m,
       const f77_int*  n,
       const f77_int*  k,
       const float*    a, const f77_int* rs_a, const f77_int* cs_a,
       const float*    b, const f77_int* rs_b, const f77_int* cs_b,
       const float*    beta,
             float*    c, const f77_int* rs_c, const f77_int* cs_c
     )
{
    /* Initialize BLIS. */
    bli_init_auto();

    // @todo: Add AOCL DTL logs (trace entry and a GEMM-inputs log) here.

    /* Leading dimension handed to the parameter check for A and B: the row
       stride when it is not unit, otherwise the column stride. */
    const f77_int* ld_a = ( *rs_a == 1 ) ? cs_a : rs_a;
    const f77_int* ld_b = ( *rs_b == 1 ) ? cs_b : rs_b;

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemm_compute)
    (
      MKSTR(s),
      MKSTR(gemm),
      transa,
      transb,
      m,
      n,
      k,
      ld_a,
      ld_b,
      rs_c, cs_c
    );

    /* Quick return if possible: C has no elements. */
    if ( *m == 0 || *n == 0 )
    {
        /* Finalize BLIS. */
        bli_finalize_auto();
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    trans_t trans_a;
    trans_t trans_b;
    bli_param_map_netlib_to_blis_trans( *transa, &trans_a );
    bli_param_map_netlib_to_blis_trans( *transb, &trans_b );

    /* Typecast BLAS integers to BLIS integers. */
    dim_t m_blis, n_blis, k_blis;
    bli_convert_blas_dim1( *m, m_blis );
    bli_convert_blas_dim1( *n, n_blis );
    bli_convert_blas_dim1( *k, k_blis );

    obj_t a_obj    = BLIS_OBJECT_INITIALIZER;
    obj_t b_obj    = BLIS_OBJECT_INITIALIZER;
    obj_t beta_obj = BLIS_OBJECT_INITIALIZER_1X1;
    obj_t c_obj    = BLIS_OBJECT_INITIALIZER;

    /* Dimensions used to initialize the A and B objects, derived from
       (m, k) and (k, n) together with the mapped trans values. */
    dim_t a_rows, a_cols;
    dim_t b_rows, b_cols;
    bli_set_dims_with_trans( trans_a, m_blis, k_blis, &a_rows, &a_cols );
    bli_set_dims_with_trans( trans_b, k_blis, n_blis, &b_rows, &b_cols );

    /* Attach the caller's buffers and strides to the objects. */
    bli_obj_init_finish_1x1( BLIS_FLOAT, (float*)beta, &beta_obj );
    bli_obj_init_finish( BLIS_FLOAT, a_rows, a_cols, (float*)a, *rs_a, *cs_a, &a_obj );
    bli_obj_init_finish( BLIS_FLOAT, b_rows, b_cols, (float*)b, *rs_b, *cs_b, &b_obj );
    bli_obj_init_finish( BLIS_FLOAT, m_blis, n_blis, (float*)c, *rs_c, *cs_c, &c_obj );

    /* Record the trans values on the operand objects. */
    bli_obj_set_conjtrans( trans_a, &a_obj );
    bli_obj_set_conjtrans( trans_b, &b_obj );

    /* Run the operation with a NULL context and a NULL runtime. */
    PASTEMAC0( gemm_compute_init )
    (
      &a_obj,
      &b_obj,
      &beta_obj,
      &c_obj,
      NULL,
      NULL
    );

    /* Finalize BLIS. */
    bli_finalize_auto();
}
#ifdef BLIS_ENABLE_BLAS
void sgemm_compute_
(
const f77_char* transa,
const f77_char* transb,
const f77_int* m,
const f77_int* n,
const f77_int* k,
const float* a, const f77_int* lda,
const float* b, const f77_int* ldb,
const float* beta,
float* c, const f77_int* ldc
)
{
f77_int rs_a = 1;
f77_int rs_b = 1;
f77_int rs_c = 1;
sgemm_compute_blis_impl( transa,
transb,
m,
n,
k,
a, &rs_a, lda,
b, &rs_b, ldb,
beta,
c, &rs_c, ldc );
}
#endif
/*
 * dgemm_compute_blis_impl
 *
 * Double-precision gemm_compute entry point taking explicit row and column
 * strides for A, B and C. Validates the arguments, wraps the operands in
 * BLIS objects and hands them to bli_gemm_compute_init() with a NULL
 * context and a NULL runtime.
 *
 *   transa, transb : BLAS trans characters for A and B ('N', 'T', 'C', or
 *                    'P' for an already-packed operand).
 *   m, n, k        : problem dimensions.
 *   a, b, c        : operand buffers with their row/column strides.
 *   beta           : pointer to the scalar applied to C.
 */
void dgemm_compute_blis_impl
     (
       const f77_char* transa,
       const f77_char* transb,
       const f77_int*  m,
       const f77_int*  n,
       const f77_int*  k,
       const double*   a, const f77_int* rs_a, const f77_int* cs_a,
       const double*   b, const f77_int* rs_b, const f77_int* cs_b,
       const double*   beta,
             double*   c, const f77_int* rs_c, const f77_int* cs_c
     )
{
    /* Initialize BLIS. */
    bli_init_auto();

    // @todo: Add AOCL DTL logs (trace entry and a GEMM-inputs log) here.

    /* Leading dimension handed to the parameter check for A and B: the row
       stride when it is not unit, otherwise the column stride. */
    const f77_int* ld_a = ( *rs_a == 1 ) ? cs_a : rs_a;
    const f77_int* ld_b = ( *rs_b == 1 ) ? cs_b : rs_b;

    /* Perform BLAS parameter checking. */
    PASTEBLACHK(gemm_compute)
    (
      MKSTR(d),
      MKSTR(gemm),
      transa,
      transb,
      m,
      n,
      k,
      ld_a,
      ld_b,
      rs_c, cs_c
    );

    /* Quick return if possible: C has no elements. */
    if ( *m == 0 || *n == 0 )
    {
        /* Finalize BLIS. */
        bli_finalize_auto();
        return;
    }

    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
    trans_t trans_a;
    trans_t trans_b;
    bli_param_map_netlib_to_blis_trans( *transa, &trans_a );
    bli_param_map_netlib_to_blis_trans( *transb, &trans_b );

    /* Typecast BLAS integers to BLIS integers. */
    dim_t m_blis, n_blis, k_blis;
    bli_convert_blas_dim1( *m, m_blis );
    bli_convert_blas_dim1( *n, n_blis );
    bli_convert_blas_dim1( *k, k_blis );

    obj_t a_obj    = BLIS_OBJECT_INITIALIZER;
    obj_t b_obj    = BLIS_OBJECT_INITIALIZER;
    obj_t beta_obj = BLIS_OBJECT_INITIALIZER_1X1;
    obj_t c_obj    = BLIS_OBJECT_INITIALIZER;

    /* Dimensions used to initialize the A and B objects, derived from
       (m, k) and (k, n) together with the mapped trans values. */
    dim_t a_rows, a_cols;
    dim_t b_rows, b_cols;
    bli_set_dims_with_trans( trans_a, m_blis, k_blis, &a_rows, &a_cols );
    bli_set_dims_with_trans( trans_b, k_blis, n_blis, &b_rows, &b_cols );

    /* Attach the caller's buffers and strides to the objects. */
    bli_obj_init_finish_1x1( BLIS_DOUBLE, (double*)beta, &beta_obj );
    bli_obj_init_finish( BLIS_DOUBLE, a_rows, a_cols, (double*)a, *rs_a, *cs_a, &a_obj );
    bli_obj_init_finish( BLIS_DOUBLE, b_rows, b_cols, (double*)b, *rs_b, *cs_b, &b_obj );
    bli_obj_init_finish( BLIS_DOUBLE, m_blis, n_blis, (double*)c, *rs_c, *cs_c, &c_obj );

    /* Record the trans values on the operand objects. */
    bli_obj_set_conjtrans( trans_a, &a_obj );
    bli_obj_set_conjtrans( trans_b, &b_obj );

    /* Run the operation with a NULL context and a NULL runtime. */
    PASTEMAC0( gemm_compute_init )
    (
      &a_obj,
      &b_obj,
      &beta_obj,
      &c_obj,
      NULL,
      NULL
    );

    /* Finalize BLIS. */
    bli_finalize_auto();
}
#ifdef BLIS_ENABLE_BLAS
/*
 * dgemm_compute_
 *
 * BLAS-style (leading-dimension) double-precision gemm_compute interface.
 * Forwards to dgemm_compute_blis_impl() with a unit row stride for A, B and
 * C and with lda, ldb and ldc as the respective column strides.
 */
BLIS_EXPORT_BLAS void dgemm_compute_
     (
       const f77_char* transa,
       const f77_char* transb,
       const f77_int*  m,
       const f77_int*  n,
       const f77_int*  k,
       const double*   a, const f77_int* lda,
       const double*   b, const f77_int* ldb,
       const double*   beta,
             double*   c, const f77_int* ldc
     )
{
    /* The implementation only reads the stride arguments, so a single unit
       row stride serves all three operands. */
    const f77_int unit_rs = 1;

    dgemm_compute_blis_impl( transa,
                             transb,
                             m,
                             n,
                             k,
                             a, &unit_rs, lda,
                             b, &unit_rs, ldb,
                             beta,
                             c, &unit_rs, ldc );
}
#endif

View File

@@ -0,0 +1,72 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// BLAS Extension APIs
/* ?gemm_compute.h */
/* BLAS interface to compute matrix-matrix product */
/* Datatype : s & d (single and double precision only supported) */
/* BLAS Extensions */
/* output is the gemm result */
/* Prototype generator for the ?gemm_compute BLAS extension.
   For each datatype it declares two exported functions:
     - PASTEF77(ch,blasname): the leading-dimension interface (lda/ldb/ldc),
       declared only through IF_BLIS_ENABLE_BLAS;
     - PASTEF77S(ch,blasname): the interface taking explicit row and column
       strides for A, B and C.
   NOTE(review): PASTEF77S presumably expands to the ?gemm_compute_blis_impl
   names defined in bla_gemm_compute.c; confirm against the macro
   definition. */
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
IF_BLIS_ENABLE_BLAS(\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* n, \
const f77_int* k, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
); \
)\
BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
( \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* n, \
const f77_int* k, \
const ftype* a, const f77_int* rs_a, const f77_int* cs_a, \
const ftype* b, const f77_int* rs_b, const f77_int* cs_b, \
const ftype* beta, \
ftype* c, const f77_int* rs_c, const f77_int* cs_c \
);
INSERT_GENTPROTRO_BLAS( gemm_compute )

View File

@@ -183,6 +183,7 @@
#include "bla_trmm.h"
#include "bla_trsm.h"
#include "bla_gemmt.h"
#include "bla_gemm_compute.h"
#include "bla_gemm_check.h"
#include "bla_hemm_check.h"
@@ -194,6 +195,7 @@
#include "bla_trmm_check.h"
#include "bla_trsm_check.h"
#include "bla_gemmt_check.h"
#include "bla_gemm_compute_check.h"
// -- Batch Extension prototypes --
#include "bla_gemm_batch.h"

View File

@@ -48,6 +48,8 @@ enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
/* Marks an operand of the pack/compute APIs as already packed; the
   cblas_?gemm_compute documentation lists CblasPacked as a value accepted
   for TransA/TransB. */
enum CBLAS_STORAGE {CblasPacked=151};
/* Selects which GEMM operand (A or B) a pack routine operates on. */
enum CBLAS_IDENTIFIER {CblasAMatrix=161, CblasBMatrix=162};
#ifdef __cplusplus
extern "C" {
@@ -993,6 +995,190 @@ BLIS_EXPORT_BLAS f77_int cblas_idamin(f77_int N, const double *X, f77_int incX);
BLIS_EXPORT_BLAS f77_int cblas_icamin(f77_int N, const void *X, f77_int incX);
BLIS_EXPORT_BLAS f77_int cblas_izamin(f77_int N, const void *X, f77_int incX);
// -- PACK COMPUTE APIs --
/** \addtogroup INTERFACE CBLAS INTERFACE
* @{
*/
/**
* cblas_sgemm_pack_get_size calculates and returns the number of bytes necessary
* to store the specified matrix after packing.
*
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
* @return The size in bytes required to store the specified matrix after packing.
*/
BLIS_EXPORT_BLAS f77_int cblas_sgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier,
const f77_int M, const f77_int N, const f77_int K);
/**
* cblas_dgemm_pack_get_size calculates and returns the number of bytes necessary
* to store the specified matrix after packing.
*
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
* @return The size in bytes required to store the specified matrix after packing.
*/
BLIS_EXPORT_BLAS f77_int cblas_dgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier,
const f77_int M, const f77_int N, const f77_int K);
/**
* cblas_sgemm_pack scales by alpha and packs the specified matrix into the
* allocated buffer. It is imperative to allocate a buffer of type float and size
* as returned by the cblas_sgemm_pack_get_size() before invoking this routine.
*
* @note If both the matrices are to be packed, the user must ensure that only
* one matrix is packed with the scalar alpha and the other with a unit-scalar.
*
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication:
* if trans = CblasNoTrans, then Mat(X) = X;
* if trans = CblasTrans, then Mat(X) = \f$X^T\f$;
* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$.
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
* @param[in] alpha Specifies the scalar alpha.
* @param[in] src The matrix to be packed.
* @param[in] ld Specifies the leading dimension of the matrix to be packed.
* @param[out] dest The buffer to store the scaled and packed matrix.
* @return None
*/
BLIS_EXPORT_BLAS void cblas_sgemm_pack(enum CBLAS_ORDER Order,
enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans,
const f77_int M, const f77_int N, const f77_int K,
const float alpha, const float *src, const f77_int ld,
float* dest );
/**
* cblas_dgemm_pack scales by alpha and packs the specified matrix into the
* allocated buffer. It is imperative to allocate a buffer of type double and
* size as returned by the cblas_dgemm_pack_get_size() before invoking this
* routine.
*
* @note If both the matrices are to be packed, the user must ensure that only
* one matrix is packed with the scalar alpha and the other with a unit-scalar.
*
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication:
* if trans = CblasNoTrans, then Mat(X) = X;
* if trans = CblasTrans, then Mat(X) = \f$X^T\f$;
* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$.
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
* @param[in] alpha Specifies the scalar alpha.
* @param[in] src The matrix to be packed.
* @param[in] ld Specifies the leading dimension of the matrix to be packed.
* @param[out] dest The buffer to store the scaled and packed matrix.
* @return None
*/
BLIS_EXPORT_BLAS void cblas_dgemm_pack(enum CBLAS_ORDER Order,
enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans,
const f77_int M, const f77_int N, const f77_int K,
const double alpha, const double *src, const f77_int ld,
double* dest );
/**
* cblas_sgemm_compute computes the matrix-matrix product where one or both the
* input matrices are packed and adds this to the scalar-matrix product. This
* operation is defined as:
* C := Mat(A) * Mat(B) + beta*C,
* where,
* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
* beta is a scalar,
* A, B and C are matrices:
* Mat(A) is an mxk matrix, or a packed matrix buffer,
* Mat(B) is a kxn matrix, or a packed matrix buffer,
* C is an mxn matrix.
*
* @note In case both the matrices are to be packed, the user must ensure that
* only one matrix is packed with alpha scalar and the other with a unit-scalar,
* during the packing process.
*
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
* if transa = CblasNoTrans, then Mat(A) = A;
* if transa = CblasTrans, then Mat(A) = \f$A^T\f$;
* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$;
* if transa = CblasPacked, then A matrix is packed and lda is ignored.
* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
* if transb = CblasNoTrans, then Mat(B) = B;
* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$;
* if transb = CblasPacked, then B matrix is packed and ldb is ignored.
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
* @param[in] A The array is float matrix A or a buffer with packed matrix A.
* @param[in] lda Specifies the leading dimension of A. Ignored when TransA = CblasPacked.
* @param[in] B The array is float matrix B or a buffer with packed matrix B.
* @param[in] ldb Specifies the leading dimension of B. Ignored when TransB = CblasPacked.
* @param[in] beta Specifies the scalar beta.
* @param[in,out] C The array is float matrix C.
* @param[in] ldc Specifies the leading dimension of C.
* @return None
*/
BLIS_EXPORT_BLAS void cblas_sgemm_compute(enum CBLAS_ORDER Order,
f77_int TransA, f77_int TransB,
const f77_int M, const f77_int N, const f77_int K,
const float* A, f77_int lda, const float* B, f77_int ldb,
float beta, float* C, f77_int ldc);
/**
* cblas_dgemm_compute computes the matrix-matrix product where one or both the
* input matrices are packed and adds this to the scalar-matrix product. This
* operation is defined as:
* C := Mat(A) * Mat(B) + beta*C,
* where,
* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
* beta is a scalar,
* A, B and C are matrices:
* Mat(A) is an mxk matrix, or a packed matrix buffer,
* Mat(B) is a kxn matrix, or a packed matrix buffer,
* C is an mxn matrix.
*
* @note In case both the matrices are to be packed, the user must ensure that
* only one matrix is packed with alpha scalar and the other with a unit-scalar,
* during the packing process.
*
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
* if transa = CblasNoTrans, then Mat(A) = A;
* if transa = CblasTrans, then Mat(A) = \f$A^T\f$;
* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$;
* if transa = CblasPacked, then A matrix is packed and lda is ignored.
* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
* if transb = CblasNoTrans, then Mat(B) = B;
* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$;
* if transb = CblasPacked, then B matrix is packed and ldb is ignored.
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
* @param[in] A The array is double matrix A or a buffer with packed matrix A.
* @param[in] lda Specifies the leading dimension of A. Ignored when TransA = CblasPacked.
* @param[in] B The array is double matrix B or a buffer with packed matrix B.
* @param[in] ldb Specifies the leading dimension of B. Ignored when TransB = CblasPacked.
* @param[in] beta Specifies the scalar beta.
* @param[in,out] C The array is double matrix C.
* @param[in] ldc Specifies the leading dimension of C.
* @return None
*/
BLIS_EXPORT_BLAS void cblas_dgemm_compute(enum CBLAS_ORDER Order,
f77_int TransA, f77_int TransB,
const f77_int M, const f77_int N, const f77_int K,
const double* A, f77_int lda, const double* B, f77_int ldb,
double beta, double* C, f77_int ldc);
/** @}*/
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,172 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
#include "cblas.h"
#include "cblas_f77.h"
/*
 * cblas_dgemm_compute
 *
 * CBLAS wrapper around F77_dgemm_compute (double precision). It validates
 * Order, TransA and TransB, converts the transpose selectors into the
 * one-character flags 'N', 'T' or 'P' (packed), and forwards the call
 * together with explicit row/column strides for A, B and C.
 *
 * Parameters
 *   Order       CblasColMajor or CblasRowMajor.
 *   TransA      CblasNoTrans, CblasTrans, CblasConjTrans or CblasPacked.
 *   TransB      Same set of values, applied to B.
 *   M, N, K     Problem dimensions, forwarded unchanged.
 *   A, lda      Matrix A (or packed buffer) and its leading dimension.
 *   B, ldb      Matrix B (or packed buffer) and its leading dimension.
 *   beta        Scalar applied to C.
 *   C, ldc      Output matrix and its leading dimension.
 *
 * Error handling
 *   An unrecognised Order, TransA or TransB is reported through
 *   cblas_xerbla using the argument's position in this signature (1, 2
 *   or 3) and the function returns without calling F77_dgemm_compute.
 *
 * Global state
 *   CBLAS_CallFromC and RowMajorStrg are set while the call is in progress
 *   and are cleared again on every exit path, including the successful one.
 */
BLIS_EXPORT_BLAS void cblas_dgemm_compute( enum CBLAS_ORDER Order,
                                           f77_int TransA,
                                           f77_int TransB,
                                           const f77_int M, const f77_int N,
                                           const f77_int K,
                                           const double* A, f77_int lda,
                                           const double* B, f77_int ldb,
                                           double beta,
                                           double* C, f77_int ldc )
{
    char TA, TB;
#ifdef F77_CHAR
    F77_CHAR F77_TA, F77_TB;
#else
    #define F77_TA &TA
    #define F77_TB &TB
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
    F77_INT F77_ldc=ldc;
#else
    #define F77_M M
    #define F77_N N
    #define F77_K K
    #define F77_lda lda
    #define F77_ldb ldb
    #define F77_ldc ldc
#endif

    extern int CBLAS_CallFromC;
    extern int RowMajorStrg;

    RowMajorStrg = 0;
    CBLAS_CallFromC = 1;

    if ( Order == CblasColMajor ) // CblasColMajor
    {
        // Real data: conjugate-transpose is passed on as a plain transpose.
        if      ( TransA == CblasTrans )     TA='T';
        else if ( TransA == CblasConjTrans ) TA='T';
        else if ( TransA == CblasNoTrans )   TA='N';
        else if ( TransA == CblasPacked )    TA='P';
        else
        {
            cblas_xerbla(2, "cblas_dgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( TransB == CblasTrans )     TB='T';
        else if ( TransB == CblasConjTrans ) TB='T';
        else if ( TransB == CblasNoTrans )   TB='N';
        else if ( TransB == CblasPacked )    TB='P';
        else
        {
            cblas_xerbla(3, "cblas_dgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        // Column-major: unit row stride everywhere, leading dimension as
        // the column stride.
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int rs_c = 1;

        F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc);
    }
    else if ( Order == CblasRowMajor ) // CblasRowMajor
    {
        RowMajorStrg = 1;

        // Row-major, A not already packed: toggle the transpose flag so the
        // row-major buffer can be passed with the same (1, lda) strides.
        if      ( TransA == CblasPacked )    TA='P';
        else if ( TransA == CblasTrans )     TA='N';
        else if ( TransA == CblasNoTrans )   TA='T';
        else if ( TransA == CblasConjTrans ) TA='N';
        else
        {
            cblas_xerbla(2, "cblas_dgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        // Row-major, B not already packed: same toggling as for A.
        if      ( TransB == CblasPacked )    TB='P';
        else if ( TransB == CblasTrans )     TB='N';
        else if ( TransB == CblasNoTrans )   TB='T';
        else if ( TransB == CblasConjTrans ) TB='N';
        else
        {
            // TransB is the 3rd argument, matching the column-major branch.
            cblas_xerbla(3, "cblas_dgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        // C is passed with row stride ldc and unit column stride.
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int cs_c = 1;

        F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c );
    }
    else
    {
        cblas_xerbla(1, "cblas_dgemm_compute",
                     "Illegal Order setting, %d\n", Order);
    }

    // Clear the call-tracking flags on every path, including success, so
    // they do not stay set for subsequent calls.
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
}
#endif

View File

@@ -0,0 +1,157 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
#include "cblas.h"
#include "cblas_f77.h"
/*
 * cblas_dgemm_pack
 *
 * CBLAS wrapper around F77_dgemm_pack (double precision). It validates
 * Order, Trans and Identifier, converts them into the one-character flags
 * expected by F77_dgemm_pack and forwards the call.
 *
 * Parameters
 *   Order       CblasColMajor or CblasRowMajor.
 *   Identifier  CblasAMatrix or CblasBMatrix; forwarded as 'A' or 'B'.
 *   Trans       CblasNoTrans, CblasTrans or CblasConjTrans.
 *   M, N, K     Problem dimensions, forwarded unchanged.
 *   alpha       Scalar forwarded to the pack routine.
 *   src, ld     Source matrix and its leading dimension.
 *   dest        Destination buffer for the packed matrix.
 *
 * Error handling
 *   An unrecognised Order, Identifier or Trans is reported through
 *   cblas_xerbla using the argument's position in this signature (1, 2
 *   or 3) and the function returns without calling F77_dgemm_pack.
 *
 * Global state
 *   CBLAS_CallFromC and RowMajorStrg are set while the call is in progress
 *   and cleared again on every exit path.
 */
BLIS_EXPORT_BLAS void cblas_dgemm_pack( enum CBLAS_ORDER Order,
                                        enum CBLAS_IDENTIFIER Identifier,
                                        enum CBLAS_TRANSPOSE Trans,
                                        const f77_int M,
                                        const f77_int N,
                                        const f77_int K,
                                        const double alpha,
                                        const double* src, const f77_int ld,
                                        double* dest )
{
    char TR;
    char ID;
#ifdef F77_CHAR
    F77_CHAR F77_TR;
    F77_CHAR F77_ID;
#else
    #define F77_TR &TR
    #define F77_ID &ID
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld;
#else
    #define F77_M M
    #define F77_N N
    #define F77_K K
    #define F77_ld ld
#endif

    extern int CBLAS_CallFromC;
    extern int RowMajorStrg;

    RowMajorStrg = 0;
    CBLAS_CallFromC = 1;

    if ( Order == CblasColMajor ) // CblasColMajor
    {
        // Real data: conjugate-transpose is passed on as a plain transpose.
        if      ( Trans == CblasNoTrans )   TR = 'N';
        else if ( Trans == CblasTrans )     TR = 'T';
        else if ( Trans == CblasConjTrans ) TR = 'T';
        else
        {
            cblas_xerbla(3, "cblas_dgemm_pack","Illegal Trans setting, %d\n", Trans);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( Identifier == CblasAMatrix ) ID = 'A';
        else if ( Identifier == CblasBMatrix ) ID = 'B';
        else
        {
            // Identifier is the 2nd argument of this function.
            cblas_xerbla(2, "cblas_dgemm_pack","Illegal Identifier setting, %d\n", Identifier);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TR = C2F_CHAR(&TR);
        F77_ID = C2F_CHAR(&ID);
#endif

        F77_dgemm_pack( F77_ID,
                        F77_TR,
                        &F77_M,
                        &F77_N,
                        &F77_K,
                        &alpha,
                        src, &F77_ld,
                        dest );
    }
    else if ( Order == CblasRowMajor ) // CblasRowMajor
    {
        RowMajorStrg = 1;

        // Row-major input: the transpose flag is toggled before forwarding.
        if      ( Trans == CblasNoTrans )   TR = 'T';
        else if ( Trans == CblasTrans )     TR = 'N';
        else if ( Trans == CblasConjTrans ) TR = 'N';
        else
        {
            cblas_xerbla(3, "cblas_dgemm_pack","Invalid Trans setting, %d\n", Trans);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( Identifier == CblasAMatrix ) ID = 'A';
        else if ( Identifier == CblasBMatrix ) ID = 'B';
        else
        {
            // Identifier is the 2nd argument of this function.
            cblas_xerbla(2, "cblas_dgemm_pack","Illegal Identifier setting, %d\n", Identifier);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TR = C2F_CHAR(&TR);
        F77_ID = C2F_CHAR(&ID);
#endif

        F77_dgemm_pack ( F77_ID,
                         F77_TR,
                         &F77_M,
                         &F77_N,
                         &F77_K,
                         &alpha,
                         src, &F77_ld,
                         dest );
    }
    else cblas_xerbla(1, "cblas_dgemm_pack", "Invalid Order setting, %d\n", Order);

    // Common exit: clear the call-tracking flags.
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
}
#endif

View File

@@ -0,0 +1,83 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
#include "cblas.h"
#include "cblas_f77.h"
/*
 * cblas_dgemm_pack_get_size
 *
 * CBLAS wrapper around F77_dgemm_pack_get_size (double precision).
 *
 * Identifier selects the matrix the query is made for (CblasAMatrix or
 * CblasBMatrix) and is forwarded as 'A' or 'B'; M, N and K are forwarded
 * unchanged.
 *
 * Returns the value produced by F77_dgemm_pack_get_size (presumably a
 * size in bytes -- confirm against the implementation), or 0 after
 * reporting through cblas_xerbla when Identifier is not recognised.
 */
f77_int cblas_dgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier,
                                   const f77_int M,
                                   const f77_int N,
                                   const f77_int K )
{
    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 );

    char ID;

#ifdef F77_CHAR
    F77_CHAR F77_ID;
#else
    #define F77_ID &ID
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K;
#else
    #define F77_M M
    #define F77_N N
    #define F77_K K
#endif

    // Translate the matrix selector; anything else is an argument error.
    switch ( Identifier )
    {
        case CblasAMatrix:
            ID = 'A';
            break;

        case CblasBMatrix:
            ID = 'B';
            break;

        default:
            cblas_xerbla( 1, "cblas_dgemm_pack_get_size",
                          "Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier );
            AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
            return 0;
    }

#ifdef F77_CHAR
    F77_ID = C2F_CHAR( &ID );
#endif

    const f77_int pack_size =
        F77_dgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K );

    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
    return pack_size;
}
#endif

View File

@@ -202,6 +202,14 @@
#define F77_cgemm_batch cgemm_batch
#define F77_zgemm_batch zgemm_batch
// -- Pack-Compute APIs --
// Map the F77_* names of the s/d gemm pack_get_size, pack and compute
// routines onto the corresponding *_blis_impl symbols.
#define F77_sgemm_pack_get_size sgemm_pack_get_size_blis_impl
#define F77_dgemm_pack_get_size dgemm_pack_get_size_blis_impl
#define F77_sgemm_pack sgemm_pack_blis_impl
#define F77_dgemm_pack dgemm_pack_blis_impl
#define F77_sgemm_compute sgemm_compute_blis_impl
#define F77_dgemm_compute dgemm_compute_blis_impl
// (BLIS_ENABLE_NO_UNDERSCORE_API) ends
#else
/*
@@ -389,6 +397,14 @@
#define F77_dgemm_batch dgemm_batch_
#define F77_cgemm_batch cgemm_batch_
#define F77_zgemm_batch zgemm_batch_
// -- Pack-Compute APIs --
// Map the F77_* names of the s/d gemm pack_get_size, pack and compute
// routines onto the corresponding *_blis_impl symbols. Unlike the
// *_batch_ names just above, these targets carry no trailing underscore.
#define F77_sgemm_pack_get_size sgemm_pack_get_size_blis_impl
#define F77_dgemm_pack_get_size dgemm_pack_get_size_blis_impl
#define F77_sgemm_pack sgemm_pack_blis_impl
#define F77_dgemm_pack dgemm_pack_blis_impl
#define F77_sgemm_compute sgemm_compute_blis_impl
#define F77_dgemm_compute dgemm_compute_blis_impl
#endif
#endif /* CBLAS_F77_H */

View File

@@ -0,0 +1,171 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
#include "cblas.h"
#include "cblas_f77.h"
/*
 * cblas_sgemm_compute
 *
 * CBLAS wrapper around F77_sgemm_compute (single precision). It validates
 * Order, TransA and TransB, converts the transpose selectors into the
 * one-character flags 'N', 'T' or 'P' (packed), and forwards the call
 * together with explicit row/column strides for A, B and C.
 *
 * Parameters
 *   Order       CblasColMajor or CblasRowMajor.
 *   TransA      CblasNoTrans, CblasTrans, CblasConjTrans or CblasPacked.
 *   TransB      Same set of values, applied to B.
 *   M, N, K     Problem dimensions, forwarded unchanged.
 *   A, lda      Matrix A (or packed buffer) and its leading dimension.
 *   B, ldb      Matrix B (or packed buffer) and its leading dimension.
 *   beta        Scalar applied to C.
 *   C, ldc      Output matrix and its leading dimension.
 *
 * Error handling
 *   An unrecognised Order, TransA or TransB is reported through
 *   cblas_xerbla using the argument's position in this signature (1, 2
 *   or 3) and the function returns without calling F77_sgemm_compute.
 *
 * Global state
 *   CBLAS_CallFromC and RowMajorStrg are set while the call is in progress
 *   and are cleared again on every exit path, including the successful one.
 */
BLIS_EXPORT_BLAS void cblas_sgemm_compute( enum CBLAS_ORDER Order,
                                           f77_int TransA,
                                           f77_int TransB,
                                           const f77_int M,
                                           const f77_int N,
                                           const f77_int K,
                                           const float* A, f77_int lda,
                                           const float* B, f77_int ldb,
                                           float beta,
                                           float* C, f77_int ldc)
{
    char TA, TB;
#ifdef F77_CHAR
    F77_CHAR F77_TA, F77_TB;
#else
    #define F77_TA &TA
    #define F77_TB &TB
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
    F77_INT F77_ldc=ldc;
#else
    #define F77_M M
    #define F77_N N
    #define F77_K K
    #define F77_lda lda
    #define F77_ldb ldb
    #define F77_ldc ldc
#endif

    extern int CBLAS_CallFromC;
    extern int RowMajorStrg;

    RowMajorStrg = 0;
    CBLAS_CallFromC = 1;

    if( Order == CblasColMajor ) // CblasColMajor
    {
        // Real data: conjugate-transpose is passed on as a plain transpose.
        if      ( TransA == CblasTrans )     TA='T';
        else if ( TransA == CblasConjTrans ) TA='T';
        else if ( TransA == CblasNoTrans )   TA='N';
        else if ( TransA == CblasPacked )    TA='P';
        else
        {
            cblas_xerbla(2, "cblas_sgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( TransB == CblasTrans )     TB='T';
        else if ( TransB == CblasConjTrans ) TB='T';
        else if ( TransB == CblasNoTrans )   TB='N';
        else if ( TransB == CblasPacked )    TB='P';
        else
        {
            cblas_xerbla(3, "cblas_sgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        // Column-major: unit row stride everywhere, leading dimension as
        // the column stride.
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int rs_c = 1;

        F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc);
    }
    else if ( Order == CblasRowMajor ) // CblasRowMajor
    {
        RowMajorStrg = 1;

        // Row-major, A not already packed: toggle the transpose flag so the
        // row-major buffer can be passed with the same (1, lda) strides.
        if      ( TransA == CblasPacked )    TA='P';
        else if ( TransA == CblasTrans )     TA='N';
        else if ( TransA == CblasNoTrans )   TA='T';
        else if ( TransA == CblasConjTrans ) TA='N';
        else
        {
            cblas_xerbla(2, "cblas_sgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        // Row-major, B not already packed: same toggling as for A.
        if      ( TransB == CblasPacked )    TB='P';
        else if ( TransB == CblasTrans )     TB='N';
        else if ( TransB == CblasNoTrans )   TB='T';
        else if ( TransB == CblasConjTrans ) TB='N';
        else
        {
            // TransB is the 3rd argument, matching the column-major branch.
            cblas_xerbla(3, "cblas_sgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        // C is passed with row stride ldc and unit column stride.
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int cs_c = 1;

        F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c);
    }
    else
    {
        cblas_xerbla(1, "cblas_sgemm_compute",
                     "Illegal Order setting, %d\n", Order);
    }

    // Clear the call-tracking flags on every path, including success, so
    // they do not stay set for subsequent calls.
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
}
#endif

View File

@@ -0,0 +1,157 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
#include "cblas.h"
#include "cblas_f77.h"
/*
 * cblas_sgemm_pack
 *
 * CBLAS wrapper around F77_sgemm_pack (single precision). It validates
 * Order, Trans and Identifier, converts them into the one-character flags
 * expected by F77_sgemm_pack and forwards the call.
 *
 * Parameters
 *   Order       CblasColMajor or CblasRowMajor.
 *   Identifier  CblasAMatrix or CblasBMatrix; forwarded as 'A' or 'B'.
 *   Trans       CblasNoTrans, CblasTrans or CblasConjTrans.
 *   M, N, K     Problem dimensions, forwarded unchanged.
 *   alpha       Scalar forwarded to the pack routine.
 *   src, ld     Source matrix and its leading dimension.
 *   dest        Destination buffer for the packed matrix.
 *
 * Error handling
 *   An unrecognised Order, Identifier or Trans is reported through
 *   cblas_xerbla using the argument's position in this signature (1, 2
 *   or 3) and the function returns without calling F77_sgemm_pack.
 *
 * Global state
 *   CBLAS_CallFromC and RowMajorStrg are set while the call is in progress
 *   and cleared again on every exit path.
 */
BLIS_EXPORT_BLAS void cblas_sgemm_pack( enum CBLAS_ORDER Order,
                                        enum CBLAS_IDENTIFIER Identifier,
                                        enum CBLAS_TRANSPOSE Trans,
                                        const f77_int M,
                                        const f77_int N,
                                        const f77_int K,
                                        const float alpha,
                                        const float* src, const f77_int ld,
                                        float* dest )
{
    char TR;
    char ID;
#ifdef F77_CHAR
    F77_CHAR F77_TR;
    F77_CHAR F77_ID;
#else
    #define F77_TR &TR
    #define F77_ID &ID
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld;
#else
    #define F77_M M
    #define F77_N N
    #define F77_K K
    #define F77_ld ld
#endif

    extern int CBLAS_CallFromC;
    extern int RowMajorStrg;

    RowMajorStrg = 0;
    CBLAS_CallFromC = 1;

    if ( Order == CblasColMajor ) // CblasColMajor
    {
        // Real data: conjugate-transpose is passed on as a plain transpose.
        if      ( Trans == CblasNoTrans )   TR = 'N';
        else if ( Trans == CblasTrans )     TR = 'T';
        else if ( Trans == CblasConjTrans ) TR = 'T';
        else
        {
            cblas_xerbla(3, "cblas_sgemm_pack","Illegal Trans setting, %d\n", Trans);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( Identifier == CblasAMatrix ) ID = 'A';
        else if ( Identifier == CblasBMatrix ) ID = 'B';
        else
        {
            // Identifier is the 2nd argument of this function.
            cblas_xerbla(2, "cblas_sgemm_pack","Illegal Identifier setting, %d\n", Identifier);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TR = C2F_CHAR(&TR);
        F77_ID = C2F_CHAR(&ID);
#endif

        F77_sgemm_pack( F77_ID,
                        F77_TR,
                        &F77_M,
                        &F77_N,
                        &F77_K,
                        &alpha,
                        src, &F77_ld,
                        dest );
    }
    else if ( Order == CblasRowMajor ) // CblasRowMajor
    {
        RowMajorStrg = 1;

        // Row-major input: the transpose flag is toggled before forwarding.
        if      ( Trans == CblasNoTrans )   TR = 'T';
        else if ( Trans == CblasTrans )     TR = 'N';
        else if ( Trans == CblasConjTrans ) TR = 'N';
        else
        {
            cblas_xerbla(3, "cblas_sgemm_pack","Invalid Trans setting, %d\n", Trans);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( Identifier == CblasAMatrix ) ID = 'A';
        else if ( Identifier == CblasBMatrix ) ID = 'B';
        else
        {
            // Identifier is the 2nd argument of this function.
            cblas_xerbla(2, "cblas_sgemm_pack","Illegal Identifier setting, %d\n", Identifier);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TR = C2F_CHAR(&TR);
        F77_ID = C2F_CHAR(&ID);
#endif

        F77_sgemm_pack ( F77_ID,
                         F77_TR,
                         &F77_M,
                         &F77_N,
                         &F77_K,
                         &alpha,
                         src, &F77_ld,
                         dest );
    }
    else cblas_xerbla(1, "cblas_sgemm_pack", "Invalid Order setting, %d\n", Order);

    // Common exit: clear the call-tracking flags.
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
}
#endif

View File

@@ -0,0 +1,83 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
#include "cblas.h"
#include "cblas_f77.h"
/*
 * cblas_sgemm_pack_get_size
 *
 * CBLAS wrapper around F77_sgemm_pack_get_size (single precision).
 *
 * Identifier selects the matrix the query is made for (CblasAMatrix or
 * CblasBMatrix) and is forwarded as 'A' or 'B'; M, N and K are forwarded
 * unchanged.
 *
 * Returns the value produced by F77_sgemm_pack_get_size (presumably a
 * size in bytes -- confirm against the implementation), or 0 after
 * reporting through cblas_xerbla when Identifier is not recognised.
 */
f77_int cblas_sgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier,
                                   const f77_int M,
                                   const f77_int N,
                                   const f77_int K )
{
    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 );

    char ID;

#ifdef F77_CHAR
    F77_CHAR F77_ID;
#else
    #define F77_ID &ID
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K;
#else
    #define F77_M M
    #define F77_N N
    #define F77_K K
#endif

    // Translate the matrix selector; anything else is an argument error.
    switch ( Identifier )
    {
        case CblasAMatrix:
            ID = 'A';
            break;

        case CblasBMatrix:
            ID = 'B';
            break;

        default:
            cblas_xerbla( 1, "cblas_sgemm_pack_get_size",
                          "Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier );
            AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
            return 0;
    }

#ifdef F77_CHAR
    F77_ID = C2F_CHAR( &ID );
#endif

    const f77_int pack_size =
        F77_sgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K );

    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
    return pack_size;
}
#endif

View File

@@ -1,4 +1,4 @@
##Copyright (C) 2020, Advanced Micro Devices, Inc.##
##Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. ##
target_sources("${PROJECT_NAME}"
PRIVATE
@@ -23,8 +23,5 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_trmv_check.h
${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_check.h
${CMAKE_CURRENT_SOURCE_DIR}/bla_trsv_check.h
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm3m_check.h
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute_check.h
)

View File

@@ -0,0 +1,87 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * bla_gemm_compute_check
 *
 * Argument validation for the gemm_compute entry points. transa/transb are
 * character flags ('N', 'C', 'T' or 'P' for packed); m, n, k, lda, ldb,
 * rs_c and cs_c are pointers to integers.
 *
 * The first failing check sets info as follows:
 *    1  transa is not one of N/C/T/P
 *    2  transb is not one of N/C/T/P
 *    3  *m < 0
 *    4  *n < 0
 *    5  *k < 0
 *    7  A not packed and *lda < max(1, rows of A as stored)
 *    9  B not packed and *ldb < max(1, rows of B as stored)
 *   12  C has a unit stride but satisfies neither
 *         (*rs_c == 1 and *cs_c >= max(1, *m))  nor
 *         (*cs_c == 1 and *rs_c >= max(1, *n))
 *
 * When info is non-zero the upper-cased routine name built from dt_str and
 * op_str is passed to PASTE_XERBLA and the macro executes `return;`, so it
 * can only be expanded inside a function returning void.
 */
#define bla_gemm_compute_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, rs_c, cs_c ) \
{ \
	f77_int info = 0; \
	f77_int nota, notb; \
	f77_int conja, conjb; \
	f77_int ta, tb; \
	f77_int packa, packb; \
	f77_int nrowa, nrowb; \
\
	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb  = PASTE_LSAME( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTE_LSAME( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb    = PASTE_LSAME( transb, "T", (ftnlen)1, (ftnlen)1 ); \
	packa = PASTE_LSAME( transa, "P", (ftnlen)1, (ftnlen)1 ); \
	packb = PASTE_LSAME( transb, "P", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota || packa ) { nrowa = *m; } \
	else                 { nrowa = *k; } \
	if ( notb || packb ) { nrowb = *k; } \
	else                 { nrowb = *n; } \
\
	if      ( !nota && !conja && !ta && !packa ) \
		info = 1; \
	else if ( !notb && !conjb && !tb && !packb ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( !packa && *lda < bli_max( 1, nrowa ) ) /* lda is ignored when A is packed. */ \
		info = 7; \
	else if ( !packb && *ldb < bli_max( 1, nrowb ) ) /* ldb is ignored when B is packed. */ \
		info = 9; \
	/* With rs_c == cs_c == 1 the layout is ambiguous; flag C only when */ \
	/* it is valid under neither the column- nor the row-stored reading, */ \
	/* so that e.g. m == 1 with unit strides is not rejected. */ \
	else if ( ( *rs_c == 1 || *cs_c == 1 ) && \
	          !( *rs_c == 1 && *cs_c >= bli_max( 1, *m ) ) && \
	          !( *cs_c == 1 && *rs_c >= bli_max( 1, *n ) ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		/* Bounded write: the name is truncated rather than overflowing */ \
		/* func_str if dt_str + op_str is longer than the buffer. */ \
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

View File

@@ -287,6 +287,8 @@
#define dgemm_batch_ dgemm_batch
#define cgemm_batch_ cgemm_batch
#define zgemm_batch_ zgemm_batch
#define sgemm_compute_ sgemm_compute
#define dgemm_compute_ dgemm_compute
#define saxpby_ saxpby
#define daxpby_ daxpby
#define caxpby_ caxpby
@@ -391,6 +393,7 @@
#define dgbmv DGBMV
#define dgemm DGEMM
#define dgemm_batch DGEMM_BATCH
#define dgemm_compute DGEMM_COMPUTE
#define dgemmt DGEMMT
#define dgemv DGEMV
#define dger DGER
@@ -464,6 +467,7 @@
#define sgbmv SGBMV
#define sgemm SGEMM
#define sgemm_batch SGEMM_BATCH
#define sgemm_compute SGEMM_COMPUTE
#define sgemmt SGEMMT
#define sgemv SGEMV
#define sger SGER

View File

@@ -470,7 +470,8 @@ typedef enum
BLIS_NO_TRANSPOSE = 0x0,
BLIS_TRANSPOSE = BLIS_BITVAL_TRANS,
BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS
BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS,
BLIS_PACKED = BLIS_BITVAL_PACKED_UNSPEC
} trans_t;
typedef enum

View File

@@ -2,6 +2,8 @@
target_sources("${PROJECT_NAME}"
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_openmp.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_single.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_openmp.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_pthreads.c
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_single.c

View File

@@ -0,0 +1,67 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_COMPUTE_DECOR_H
#define BLIS_L3_COMPUTE_DECOR_H
// Level-3 compute internal function type.
// Pointer to the routine that the compute thread decorator invokes to carry
// out the operation. The routine receives the operand objects (a, b, beta, c),
// the context and runtime objects, and the thrinfo_t of the calling thread,
// and reports its status as an err_t.
typedef err_t (*l3computeint_t)
(
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Level-3 compute thread decorator prototype.
// Sets up the per-call allocator/thread state in rntm and invokes func with
// ( a, b, beta, c, cntx, rntm, thread ).
//   func   - internal compute routine to run (see l3computeint_t).
//   family - operation family id.
//   a, b, beta, c - operand objects forwarded unchanged to func.
//   cntx   - context forwarded unchanged to func.
//   rntm   - runtime object; updated by the decorator before func is called.
// Returns an err_t status.
// NOTE(review): implementations are selected at build time by the headers
// included below; the pthreads header include is commented out, so confirm
// whether a pthreads-enabled build gets a definition of this function.
err_t bli_l3_compute_thread_decorator
(
l3computeint_t func,
opid_t family,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
#include "bli_l3_compute_decor_single.h"
#include "bli_l3_compute_decor_openmp.h"
// #include "bli_l3_compute_decor_pthreads.h"
#endif

View File

@@ -0,0 +1,133 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// @note: Presently MT is not supported, so n_threads has been explicitly
// initialized to 1 while initializing. Thus, even if BLIS is built with OpenMP
// support, the compute APIs work as an ST implementation.
#include "blis.h"
#ifdef BLIS_ENABLE_OPENMP
// Thread entry point stub for the compute code path. It performs no work:
// the argument is ignored and NULL is always returned.
void* bli_l3_compute_thread_entry( void* data_void )
{
	// The argument is intentionally unused.
	( void )data_void;

	return NULL;
}
// OpenMP implementation of the level-3 compute thread decorator.
// Checks out a small block allocator array sized for the thread count stored
// in rntm, creates a global communicator, and then runs func once per OpenMP
// thread with a thread-local copy of the rntm and a freshly created root
// thrinfo_t.
//   func   - internal compute routine executed by every thread.
//   family - operation family id; not referenced by this implementation.
//   a, b, beta, c, cntx - forwarded unchanged to func.
//   rntm   - runtime object; its sba pool and pba fields are set here.
// Always returns BLIS_SUCCESS; the err_t returned by func is discarded.
err_t bli_l3_compute_thread_decorator
(
l3computeint_t func,
opid_t family,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
)
{
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_pba_rntm_set_pba( rntm );
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
thrinfo_t* thread = NULL;
// Create the root node of the thread's thrinfo_t structure.
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
// Run the compute routine on this thread. Its err_t result is not
// inspected.
func
(
a,
b,
beta,
c,
cntx,
rntm_p,
thread
);
// Free the current thread's thrinfo_t structure.
bli_l3_sup_thrinfo_free( rntm_p, thread );
}
// We shouldn't free the global communicator here since it is expected to
// have been freed already by the global communicator's chief thread in
// bli_l3_sup_thrinfo_free() (called inside the parallel region above).
// NOTE(review): confirm that bli_l3_sup_thrinfo_free() releases gl_comm.
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
return BLIS_SUCCESS;
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Include guard for the level-3 compute OpenMP decorator header. The guard
// name is specific to the compute code path so that it cannot collide with
// the guard of the sup decorator's OpenMP header; a shared guard name would
// cause whichever header is included second to be skipped entirely.
#ifndef BLIS_L3_COMPUTE_DECOR_OPENMP_H
#define BLIS_L3_COMPUTE_DECOR_OPENMP_H
// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP
// Currently empty: no OpenMP-specific declarations are defined here.
#endif
#endif

View File

@@ -0,0 +1,87 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifndef BLIS_ENABLE_MULTITHREADING
// Single-threaded implementation of the level-3 compute thread decorator.
// Prepares the allocator fields of rntm for one thread, runs func once on the
// calling thread, and releases the checked-out allocator array.
//   func   - internal compute routine to execute.
//   family - operation family id; not referenced by this implementation.
//   a, b, beta, c, cntx - forwarded unchanged to func.
//   rntm   - runtime object; its sba pool and pba fields are set here and it
//            is passed to func directly (no local copy is made).
// Always returns BLIS_SUCCESS; the err_t returned by func is discarded.
err_t bli_l3_compute_thread_decorator
     (
       l3computeint_t func,
       opid_t         family,
       obj_t*         a,
       obj_t*         b,
       obj_t*         beta,
       obj_t*         c,
       cntx_t*        cntx,
       rntm_t*        rntm
     )
{
	// Check out a small block allocator array sized for exactly one thread.
	array_t* restrict sba_array = bli_sba_checkout_array( 1 );

	// Attach thread 0's pool and the packing block allocator to the rntm.
	bli_sba_rntm_set_pool( 0, sba_array, rntm );
	bli_pba_rntm_set_pba( rntm );

	// Use the global single-threaded thrinfo_t object instead of creating
	// a thrinfo_t for this call.
	thrinfo_t* root_thread = &BLIS_GEMM_SINGLE_THREADED;

	// Execute the compute routine on the calling thread.
	func( a, b, beta, c, cntx, rntm, root_thread );

	// Return the array to the small block allocator.
	bli_sba_checkin_array( sba_array );

	return BLIS_SUCCESS;
}
#endif

View File

@@ -0,0 +1,43 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Include guard for the single-threaded level-3 compute decorator header.
#ifndef BLIS_L3_COMPUTE_DECOR_SINGLE_H
#define BLIS_L3_COMPUTE_DECOR_SINGLE_H
// Definitions specific to situations when multithreading is disabled.
#ifndef BLIS_ENABLE_MULTITHREADING
// Currently empty: no single-thread-specific declarations are defined here.
#endif
#endif

View File

@@ -54,7 +54,11 @@ void bli_pack_full_thread_decorator
/* Ensure n_threads is always greater than or equal to 1 */
/* Passing BLIS_IC_NT and BLIS_JC_NT for pack can lead to n_threads */
/* becoming negative. In that case, packing is done using 1 thread */
n_threads = ( n_threads > 0 ) ? n_threads : 1;
// n_threads = ( n_threads > 0 ) ? n_threads : 1;
// Explicitly setting n_threads = 1 to force packing with only a single
// thread.
n_threads = 1;
_Pragma( "omp parallel num_threads(n_threads)" )
{

View File

@@ -60,6 +60,9 @@
// Include the pack full thread decorator and related definitions and prototypes
// for the pack code path.
#include "bli_pack_full_decor.h"
// Include the level-3 thread decorator and related definitions and prototypes
// for the compute code path.
#include "bli_l3_compute_decor.h"
// Initialization-related prototypes.
void bli_thread_init( void );