mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
BLAS Extension API - ?gemm_compute()
- Added support for 2 new APIs: 1. sgemm_compute() 2. dgemm_compute(). These depend on the ?gemm_pack_get_size() and ?gemm_pack() APIs. - ?gemm_compute() takes the packed matrix buffer (represented by the packed matrix identifier) and performs the GEMM operation: C := A * B + beta * C. - Whenever the kernel storage preference and the matrix storage scheme do not match, and the matrix being loaded is not packed either, that matrix is packed on the fly. - Note: If both matrices are packed using the ?gemm_pack() API, it is the responsibility of the user to pack only one matrix with the alpha scalar and the other with a unit scalar. - Note: Support is presently limited to a single thread only. Both the pack and compute APIs are forced to take n_threads=1. AMD-Internal: [CPUPL-3560] Change-Id: I825d98a0a5038d31668d2a4b84b3ccc204e6c158
This commit is contained in:
committed by
Arnav Sharma
parent
81161066e5
commit
c8f14edcf5
@@ -6,7 +6,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -193,7 +193,8 @@ blis: \
|
||||
bench_amaxv_blis.x \
|
||||
bench_copyv_blis.x \
|
||||
bench_swapv_blis.x \
|
||||
bench_axpbyv_blis.x
|
||||
bench_axpbyv_blis.x \
|
||||
bench_gemm_pack_compute_blis.x
|
||||
|
||||
openblas: \
|
||||
bench_gemm_openblas.x \
|
||||
@@ -240,7 +241,8 @@ mkl: \
|
||||
bench_amaxv_mkl.x \
|
||||
bench_copyv_mkl.x \
|
||||
bench_swapv_mkl.x \
|
||||
bench_axpbyv_mkl.x
|
||||
bench_axpbyv_mkl.x \
|
||||
bench_gemm_pack_compute_mkl.x
|
||||
|
||||
|
||||
# --Object file rules --
|
||||
|
||||
930
bench/bench_gemm_pack_compute.c
Executable file
930
bench/bench_gemm_pack_compute.c
Executable file
@@ -0,0 +1,930 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
// Benchmark application to process AOCL logs generated by the BLIS library.
|
||||
#ifndef DT
|
||||
#define DT BLIS_DOUBLE
|
||||
#endif
|
||||
|
||||
#ifndef IND
|
||||
#define IND BLIS_NAT
|
||||
#endif
|
||||
|
||||
#ifndef N_REPEAT
|
||||
//#define N_REPEAT 100
|
||||
#endif
|
||||
|
||||
|
||||
#define AOCL_MATRIX_INITIALISATION
|
||||
#define BUFFER_SIZE 256
|
||||
|
||||
/* For BLIS since logs are collected at BLAS interfaces
|
||||
* we disable cblas interfaces for this benchmark application
|
||||
*/
|
||||
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
// #define CBLAS
|
||||
#endif
|
||||
|
||||
// #define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta, alpha_one;
|
||||
dim_t m, n, k;
|
||||
dim_t p_inc = 0; // to keep track of number of inputs
|
||||
num_t dt;
|
||||
// ind_t ind;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
int packA, packB;
|
||||
|
||||
FILE* fin = NULL;
|
||||
FILE* fout = NULL;
|
||||
|
||||
n_repeats = N_REPEAT; // This macro will get from Makefile.
|
||||
|
||||
dt = DT;
|
||||
|
||||
if (argc < 3)
|
||||
{
|
||||
printf("Usage: ./test_gemm_pack_compute_XX.x input.csv output.csv\n");
|
||||
exit(1);
|
||||
}
|
||||
fin = fopen(argv[1], "r");
|
||||
if (fin == NULL)
|
||||
{
|
||||
printf("Error opening the file %s\n", argv[1]);
|
||||
exit(1);
|
||||
}
|
||||
fout = fopen(argv[2], "w");
|
||||
if (fout == NULL)
|
||||
{
|
||||
printf("Error opening output file %s\n", argv[2]);
|
||||
exit(1);
|
||||
}
|
||||
if (argc > 3)
|
||||
{
|
||||
n_repeats = atoi(argv[3]);
|
||||
}
|
||||
|
||||
fprintf(fout, "Dt transa transb identifier m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n");
|
||||
|
||||
// Following variables are needed for scanf to read inputs properly
|
||||
// however they are not used in bench.
|
||||
char api_name[BUFFER_SIZE]; // to store function name, line no present in logs
|
||||
char dummy_buffer[BUFFER_SIZE];
|
||||
|
||||
// Variables extracted from the logs which are used by bench
|
||||
char stor_scheme, transA_c, transB_c, packA_c, packB_c;
|
||||
double alpha_r, beta_r, alpha_i, beta_i;
|
||||
dim_t m_trans, n_trans;
|
||||
inc_t lda, ldb, ldc;
|
||||
|
||||
stor_scheme = 'C'; // By default set it to Column Major
|
||||
|
||||
//{S, D, C, Z} transa, transb, packA, packB, m, n, k, alpha_real,
|
||||
// alpha_imag, lda ldb, beta_real, beta_imag, ldc,
|
||||
//
|
||||
// number of threads, execution time, gflops ---> ignored by bench
|
||||
while (fscanf(fin, "%s %c %c %c %c %c " INT_FS INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]",
|
||||
api_name, &dt_ch, &transA_c, &transB_c, &packA_c, &packB_c, &m, &n, &k, &alpha_r, &alpha_i,
|
||||
&lda, &ldb, &beta_r, &beta_i, &ldc) == 16)
|
||||
{
|
||||
// Discard any extra data on current line in the input file.
|
||||
fgets(dummy_buffer, BUFFER_SIZE, fin );
|
||||
|
||||
// At BLAS level only column major order is supported.
|
||||
stor_scheme = 'C';
|
||||
|
||||
if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
|
||||
else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT;
|
||||
else
|
||||
{
|
||||
printf("Invalid data type %c\n", dt_ch);
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( transA_c == 'n' || transA_c == 'N' ) transa = BLIS_NO_TRANSPOSE;
|
||||
else if ( transA_c == 't' || transA_c == 'T' ) transa = BLIS_TRANSPOSE;
|
||||
else if ( transA_c == 'c' || transA_c == 'C' ) transa = BLIS_CONJ_TRANSPOSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for transA \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( transB_c == 'n' || transB_c == 'N' ) transb = BLIS_NO_TRANSPOSE;
|
||||
else if ( transB_c == 't' || transB_c == 'T' ) transb = BLIS_TRANSPOSE;
|
||||
else if ( transB_c == 'c' || transB_c == 'C' ) transb = BLIS_CONJ_TRANSPOSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for transB \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( packA_c == 'p' || packA_c == 'P' ) packA = TRUE;
|
||||
else if ( packA_c == 'u' || packA_c == 'U' ) packA = FALSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for packA \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( packB_c == 'p' || packB_c == 'P') packB = TRUE;
|
||||
else if ( packB_c == 'u' || packB_c == 'U') packB = FALSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid option for packB \n");
|
||||
continue;
|
||||
}
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha);
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha_one);
|
||||
|
||||
if( (stor_scheme == 'C') || (stor_scheme == 'c') )
|
||||
{
|
||||
// leading dimension should be greater than number of rows
|
||||
// if ((m > lda) || (k > ldb) || (m > ldc)) continue;
|
||||
// Since this bench app is run on logs generated by AOCL trace logs
|
||||
// - we have relaxed the checks on the input parameters.
|
||||
|
||||
// if A is transpose - A(lda x m), lda >= max(1,k)
|
||||
// if A is non-transpose - A (lda x k), lda >= max(1,m)
|
||||
// if B is transpose - B (ldb x k), ldb >= max(1,n)
|
||||
// if B is non-transpose - B (ldb x n), ldb >= max(1,k)
|
||||
// C is ldc x n - ldc >= max(1, m)
|
||||
//if(transa) lda = k; // We will end up overwriting lda
|
||||
bli_set_dims_with_trans( transa, m, k, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, 1, lda, &a);
|
||||
|
||||
//if(transb) ldb = n; // we will end up overwriting ldb, ldb >= n
|
||||
bli_set_dims_with_trans( transb, k, n, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, 1, ldb, &b);
|
||||
|
||||
bli_obj_create( dt, m, n, 1, ldc, &c);
|
||||
bli_obj_create( dt, m, n, 1, ldc, &c_save );
|
||||
}
|
||||
else if( (stor_scheme == 'r') || (stor_scheme == 'R') )
|
||||
{
|
||||
//leading dimension should be greater than number of columns
|
||||
//if ((k > lda) || (n > ldb) || (n > ldc)) continue;
|
||||
// Since this bench app is run on logs generated by AOCL trace logs
|
||||
// - we have relaxed the checks on the input parameters.
|
||||
|
||||
// if A is transpose - A(k x lda), lda >= max(1,m)
|
||||
// if A is non-transpose - A (m x lda), lda >= max(1,k)
|
||||
// if B is transpose - B (n x ldb), ldb >= max(1,k)
|
||||
// if B is non-transpose - B (k x ldb ), ldb >= max(1,n)
|
||||
// C is m x ldc - ldc >= max(1, n)
|
||||
|
||||
//if(transa) lda = m; // this will overwrite lda
|
||||
bli_set_dims_with_trans(transa, m, k, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, lda, 1, &a);
|
||||
|
||||
//if(transb) ldb = k; // this will overwrite ldb
|
||||
bli_set_dims_with_trans(transb, k, n, &m_trans, &n_trans);
|
||||
bli_obj_create( dt, m_trans, n_trans, ldb, 1, &b);
|
||||
|
||||
bli_obj_create( dt, m, n, ldc, 1, &c);
|
||||
bli_obj_create( dt, m, n, ldc, 1, &c_save );
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Invalid storage scheme\n");
|
||||
continue;
|
||||
}
|
||||
#ifndef BLIS // Incase if we are using blis interface we don't have to check for col-storage.
|
||||
#ifndef CBLAS
|
||||
if( ( stor_scheme == 'R' ) || ( stor_scheme == 'r' ) )
|
||||
{
|
||||
printf("BLAS APIs doesn't support row-storage: Enable CBLAS\n");
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef AOCL_MATRIX_INITIALISATION
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
#endif
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a);
|
||||
bli_obj_set_conjtrans( transb, &b);
|
||||
|
||||
bli_setsc( 1.0, 1.0, &alpha_one );
|
||||
bli_setsc( alpha_r, alpha_i, &alpha );
|
||||
bli_setsc( beta_r, beta_i, &beta );
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.6f", "" );
|
||||
bli_printm( "b", &b, "%4.6f", "" );
|
||||
bli_printm( "c", &c, "%4.6f", "" );
|
||||
#endif
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
printf( "BLAS Extension APIs don't have a BLIS interface."
|
||||
"Enable CBLAS or BLAS interface!\n" );
|
||||
|
||||
#else
|
||||
|
||||
#ifdef CBLAS
|
||||
enum CBLAS_ORDER cblas_order;
|
||||
enum CBLAS_TRANSPOSE cblas_transa;
|
||||
enum CBLAS_TRANSPOSE cblas_transb;
|
||||
enum CBLAS_IDENTIFIER cblas_identifierA;
|
||||
enum CBLAS_IDENTIFIER cblas_identifierB;
|
||||
|
||||
size_t bufSizeA;
|
||||
size_t bufSizeB;
|
||||
|
||||
if ( ( stor_scheme == 'C' ) || ( stor_scheme == 'c' ) )
|
||||
cblas_order = CblasColMajor;
|
||||
else
|
||||
cblas_order = CblasRowMajor;
|
||||
|
||||
if( bli_is_trans( transa ) )
|
||||
cblas_transa = CblasTrans;
|
||||
else if( bli_is_conjtrans( transa ) )
|
||||
cblas_transa = CblasConjTrans;
|
||||
else
|
||||
cblas_transa = CblasNoTrans;
|
||||
|
||||
if( bli_is_trans( transb ) )
|
||||
cblas_transb = CblasTrans;
|
||||
else if( bli_is_conjtrans( transb ) )
|
||||
cblas_transb = CblasConjTrans;
|
||||
else
|
||||
cblas_transb = CblasNoTrans;
|
||||
|
||||
if ( packA )
|
||||
cblas_identifierA = CblasAMatrix;
|
||||
|
||||
if ( packB )
|
||||
cblas_identifierB = CblasBMatrix;
|
||||
#else
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
f77_char f77_identifierA;
|
||||
f77_char f77_identifierB;
|
||||
f77_int f77_bufSizeA;
|
||||
f77_int f77_bufSizeB;
|
||||
|
||||
f77_char f77_packed = 'P';
|
||||
f77_identifierA = 'A';
|
||||
f77_identifierB = 'B';
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
err_t err = BLIS_SUCCESS;
|
||||
|
||||
#endif
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
|
||||
float* alphaonep = bli_obj_buffer( &alpha_one );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
#ifdef CBLAS
|
||||
float* aBuffer;
|
||||
float* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
bufSizeA = cblas_sgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (float*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
bufSizeB = cblas_sgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
bBuffer = (float*) bli_malloc_user( bufSizeB, &err );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
cblas_sgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphaonep,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is pre-packed.
|
||||
cblas_sgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
}
|
||||
#else // -- BLAS API --
|
||||
float* aBuffer;
|
||||
float* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
sgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
sgemm_compute_( &f77_packed,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
bli_free_user( aBuffer );
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
sgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
sgemm_compute_( &f77_transa,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
bli_free_user( bBuffer );
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
f77_bufSizeB = sgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
|
||||
bBuffer = (float*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
f77_bufSizeA = sgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
|
||||
aBuffer = (float*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
sgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
sgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphaonep,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
sgemm_compute_( &f77_packed,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is reordered.
|
||||
sgemm_compute_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* alphaonep = bli_obj_buffer( &alpha_one );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
#ifdef CBLAS
|
||||
double* aBuffer;
|
||||
double* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
bufSizeA = cblas_dgemm_pack_get_size( CblasAMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
aBuffer = (double*) bli_malloc_user( bufSizeA, &err );
|
||||
|
||||
bufSizeB = cblas_dgemm_pack_get_size( CblasBMatrix,
|
||||
mm,
|
||||
nn,
|
||||
kk );
|
||||
bBuffer = (double*) bli_malloc_user( bufSizeB, &err );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasAMatrix,
|
||||
cblas_transa,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
aBuffer );
|
||||
|
||||
cblas_dgemm_pack( cblas_order,
|
||||
CblasBMatrix,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
bp, ldb,
|
||||
bBuffer );
|
||||
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
CblasPacked,
|
||||
CblasPacked,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
aBuffer, lda,
|
||||
bBuffer, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is pre-packed.
|
||||
cblas_dgemm_compute( cblas_order,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
mm,
|
||||
nn,
|
||||
kk,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
}
|
||||
|
||||
#else // -- BLAS API --
|
||||
double* aBuffer;
|
||||
double* bBuffer;
|
||||
|
||||
if ( packA && !packB )
|
||||
{
|
||||
// Only A is pre-packed.
|
||||
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
dgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
dgemm_compute_( &f77_packed,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
bli_free_user( aBuffer );
|
||||
}
|
||||
else if ( !packA && packB )
|
||||
{
|
||||
// Only B is pre-packed.
|
||||
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
dgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
dgemm_compute_( &f77_transa,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
bli_free_user( bBuffer );
|
||||
}
|
||||
else if ( packA && packB )
|
||||
{
|
||||
// Both A & B are pre-packed.
|
||||
f77_bufSizeA = dgemm_pack_get_size_( &f77_identifierA,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
aBuffer = (double*) bli_malloc_user( f77_bufSizeA, &err );
|
||||
|
||||
f77_bufSizeB = dgemm_pack_get_size_( &f77_identifierB,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk );
|
||||
bBuffer = (double*) bli_malloc_user( f77_bufSizeB, &err );
|
||||
|
||||
dgemm_pack_( &f77_identifierA,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap,
|
||||
(f77_int*)&lda,
|
||||
aBuffer );
|
||||
|
||||
dgemm_pack_( &f77_identifierB,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphaonep,
|
||||
bp,
|
||||
(f77_int*)&ldb,
|
||||
bBuffer );
|
||||
|
||||
dgemm_compute_( &f77_packed,
|
||||
&f77_packed,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
aBuffer, (f77_int*)&lda,
|
||||
bBuffer, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
|
||||
bli_free_user(aBuffer);
|
||||
bli_free_user(bBuffer);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither A nor B is reordered.
|
||||
dgemm_compute_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
ap, (f77_int*)&lda,
|
||||
bp, (f77_int*)&ldb,
|
||||
betap,
|
||||
cp, (f77_int*)&ldc );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c compute", &c, "%4.6f", "" );
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
printf( "data_%cgemm_%s", dt_ch, BLAS );
|
||||
|
||||
p_inc++;
|
||||
printf("( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
(unsigned long)(p_inc),
|
||||
(unsigned long)m,
|
||||
(unsigned long)n,
|
||||
(unsigned long)k, gflops);
|
||||
|
||||
fprintf (fout, "%c %c %c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld %6.3f\n", \
|
||||
dt_ch, transA_c, transB_c, packA_c, packB_c, m, n, k, alpha_r, alpha_i, lda, ldb, beta_r, beta_i, ldc, gflops);
|
||||
|
||||
fflush(fout);
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
fclose(fin);
|
||||
fclose(fout);
|
||||
|
||||
return 0;
|
||||
}
|
||||
92
bench/inputgemmpackcompute.txt
Normal file
92
bench/inputgemmpackcompute.txt
Normal file
@@ -0,0 +1,92 @@
|
||||
sgemm_ S N N P U 1 1 1 1 0 1 1 1 0 1
|
||||
sgemm_ S N N P U 2 2 2 1 0 2 2 1 0 2
|
||||
sgemm_ S N N P U 3 3 3 1 0 3 3 1 0 3
|
||||
sgemm_ S N N P U 4 4 4 1 0 4 4 1 0 4
|
||||
sgemm_ S N N P U 5 5 5 1 0 5 5 1 0 5
|
||||
sgemm_ S N N P U 6 6 6 1 0 6 6 1 0 6
|
||||
sgemm_ S N N P U 7 7 7 1 0 7 7 1 0 7
|
||||
sgemm_ S N N P U 8 8 8 1 0 8 8 1 0 8
|
||||
sgemm_ S N N P U 9 9 9 1 0 9 9 1 0 9
|
||||
sgemm_ S N N P U 10 10 10 1 0 10 10 1 0 10
|
||||
sgemm_ S N N P U 20 20 20 1 0 20 20 1 0 20
|
||||
sgemm_ S N N P U 30 30 30 1 0 30 30 1 0 30
|
||||
sgemm_ S N N P U 40 40 40 1 0 40 40 1 0 40
|
||||
sgemm_ S N N P U 50 50 50 1 0 50 50 1 0 50
|
||||
sgemm_ S N N P U 60 60 60 1 0 60 60 1 0 60
|
||||
sgemm_ S N N P U 70 70 70 1 0 70 70 1 0 70
|
||||
sgemm_ S N N P U 80 80 80 1 0 80 80 1 0 80
|
||||
sgemm_ S N N P U 90 90 90 1 0 90 90 1 0 90
|
||||
sgemm_ S N N P U 100 100 100 1 0 100 100 1 0 100
|
||||
sgemm_ S N N P U 200 200 200 1 0 200 200 1 0 200
|
||||
sgemm_ S N N P U 300 300 300 1 0 300 300 1 0 300
|
||||
sgemm_ S N N P U 400 400 400 1 0 400 400 1 0 400
|
||||
sgemm_ S N N P U 500 500 500 1 0 500 500 1 0 500
|
||||
dgemm_ D N N P U 1 1 1 1 0 1 1 1 0 1
|
||||
dgemm_ D N N P U 2 2 2 1 0 2 2 1 0 2
|
||||
dgemm_ D N N P U 3 3 3 1 0 3 3 1 0 3
|
||||
dgemm_ D N N P U 4 4 4 1 0 4 4 1 0 4
|
||||
dgemm_ D N N P U 5 5 5 1 0 5 5 1 0 5
|
||||
dgemm_ D N N P U 6 6 6 1 0 6 6 1 0 6
|
||||
dgemm_ D N N P U 7 7 7 1 0 7 7 1 0 7
|
||||
dgemm_ D N N P U 8 8 8 1 0 8 8 1 0 8
|
||||
dgemm_ D N N P U 9 9 9 1 0 9 9 1 0 9
|
||||
dgemm_ D N N P U 10 10 10 1 0 10 10 1 0 10
|
||||
dgemm_ D N N P U 20 20 20 1 0 20 20 1 0 20
|
||||
dgemm_ D N N P U 30 30 30 1 0 30 30 1 0 30
|
||||
dgemm_ D N N P U 40 40 40 1 0 40 40 1 0 40
|
||||
dgemm_ D N N P U 50 50 50 1 0 50 50 1 0 50
|
||||
dgemm_ D N N P U 60 60 60 1 0 60 60 1 0 60
|
||||
dgemm_ D N N P U 70 70 70 1 0 70 70 1 0 70
|
||||
dgemm_ D N N P U 80 80 80 1 0 80 80 1 0 80
|
||||
dgemm_ D N N P U 90 90 90 1 0 90 90 1 0 90
|
||||
dgemm_ D N N P U 100 100 100 1 0 100 100 1 0 100
|
||||
dgemm_ D N N P U 200 200 200 1 0 200 200 1 0 200
|
||||
dgemm_ D N N P U 300 300 300 1 0 300 300 1 0 300
|
||||
dgemm_ D N N P U 400 400 400 1 0 400 400 1 0 400
|
||||
dgemm_ D N N P U 500 500 500 1 0 500 500 1 0 500
|
||||
sgemm_ S N N U P 1 1 1 1 0 1 1 1 0 1
|
||||
sgemm_ S N N U P 2 2 2 1 0 2 2 1 0 2
|
||||
sgemm_ S N N U P 3 3 3 1 0 3 3 1 0 3
|
||||
sgemm_ S N N U P 4 4 4 1 0 4 4 1 0 4
|
||||
sgemm_ S N N U P 5 5 5 1 0 5 5 1 0 5
|
||||
sgemm_ S N N U P 6 6 6 1 0 6 6 1 0 6
|
||||
sgemm_ S N N U P 7 7 7 1 0 7 7 1 0 7
|
||||
sgemm_ S N N U P 8 8 8 1 0 8 8 1 0 8
|
||||
sgemm_ S N N U P 9 9 9 1 0 9 9 1 0 9
|
||||
sgemm_ S N N U P 10 10 10 1 0 10 10 1 0 10
|
||||
sgemm_ S N N U P 20 20 20 1 0 20 20 1 0 20
|
||||
sgemm_ S N N U P 30 30 30 1 0 30 30 1 0 30
|
||||
sgemm_ S N N U P 40 40 40 1 0 40 40 1 0 40
|
||||
sgemm_ S N N U P 50 50 50 1 0 50 50 1 0 50
|
||||
sgemm_ S N N U P 60 60 60 1 0 60 60 1 0 60
|
||||
sgemm_ S N N U P 70 70 70 1 0 70 70 1 0 70
|
||||
sgemm_ S N N U P 80 80 80 1 0 80 80 1 0 80
|
||||
sgemm_ S N N U P 90 90 90 1 0 90 90 1 0 90
|
||||
sgemm_ S N N U P 100 100 100 1 0 100 100 1 0 100
|
||||
sgemm_ S N N U P 200 200 200 1 0 200 200 1 0 200
|
||||
sgemm_ S N N U P 300 300 300 1 0 300 300 1 0 300
|
||||
sgemm_ S N N U P 400 400 400 1 0 400 400 1 0 400
|
||||
sgemm_ S N N U P 500 500 500 1 0 500 500 1 0 500
|
||||
dgemm_ D N N U P 1 1 1 1 0 1 1 1 0 1
|
||||
dgemm_ D N N U P 2 2 2 1 0 2 2 1 0 2
|
||||
dgemm_ D N N U P 3 3 3 1 0 3 3 1 0 3
|
||||
dgemm_ D N N U P 4 4 4 1 0 4 4 1 0 4
|
||||
dgemm_ D N N U P 5 5 5 1 0 5 5 1 0 5
|
||||
dgemm_ D N N U P 6 6 6 1 0 6 6 1 0 6
|
||||
dgemm_ D N N U P 7 7 7 1 0 7 7 1 0 7
|
||||
dgemm_ D N N U P 8 8 8 1 0 8 8 1 0 8
|
||||
dgemm_ D N N U P 9 9 9 1 0 9 9 1 0 9
|
||||
dgemm_ D N N U P 10 10 10 1 0 10 10 1 0 10
|
||||
dgemm_ D N N U P 20 20 20 1 0 20 20 1 0 20
|
||||
dgemm_ D N N U P 30 30 30 1 0 30 30 1 0 30
|
||||
dgemm_ D N N U P 40 40 40 1 0 40 40 1 0 40
|
||||
dgemm_ D N N U P 50 50 50 1 0 50 50 1 0 50
|
||||
dgemm_ D N N U P 60 60 60 1 0 60 60 1 0 60
|
||||
dgemm_ D N N U P 70 70 70 1 0 70 70 1 0 70
|
||||
dgemm_ D N N U P 80 80 80 1 0 80 80 1 0 80
|
||||
dgemm_ D N N U P 90 90 90 1 0 90 90 1 0 90
|
||||
dgemm_ D N N U P 100 100 100 1 0 100 100 1 0 100
|
||||
dgemm_ D N N U P 200 200 200 1 0 200 200 1 0 200
|
||||
dgemm_ D N N U P 300 300 300 1 0 300 300 1 0 300
|
||||
dgemm_ D N N U P 400 400 400 1 0 400 400 1 0 400
|
||||
dgemm_ D N N U P 500 500 500 1 0 500 500 1 0 500
|
||||
@@ -1,4 +1,4 @@
|
||||
##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
##Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.##
|
||||
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
@@ -26,12 +26,13 @@ target_sources("${PROJECT_NAME}"
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_oapi.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_ukr_tapi.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_smart_threading.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute.c
|
||||
)
|
||||
# Select AMD specific sources for AMD configurations.
|
||||
if(${TARGET_ARCH} STREQUAL zen OR
|
||||
${TARGET_ARCH} STREQUAL zen2 OR
|
||||
if(${TARGET_ARCH} STREQUAL zen OR
|
||||
${TARGET_ARCH} STREQUAL zen2 OR
|
||||
${TARGET_ARCH} STREQUAL zen3 OR
|
||||
${TARGET_ARCH} STREQUAL zen4 OR
|
||||
${TARGET_ARCH} STREQUAL zen4 OR
|
||||
${TARGET_ARCH} STREQUAL amdzen)
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020-22, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -105,3 +105,6 @@
|
||||
|
||||
// Smart Threading API's.
|
||||
#include "bli_l3_smart_threading.h"
|
||||
|
||||
// BLAS Extension API - Compute
|
||||
#include "bli_l3_compute.h"
|
||||
637
frame/3/bli_l3_compute.c
Normal file
637
frame/3/bli_l3_compute.c
Normal file
@@ -0,0 +1,637 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemm_compute_init
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
{
|
||||
// @todo: Add call to error checking function here
|
||||
}
|
||||
|
||||
// Initializing the cntx if one isn't already passed.
|
||||
if ( cntx == NULL ) {
|
||||
cntx = bli_gks_query_cntx();
|
||||
}
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL )
|
||||
{
|
||||
bli_rntm_init_from_global( &rntm_l );
|
||||
rntm = &rntm_l;
|
||||
}
|
||||
else
|
||||
{
|
||||
rntm_l = *rntm;
|
||||
rntm = &rntm_l;
|
||||
}
|
||||
|
||||
// @todo: AOCL Dynamic yet to be implemented for pack-compute APIs.
|
||||
#ifdef AOCL_DYNAMIC
|
||||
// If dynamic-threading is enabled, calculate optimum number
|
||||
// of threads.
|
||||
// rntm will be updated with optimum number of threads.
|
||||
|
||||
// bli_nthreads_optimum(a, b, c, BLIS_GEMM, rntm );
|
||||
#endif
|
||||
|
||||
// Explicitly set n_threads=1 and update rntm since only ST supported.
|
||||
dim_t n_threads = 1;
|
||||
bli_rntm_set_num_threads( n_threads, rntm );
|
||||
bli_rntm_set_ways_from_rntm_sup
|
||||
(
|
||||
bli_obj_length( c ),
|
||||
bli_obj_width( c ),
|
||||
bli_obj_width( a ),
|
||||
rntm
|
||||
);
|
||||
|
||||
bli_l3_compute_thread_decorator
|
||||
(
|
||||
bli_gemm_compute,
|
||||
BLIS_GEMM,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm
|
||||
);
|
||||
}
|
||||
|
||||
err_t bli_gemm_compute
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t rs_a;
|
||||
inc_t cs_a;
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b;
|
||||
inc_t cs_b;
|
||||
|
||||
stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
|
||||
// packedX defines whether matrix X is pre-packed (reordered) or not.
|
||||
bool packeda = bli_obj_is_packed( a );
|
||||
bool packedb = bli_obj_is_packed( b );
|
||||
|
||||
// packX defines whether to pack matrix X on-the-go or not.
|
||||
bool packa = bli_rntm_pack_a( rntm );
|
||||
bool packb = bli_rntm_pack_b( rntm );
|
||||
const bool transa = bli_obj_has_trans( a );
|
||||
const bool transb = bli_obj_has_trans( b );
|
||||
|
||||
// is_col_stored_a = TRUE when,
|
||||
// A is col stored and not transposed,
|
||||
// or, A is row stored and transposed.
|
||||
const bool is_col_stored_a = bli_obj_is_col_stored( a ) && !transa;
|
||||
|
||||
// is_row_stored_b = TRUE when,
|
||||
// B is row stored and not transposed,
|
||||
// or, B is col stored and transposed.
|
||||
const bool is_row_stored_b = bli_obj_is_row_stored( b ) && !transb;
|
||||
|
||||
// If kernel is row-preferred but B is not row-stored and unpacked,
|
||||
// enable on-the-go packing of B.
|
||||
// Else if kernel is col-preferred but A is not col-stored and unpacked,
|
||||
// enable on-the-go packing of A.
|
||||
if ( row_pref )
|
||||
{
|
||||
if ( !packedb && !is_row_stored_b ) packb = TRUE;
|
||||
}
|
||||
else // if ( col_pref )
|
||||
{
|
||||
if ( !packeda && !is_col_stored_a ) packa = TRUE;
|
||||
}
|
||||
|
||||
if ( bli_obj_has_notrans( a ) )
|
||||
{
|
||||
k = bli_obj_width( a );
|
||||
|
||||
rs_a = bli_obj_row_stride( a );
|
||||
cs_a = bli_obj_col_stride( a );
|
||||
}
|
||||
else // if ( bli_obj_has_trans( a ) )
|
||||
{
|
||||
// Assign the variables with an implicit transposition.
|
||||
k = bli_obj_length( a );
|
||||
|
||||
rs_a = bli_obj_col_stride( a );
|
||||
cs_a = bli_obj_row_stride( a );
|
||||
}
|
||||
|
||||
if ( bli_obj_has_notrans( b ) )
|
||||
{
|
||||
rs_b = bli_obj_row_stride( b );
|
||||
cs_b = bli_obj_col_stride( b );
|
||||
}
|
||||
else // if ( bli_obj_has_trans( b ) )
|
||||
{
|
||||
rs_b = bli_obj_col_stride( b );
|
||||
cs_b = bli_obj_row_stride( b );
|
||||
}
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta );
|
||||
|
||||
// Setting the packing status in rntm.
|
||||
if ( packa ) bli_rntm_set_pack_a( 1, rntm );
|
||||
else bli_rntm_set_pack_a( 0, rntm );
|
||||
|
||||
if ( packb ) bli_rntm_set_pack_b( 1, rntm );
|
||||
else bli_rntm_set_pack_b( 0, rntm );
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
PASTEMAC( s, gemm_compute )
|
||||
(
|
||||
packa,
|
||||
packb,
|
||||
packeda,
|
||||
packedb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
BLIS_RRR, // Using BLIS_RRR since we want to redirect to m kernels.
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
PASTEMAC( d, gemm_compute )
|
||||
(
|
||||
packa,
|
||||
packb,
|
||||
packeda,
|
||||
packedb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
BLIS_RRR, // Using BLIS_RRR since we want to redirect to m kernels.
|
||||
cntx,
|
||||
rntm,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
/*
   Typed gemm_compute implementation, instantiated at the bottom of this
   definition via INSERT_GENTFUNC_BASIC0_SD.

   The function walks the m x n x k problem with the usual five loops
   (JC over n, PC over k, IC over m, JR over n within a block, and the
   millikernel over m) and invokes the gemmsup millikernel obtained from
   bli_cntx_get_l3_sup_ker_dt() on each ( mc_cur x nr_cur x kc_cur ) piece.

     packa / packb     - pack A / B on-the-go through packm_sup_a / _b.
     packeda / packedb - A / B has already been reordered by the caller; in
                         that case the packm routine is skipped and panel
                         addresses are computed directly from MR / NR.
     beta              - applied only on the first iteration of the PC loop;
                         later iterations accumulate with a unit scalar.

   A unit scalar is always passed to the millikernel and to the packm
   routines in the scalar slot (presumably alpha - TODO confirm against the
   kernel and packm prototypes).
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC( ch, varname ) \
     ( \
       bool                packa, \
       bool                packb, \
       bool                packeda, \
       bool                packedb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       void*      restrict a, inc_t rs_a, inc_t cs_a, \
       void*      restrict b, inc_t rs_b, inc_t cs_b, \
       void*      restrict beta, \
       void*      restrict c, inc_t rs_c, inc_t cs_c, \
       stor3_t             stor_id, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       thrinfo_t* restrict thread \
     ) \
{ \
	const num_t dt = PASTEMAC( ch, type ); \
\
	/* If m or n is zero, return immediately. */ \
	if ( bli_zero_dim2( m, n ) ) return; \
\
	/* @todo Add early return for k < 1 or alpha = 0 here. */ \
	/* NOTE(review): until that is added, k == 0 skips the PC loop entirely,
	   so C is left untouched (beta is not applied) - confirm this is the
	   intended behavior. */ \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t NC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
	/* @note: Modifications of KC are just a part of optimizations.
	   Such optimizations have been removed for simplicity and will be a part
	   of the optimizations patch. */ \
	dim_t KC; \
	KC = KC0; \
\
	/* Query the maximum blocksize for NR, which implies a maximum blocksize
	   extension for the final iteration. */ \
	const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
	const dim_t NRE = NRM - NR; \
\
	/* Compute partitioning step values for each matrix of each loop. */ \
	const inc_t jcstep_c = cs_c; \
	const inc_t jcstep_b = cs_b; \
\
	/* Step (in elements, per column of the JC loop) used to address a
	   pre-packed B buffer. */ \
	const inc_t jcstep_b_use = k; \
\
	const inc_t pcstep_a = cs_a; \
	const inc_t pcstep_b = rs_b; \
\
	const inc_t icstep_c = rs_c; \
	const inc_t icstep_a = rs_a; \
\
	/* Step (per iteration index of the PC loop) used to address a pre-packed
	   A buffer: m rounded up to a multiple of MR. */ \
	const inc_t pcstep_a_use = ( ( m + MR - 1 ) / MR ) * MR; \
\
	const inc_t jrstep_c = cs_c * NR; \
\
	/* Query the gemmsup millikernel for the given storage id. */ \
	PASTECH(ch,gemmsup_ker_ft) \
	  gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
	ctype* restrict a_00      = a; \
	ctype* restrict b_00      = b; \
	ctype* restrict c_00      = c; \
	ctype* restrict beta_cast = beta; \
\
	/* Make local copies of beta and one scalars to prevent any unnecessary
	   sharing of cache lines between the cores' caches. */ \
	ctype beta_local = *beta_cast; \
	ctype one_local  = *PASTEMAC(ch,1); \
\
	auxinfo_t aux; \
	mem_t mem_a = BLIS_MEM_INITIALIZER; \
	mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
	/* Define an array of bszid_t ids, which will act as our substitute for
	   the cntl_t tree. */ \
	/*                           5thloop  4thloop         packb  3rdloop         packa  2ndloop  1stloop  ukrloop */ \
	bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t* restrict bszids; \
\
	/* Set the bszids pointer to the correct bszids array above based on which
	   matrices (if any) are being packed. */ \
\
	if ( packa ) { if ( packb ) bszids = bszids_packab; \
	               else         bszids = bszids_packa; } \
	else         { if ( packb ) bszids = bszids_packb; \
	               else         bszids = bszids_nopack; } \
\
	/* Determine whether we are using more than one thread. */ \
	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
\
	thrinfo_t* restrict thread_jc = NULL; \
	thrinfo_t* restrict thread_pc = NULL; \
	thrinfo_t* restrict thread_pb = NULL; \
	thrinfo_t* restrict thread_ic = NULL; \
	thrinfo_t* restrict thread_pa = NULL; \
	thrinfo_t* restrict thread_jr = NULL; \
\
	/* Grow the thrinfo_t tree. */ \
	bszid_t* restrict bszids_jc = bszids; \
	                  thread_jc = thread; \
	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
	/* Compute the JC loop thread range for the current thread. */ \
	dim_t jc_start, jc_end; \
	bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
	const dim_t n_local = jc_end - jc_start; \
\
	/* Compute number of primary and leftover components of the JC loop. */ \
	/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
	const dim_t jc_left =   n_local % NC; \
\
	/* Loop over the n dimension (NC rows/columns at a time). */ \
	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
	{ \
		/* Calculate the thread's current JC block dimension. */ \
		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
		/* Step (per iteration index of the PC loop) used to address a
		   pre-packed B buffer: nc_cur rounded up to a multiple of NR. */ \
		const inc_t pcstep_b_use = ( ( nc_cur + NR - 1 ) / NR ) * NR; \
\
		ctype* restrict b_jc     = b_00 + jj * jcstep_b; \
		ctype* restrict b_jc_use = b_00 + jj * jcstep_b_use; \
		ctype* restrict c_jc     = c_00 + jj * jcstep_c; \
\
		/* Grow the thrinfo_t tree. */ \
		bszid_t* restrict bszids_pc = &bszids_jc[1]; \
		                  thread_pc = bli_thrinfo_sub_node( thread_jc ); \
		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
		/* Compute the PC loop thread range for the current thread. */ \
		const dim_t pc_start = 0, pc_end = k; \
		const dim_t k_local = k; \
\
		/* Compute number of primary and leftover components of the PC loop. */ \
		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
		const dim_t pc_left =   k_local % KC; \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		/*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
		{ \
			/* Calculate the thread's current PC block dimension. */ \
			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
			/* Step (per row index of the IC loop) used to address a
			   pre-packed A buffer. */ \
			const inc_t icstep_a_use = kc_cur; \
\
			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
			ctype* restrict b_pc_use; \
			ctype* restrict a_pc_use = a_00 + pp * pcstep_a_use; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
			ctype* b_use; \
			inc_t  rs_b_use, cs_b_use, ps_b_use; \
\
			/* Set the bszid_t array and thrinfo_t pointer based on whether
			   we will be packing B. If we won't be packing B, we alias to
			   the _pc variables so that code further down can unconditionally
			   reference the _pb variables. Note that *if* we will be packing
			   B, the thrinfo_t node will have already been created by a
			   previous call to bli_thrinfo_grow(), since bszid values of
			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
			   bszid that is a normal bszid_t value). */ \
			bszid_t* restrict bszids_pb; \
			if ( packb ) { bszids_pb = &bszids_pc[1]; \
			               thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
			else         { bszids_pb = &bszids_pc[0]; \
			               thread_pb = thread_pc; } \
\
			/* Determine the packing buffer and related parameters for matrix
			   B. (If B will not be packed, then b_use will be set to point to
			   b and the _b_use strides will be set accordingly.) Then call
			   the packm sup variant chooser, which will call the appropriate
			   implementation based on the schema deduced from the stor_id. */ \
\
			/* packedb == TRUE indicates that B is reordered thus, update the
			   necessary pointers.
			   Else, call packm routine to pack B on-the-go. */ \
			if ( packedb ) \
			{ \
				rs_b_use = NR; \
				cs_b_use = 1; \
				ps_b_use = kc_cur * NR; \
				b_pc_use = b_jc_use + pp * pcstep_b_use; \
			} else \
			{ \
				PASTEMAC(ch,packm_sup_b) \
				( \
				  packb, \
				  BLIS_BUFFER_FOR_B_PANEL, \
				  stor_id, \
				  BLIS_NO_TRANSPOSE, \
				  KC,     NC, \
				  kc_cur, nc_cur, NR, \
				  &one_local, \
				  b_pc,   rs_b,      cs_b, \
				  &b_use, &rs_b_use, &cs_b_use, \
				  &ps_b_use, \
				  cntx, \
				  rntm, \
				  &mem_b, \
				  thread_pb \
				); \
\
				/* Alias b_use so that it's clear this is our current panel
				   of matrix B. */ \
				b_pc_use = b_use; \
			} \
\
			/* We don't need to embed the panel stride of B within the auxinfo_t
			   object because this variant iterates through B in the jr loop,
			   which occurs here, within the macrokernel, not within the
			   millikernel. */ \
			bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
\
			/* Grow the thrinfo_t tree. */ \
			bszid_t* restrict bszids_ic = &bszids_pb[1]; \
			                  thread_ic = bli_thrinfo_sub_node( thread_pb ); \
			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
			/* Compute the IC loop thread range for the current thread. */ \
			dim_t ic_start, ic_end; \
			bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
			const dim_t m_local = ic_end - ic_start; \
\
			/* Compute number of primary and leftover components of the IC loop. */ \
			/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
			const dim_t ic_left =   m_local % MC; \
\
			/* Loop over the m dimension (MC rows at a time). */ \
			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
			{ \
				/* Calculate the thread's current IC block dimension. */ \
				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
				ctype* restrict a_ic = a_pc + ii * icstep_a; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
				ctype* restrict a_ic_use; \
\
				ctype* a_use; \
				inc_t  rs_a_use, cs_a_use, ps_a_use; \
\
				/* Set the bszid_t array and thrinfo_t pointer based on whether
				   we will be packing A. If we won't be packing A, we alias to
				   the _ic variables so that code further down can unconditionally
				   reference the _pa variables. Note that *if* we will be packing
				   A, the thrinfo_t node will have already been created by a
				   previous call to bli_thrinfo_grow(), since bszid values of
				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
				   bszid that is a normal bszid_t value). */ \
				bszid_t* restrict bszids_pa; \
				if ( packa ) { bszids_pa = &bszids_ic[1]; \
				               thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
				else         { bszids_pa = &bszids_ic[0]; \
				               thread_pa = thread_ic; } \
\
				/* Determine the packing buffer and related parameters for matrix
				   A. (If A will not be packed, then a_use will be set to point to
				   a and the _a_use strides will be set accordingly.) Then call
				   the packm sup variant chooser, which will call the appropriate
				   implementation based on the schema deduced from the stor_id. */ \
				/* packeda == TRUE indicates that A is reordered thus, update the
				   necessary pointers.
				   Else, call packm routine to pack A on-the-go. */ \
				if ( packeda ) \
				{ \
					rs_a_use = 1; \
					cs_a_use = MR; \
					ps_a_use = MR * kc_cur; \
					a_ic_use = a_pc_use + ii * icstep_a_use; \
				} \
				else \
				{ \
					PASTEMAC(ch,packm_sup_a) \
					( \
					  packa, \
					  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
					  stor_id,                 /* a "block of A."                  */ \
					  BLIS_NO_TRANSPOSE, \
					  MC,     KC,       /* This "block of A" is (at most) MC x KC. */ \
					  mc_cur, kc_cur, MR, \
					  &one_local, \
					  a_ic,   rs_a,      cs_a, \
					  &a_use, &rs_a_use, &cs_a_use, \
					  &ps_a_use, \
					  cntx, \
					  rntm, \
					  &mem_a, \
					  thread_pa \
					); \
					/* Alias a_use so that it's clear this is our current block of
					   matrix A. */ \
					a_ic_use = a_use; \
				} \
\
				/* Embed the panel stride of A within the auxinfo_t object. The
				   millikernel will query and use this to iterate through
				   micropanels of A (if needed). */ \
				bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
\
				/* Grow the thrinfo_t tree. */ \
				bszid_t* restrict bszids_jr = &bszids_pa[1]; \
				                  thread_jr = bli_thrinfo_sub_node( thread_pa ); \
				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
				/* Compute number of primary and leftover components of the JR loop. */ \
				dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
				dim_t jr_left =   nc_cur % NR; \
\
				/* An optimization: allow the last jr iteration to contain up to NRE
				   columns of C and B. (If NRE > NR, the mkernel has agreed to handle
				   these cases.) Note that this prevents us from declaring jr_iter and
				   jr_left as const. NOTE: We forgo this optimization when packing B
				   since packing an extended edge case is not yet supported. */ \
				/* NOTE(review): the guard below tests packb only; confirm whether
				   a pre-packed B (packedb == TRUE, laid out in NR-wide panels)
				   should skip this extension as well. */ \
				if ( !packb && !is_mt ) \
				if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
				{ \
					jr_iter--; jr_left += NR; \
				} \
\
				/* Compute the JR loop thread range for the current thread. */ \
				dim_t jr_start, jr_end; \
				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
				/* Loop over the n dimension (NR columns at a time). */ \
				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
				{ \
					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
					ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
					ctype* restrict c_jr = c_ic     + j * jrstep_c; \
\
					/* Loop over the m dimension (MR rows at a time). */ \
					{ \
						/* Invoke the gemmsup millikernel. */ \
						gemmsup_ker \
						( \
						  BLIS_NO_CONJUGATE, \
						  BLIS_NO_CONJUGATE, \
						  mc_cur, \
						  nr_cur, \
						  kc_cur, \
						  &one_local, \
						  a_ic_use, rs_a_use, cs_a_use, \
						  b_jr,     rs_b_use, cs_b_use, \
						  beta_use, \
						  c_jr,     rs_c,     cs_c, \
						  &aux, \
						  cntx  \
						); \
					} \
				} \
			} \
\
			/* NOTE: This barrier is only needed if we are packing B (since
			   that matrix is packed within the pc loop of this variant). */ \
			if ( packb ) bli_thread_barrier( thread_pb ); \
		} \
	} \
\
	/* Release any memory that was acquired for packing matrices A and B. */ \
	PASTEMAC(ch,packm_sup_finalize_mem_a) \
	( \
	  packa, \
	  rntm, \
	  &mem_a, \
	  thread_pa \
	); \
	PASTEMAC(ch,packm_sup_finalize_mem_b) \
	( \
	  packb, \
	  rntm, \
	  &mem_b, \
	  thread_pb \
	); \
}

INSERT_GENTFUNC_BASIC0_SD( gemm_compute )
|
||||
80
frame/3/bli_l3_compute.h
Normal file
80
frame/3/bli_l3_compute.h
Normal file
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Object-based entry point for the gemm_compute operation.
//   a, b  - input matrix objects.
//   beta  - scalar object applied to c.
//   c     - input/output matrix object.
//   cntx  - context; may be NULL, in which case the implementation queries
//           one itself.
//   rntm  - runtime; may be NULL. The implementation works on a local copy.
void bli_gemm_compute_init
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     );
|
||||
|
||||
// Worker invoked (via the l3 compute thread decorator) by
// bli_gemm_compute_init(). Extracts buffers and strides from the objects,
// decides on on-the-go packing, and dispatches to the typed gemm_compute
// implementation for single or double precision. The implementation
// returns BLIS_SUCCESS.
err_t bli_gemm_compute
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );
|
||||
|
||||
// Prototype BLAS-like interfaces with void pointer operands.
|
||||
|
||||
// Prototype of the typed gemm_compute implementation.
//   packa / packb     - pack A / B on-the-go.
//   packeda / packedb - A / B has already been packed (reordered).
//   m, n, k           - problem dimensions.
//   a, b, c           - matrix buffers with their row/column strides.
//   beta              - pointer to the scalar applied to c.
//   stor_id           - storage id used to select the gemmsup kernel.
// NOTE(review): prototypes are instantiated with INSERT_GENTPROT_BASIC0 while
// the definitions in bli_l3_compute.c use INSERT_GENTFUNC_BASIC0_SD; confirm
// that declaring datatypes without a definition is intended.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC( ch, varname ) \
     ( \
       bool                packa, \
       bool                packb, \
       bool                packeda, \
       bool                packedb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       void*      restrict a, inc_t rs_a, inc_t cs_a, \
       void*      restrict b, inc_t rs_b, inc_t cs_b, \
       void*      restrict beta, \
       void*      restrict c, inc_t rs_c, inc_t cs_c, \
       stor3_t             stor_id, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       thrinfo_t* restrict thread \
     );

INSERT_GENTPROT_BASIC0( gemm_compute )
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -84,6 +84,7 @@ BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_t
|
||||
if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE;
|
||||
else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE;
|
||||
else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE;
|
||||
else if ( trans == 'p' || trans == 'P' ) *blis_trans = BLIS_PACKED;
|
||||
else
|
||||
{
|
||||
// See comment for bli_param_map_netlib_to_blis_side() above.
|
||||
|
||||
@@ -30,11 +30,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_imatcopy.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_omatcopy2.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_omatadd.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute.c
|
||||
)
|
||||
|
||||
# Select AMD specific sources for AMD configurations.
|
||||
if(${TARGET_ARCH} STREQUAL zen OR
|
||||
${TARGET_ARCH} STREQUAL zen2 OR
|
||||
if(${TARGET_ARCH} STREQUAL zen OR
|
||||
${TARGET_ARCH} STREQUAL zen2 OR
|
||||
${TARGET_ARCH} STREQUAL zen3 OR
|
||||
${TARGET_ARCH} STREQUAL zen4 OR
|
||||
${TARGET_ARCH} STREQUAL amdzen)
|
||||
@@ -49,8 +52,6 @@ ${TARGET_ARCH} STREQUAL amdzen)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_scal_amd.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_swap_amd.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_amd.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack_get_size.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_pack.c
|
||||
)
|
||||
else()
|
||||
target_sources("${PROJECT_NAME}"
|
||||
|
||||
285
frame/compat/bla_gemm_compute.c
Normal file
285
frame/compat/bla_gemm_compute.c
Normal file
@@ -0,0 +1,285 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// BLAS Extension APIs
|
||||
/* ?gemm_compute.h */
|
||||
/* BLAS interface to compute matrix-matrix product */
|
||||
/* Datatype : s & d (single and double precision only supported) */
|
||||
/* BLAS Extensions */
|
||||
/* output is the gemm result */
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void sgemm_compute_blis_impl
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_char* transb,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const f77_int* k,
|
||||
const float* a, const f77_int* rs_a, const f77_int* cs_a,
|
||||
const float* b, const f77_int* rs_b, const f77_int* cs_b,
|
||||
const float* beta,
|
||||
float* c, const f77_int* rs_c, const f77_int* cs_c
|
||||
)
|
||||
{
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
dim_t m0, n0, k0;
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
// @todo: Add AOCL DTL logs
|
||||
// AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
// AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k,
|
||||
// (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemm_compute)
|
||||
(
|
||||
MKSTR(s),
|
||||
MKSTR(gemm),
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
( ( *rs_a != 1 ) ? rs_a : cs_a ),
|
||||
( ( *rs_b != 1 ) ? rs_b : cs_b ),
|
||||
rs_c, cs_c
|
||||
);
|
||||
|
||||
/* Quick return if possible. */
|
||||
if ( *m == 0 || *n == 0 )
|
||||
{
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
|
||||
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
|
||||
|
||||
/* Typecast BLAS integers to BLIS integers. */
|
||||
bli_convert_blas_dim1(*m, m0);
|
||||
bli_convert_blas_dim1(*n, n0);
|
||||
bli_convert_blas_dim1(*k, k0);
|
||||
|
||||
const num_t dt = BLIS_FLOAT;
|
||||
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
|
||||
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
|
||||
|
||||
bli_obj_init_finish_1x1( dt, (float*)beta, &betao );
|
||||
|
||||
bli_obj_init_finish( dt, m0_a, n0_a, (float*)a, *rs_a, *cs_a, &ao );
|
||||
bli_obj_init_finish( dt, m0_b, n0_b, (float*)b, *rs_b, *cs_b, &bo );
|
||||
bli_obj_init_finish( dt, m0, n0, (float*)c, *rs_c, *cs_c, &co );
|
||||
|
||||
bli_obj_set_conjtrans( blis_transa, &ao );
|
||||
bli_obj_set_conjtrans( blis_transb, &bo );
|
||||
|
||||
PASTEMAC0( gemm_compute_init )
|
||||
(
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
/*
 * sgemm_compute_
 *
 * Fortran-style (leading-dimension) entry point for single-precision
 * gemm_compute. It forwards to sgemm_compute_blis_impl(), passing a unit
 * row stride for A, B and C and the caller's lda/ldb/ldc as the column
 * strides.
 *
 * The symbol carries BLIS_EXPORT_BLAS so that its definition agrees with
 * the dgemm_compute_ definition and with the generated prototype.
 */
BLIS_EXPORT_BLAS void sgemm_compute_
     (
       const f77_char* transa,
       const f77_char* transb,
       const f77_int*  m,
       const f77_int*  n,
       const f77_int*  k,
       const float*    a, const f77_int* lda,
       const float*    b, const f77_int* ldb,
       const float*    beta,
             float*    c, const f77_int* ldc
     )
{
    /* Unit row strides; lda/ldb/ldc act as the column strides. */
    f77_int rs_a = 1;
    f77_int rs_b = 1;
    f77_int rs_c = 1;

    sgemm_compute_blis_impl( transa,
                             transb,
                             m,
                             n,
                             k,
                             a, &rs_a, lda,
                             b, &rs_b, ldb,
                             beta,
                             c, &rs_c, ldc );
}
#endif
|
||||
|
||||
void dgemm_compute_blis_impl
|
||||
(
|
||||
const f77_char* transa,
|
||||
const f77_char* transb,
|
||||
const f77_int* m,
|
||||
const f77_int* n,
|
||||
const f77_int* k,
|
||||
const double* a, const f77_int* rs_a, const f77_int* cs_a,
|
||||
const double* b, const f77_int* rs_b, const f77_int* cs_b,
|
||||
const double* beta,
|
||||
double* c, const f77_int* rs_c, const f77_int* cs_c
|
||||
)
|
||||
{
|
||||
trans_t blis_transa;
|
||||
trans_t blis_transb;
|
||||
dim_t m0, n0, k0;
|
||||
dim_t m0_a, n0_a;
|
||||
dim_t m0_b, n0_b;
|
||||
|
||||
/* Initialize BLIS. */
|
||||
bli_init_auto();
|
||||
|
||||
// @todo: Add AOCL DTL logs
|
||||
// AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
|
||||
// AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k,
|
||||
// (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
|
||||
|
||||
/* Perform BLAS parameter checking. */
|
||||
PASTEBLACHK(gemm_compute)
|
||||
(
|
||||
MKSTR(d),
|
||||
MKSTR(gemm),
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
( ( *rs_a != 1 ) ? rs_a : cs_a ),
|
||||
( ( *rs_b != 1 ) ? rs_b : cs_b ),
|
||||
rs_c, cs_c
|
||||
);
|
||||
|
||||
/* Quick return if possible. */
|
||||
if ( *m == 0 || *n == 0 )
|
||||
{
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
return;
|
||||
}
|
||||
|
||||
/* Map BLAS chars to their corresponding BLIS enumerated type value. */
|
||||
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
|
||||
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
|
||||
|
||||
/* Typecast BLAS integers to BLIS integers. */
|
||||
bli_convert_blas_dim1(*m, m0);
|
||||
bli_convert_blas_dim1(*n, n0);
|
||||
bli_convert_blas_dim1(*k, k0);
|
||||
|
||||
const num_t dt = BLIS_DOUBLE;
|
||||
|
||||
obj_t ao = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t bo = BLIS_OBJECT_INITIALIZER;
|
||||
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1;
|
||||
obj_t co = BLIS_OBJECT_INITIALIZER;
|
||||
|
||||
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a );
|
||||
bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b );
|
||||
|
||||
bli_obj_init_finish_1x1( dt, (double*)beta, &betao );
|
||||
|
||||
bli_obj_init_finish( dt, m0_a, n0_a, (double*)a, *rs_a, *cs_a, &ao );
|
||||
bli_obj_init_finish( dt, m0_b, n0_b, (double*)b, *rs_b, *cs_b, &bo );
|
||||
bli_obj_init_finish( dt, m0, n0, (double*)c, *rs_c, *cs_c, &co );
|
||||
|
||||
bli_obj_set_conjtrans( blis_transa, &ao );
|
||||
bli_obj_set_conjtrans( blis_transb, &bo );
|
||||
|
||||
PASTEMAC0( gemm_compute_init )
|
||||
(
|
||||
&ao,
|
||||
&bo,
|
||||
&betao,
|
||||
&co,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
bli_finalize_auto();
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
/*
 * dgemm_compute_
 *
 * Fortran-style (leading-dimension) entry point for double-precision
 * gemm_compute. It forwards to dgemm_compute_blis_impl(), passing a unit
 * row stride for A, B and C and the caller's lda/ldb/ldc as the column
 * strides.
 */
BLIS_EXPORT_BLAS void dgemm_compute_
     (
       const f77_char* transa,
       const f77_char* transb,
       const f77_int*  m,
       const f77_int*  n,
       const f77_int*  k,
       const double*   a, const f77_int* lda,
       const double*   b, const f77_int* ldb,
       const double*   beta,
             double*   c, const f77_int* ldc
     )
{
    /* All three operands share the same unit row stride; the callee takes
       its stride arguments through pointers to const. */
    const f77_int unit_rs = 1;

    dgemm_compute_blis_impl
    (
      transa, transb,
      m, n, k,
      a, &unit_rs, lda,
      b, &unit_rs, ldb,
      beta,
      c, &unit_rs, ldc
    );
}
#endif
|
||||
72
frame/compat/bla_gemm_compute.h
Normal file
72
frame/compat/bla_gemm_compute.h
Normal file
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// BLAS Extension APIs
|
||||
/* ?gemm_compute.h */
|
||||
/* BLAS interface to compute matrix-matrix product */
|
||||
/* Datatype : s & d (single and double precision only supported) */
|
||||
/* BLAS Extensions */
|
||||
/* output is the gemm result */
|
||||
|
||||
#undef GENTPROTRO
|
||||
#define GENTPROTRO( ftype, ch, blasname ) \
|
||||
\
|
||||
IF_BLIS_ENABLE_BLAS(\
|
||||
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
|
||||
( \
|
||||
const f77_char* transa, \
|
||||
const f77_char* transb, \
|
||||
const f77_int* m, \
|
||||
const f77_int* n, \
|
||||
const f77_int* k, \
|
||||
const ftype* a, const f77_int* lda, \
|
||||
const ftype* b, const f77_int* ldb, \
|
||||
const ftype* beta, \
|
||||
ftype* c, const f77_int* ldc \
|
||||
); \
|
||||
)\
|
||||
BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
|
||||
( \
|
||||
const f77_char* transa, \
|
||||
const f77_char* transb, \
|
||||
const f77_int* m, \
|
||||
const f77_int* n, \
|
||||
const f77_int* k, \
|
||||
const ftype* a, const f77_int* rs_a, const f77_int* cs_a, \
|
||||
const ftype* b, const f77_int* rs_b, const f77_int* cs_b, \
|
||||
const ftype* beta, \
|
||||
ftype* c, const f77_int* rs_c, const f77_int* cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTRO_BLAS( gemm_compute )
|
||||
@@ -183,6 +183,7 @@
|
||||
#include "bla_trmm.h"
|
||||
#include "bla_trsm.h"
|
||||
#include "bla_gemmt.h"
|
||||
#include "bla_gemm_compute.h"
|
||||
|
||||
#include "bla_gemm_check.h"
|
||||
#include "bla_hemm_check.h"
|
||||
@@ -194,6 +195,7 @@
|
||||
#include "bla_trmm_check.h"
|
||||
#include "bla_trsm_check.h"
|
||||
#include "bla_gemmt_check.h"
|
||||
#include "bla_gemm_compute_check.h"
|
||||
|
||||
// -- Batch Extension prototypes --
|
||||
#include "bla_gemm_batch.h"
|
||||
|
||||
@@ -48,6 +48,8 @@ enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
|
||||
enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
|
||||
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
|
||||
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
|
||||
enum CBLAS_STORAGE {CblasPacked=151};
|
||||
enum CBLAS_IDENTIFIER {CblasAMatrix=161, CblasBMatrix=162};
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -993,6 +995,190 @@ BLIS_EXPORT_BLAS f77_int cblas_idamin(f77_int N, const double *X, f77_int incX);
|
||||
BLIS_EXPORT_BLAS f77_int cblas_icamin(f77_int N, const void *X, f77_int incX);
|
||||
BLIS_EXPORT_BLAS f77_int cblas_izamin(f77_int N, const void *X, f77_int incX);
|
||||
|
||||
|
||||
// -- PACK COMPUTE APIs --
|
||||
/** \addtogroup INTERFACE CBLAS INTERFACE
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* cblas_sgemm_pack_get_size calculates and returns the number of bytes necessary
|
||||
* to store the specified matrix after packing.
|
||||
*
|
||||
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
|
||||
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
|
||||
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
|
||||
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
|
||||
* @return The size in bytes required to store the specified matrix after packing.
|
||||
*/
|
||||
BLIS_EXPORT_BLAS f77_int cblas_sgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier,
|
||||
const f77_int M, const f77_int N, const f77_int K);
|
||||
|
||||
/**
|
||||
* cblas_dgemm_pack_get_size calculates and returns the number of bytes necessary
|
||||
* to store the specified matrix after packing.
|
||||
*
|
||||
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
|
||||
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
|
||||
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
|
||||
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
|
||||
* @return The size in bytes required to store the specified matrix after packing.
|
||||
*/
|
||||
BLIS_EXPORT_BLAS f77_int cblas_dgemm_pack_get_size(enum CBLAS_IDENTIFIER Identifier,
|
||||
const f77_int M, const f77_int N, const f77_int K);
|
||||
|
||||
/**
|
||||
* cblas_sgemm_pack scales by alpha and packs the specified matrix into the
|
||||
* allocated buffer. It is imperative to allocate a buffer of type float and size
|
||||
* as returned by the cblas_sgemm_pack_get_size() before invoking this routine.
|
||||
*
|
||||
* @note If both the matrices are to be packed, the user must ensure that only
|
||||
* one matrix is packed with the scalar alpha and the other with a unit-scalar.
|
||||
*
|
||||
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
|
||||
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
|
||||
* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication:
|
||||
* if trans = CblasNoTrans, then Mat(X) = X;
|
||||
* if trans = CblasTrans, then Mat(X) = \f$X^T\f$;
|
||||
* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$.
|
||||
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
|
||||
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
|
||||
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
|
||||
* @param[in] alpha Specifies the scalar alpha.
|
||||
* @param[in] src The matrix to be packed.
|
||||
* @param[in] ld Specifies the leading dimension of the matrix to be packed.
|
||||
* @param[out] dest The buffer to store the scaled and packed matrix.
|
||||
* @return None
|
||||
*/
|
||||
BLIS_EXPORT_BLAS void cblas_sgemm_pack(enum CBLAS_ORDER Order,
|
||||
enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans,
|
||||
const f77_int M, const f77_int N, const f77_int K,
|
||||
const float alpha, const float *src, const f77_int ld,
|
||||
float* dest );
|
||||
|
||||
/**
|
||||
* cblas_dgemm_pack scales by alpha and packs the specified matrix into the
|
||||
* allocated buffer. It is imperative to allocate a buffer of type double and
|
||||
* size as returned by the cblas_dgemm_pack_get_size() before invoking this
|
||||
* routine.
|
||||
*
|
||||
* @note If both the matrices are to be packed, the user must ensure that only
|
||||
* one matrix is packed with the scalar alpha and the other with a unit-scalar.
|
||||
*
|
||||
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
|
||||
* @param[in] Identifier Specifies the matrix to be packed. CblasAMatrix or CblasBMatrix.
|
||||
* @param[in] Trans Specifies the form of Mat(X) used in the matrix multiplication:
|
||||
* if trans = CblasNoTrans, then Mat(X) = X;
|
||||
* if trans = CblasTrans, then Mat(X) = \f$X^T\f$;
|
||||
* if trans = CblasConjTrans, then Mat(X) = \f$X^H\f$.
|
||||
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
|
||||
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
|
||||
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
|
||||
* @param[in] alpha Specifies the scalar alpha.
|
||||
* @param[in] src The matrix to be packed.
|
||||
* @param[in] ld Specifies the leading dimension of the matrix to be packed.
|
||||
* @param[out] dest The buffer to store the scaled and packed matrix.
|
||||
* @return None
|
||||
*/
|
||||
BLIS_EXPORT_BLAS void cblas_dgemm_pack(enum CBLAS_ORDER Order,
|
||||
enum CBLAS_IDENTIFIER Identifier, enum CBLAS_TRANSPOSE Trans,
|
||||
const f77_int M, const f77_int N, const f77_int K,
|
||||
const double alpha, const double *src, const f77_int ld,
|
||||
double* dest );
|
||||
|
||||
/**
|
||||
* cblas_sgemm_compute computes the matrix-matrix product where one or both the
|
||||
* input matrices are packed and adds this to the scalar-matrix product. This
|
||||
* operation is defined as:
|
||||
* C := Mat(A) * Mat(B) + beta*C,
|
||||
* where,
|
||||
* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
|
||||
* beta is a scalar,
|
||||
* A, B and C are matrices:
|
||||
* Mat(A) is an mxk matrix, or a packed matrix buffer,
|
||||
* Mat(B) is a kxn matrix, or a packed matrix buffer,
|
||||
* C is an mxn matrix.
|
||||
*
|
||||
* @note In case both the matrices are to be packed, the user must ensure that
|
||||
* only one matrix is packed with alpha scalar and the other with a unit-scalar,
|
||||
* during the packing process
|
||||
*
|
||||
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
|
||||
* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
|
||||
* if transa = CblasNoTrans, then Mat(A) = A;
|
||||
* if transa = CblasTrans, then Mat(A) = \f$A^T\f$;
|
||||
* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$;
|
||||
* if transa = CblasPacked, then A matrix is packed and lda is ignored.
|
||||
* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
|
||||
* if transb = CblasNoTrans, then Mat(B) = B;
|
||||
* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
|
||||
* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$;
|
||||
* if transb = CblasPacked, then B matrix is packed and ldb is ignored.
|
||||
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
|
||||
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
|
||||
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
|
||||
* @param[in] A The array is float matrix A or a buffer with packed matrix A.
|
||||
* @param[in] lda Specifies the leading dimension of A.
|
||||
* @param[in] B The array is float matrix B or a buffer with packed matrix B.
|
||||
* @param[in] ldb Specifies the leading dimension of B.
|
||||
* @param[in] beta Specifies the scalar beta.
|
||||
* @param[in,out] C The array is float matrix C.
|
||||
* @param[in] ldc Specifies the leading dimension of C.
|
||||
* @return None
|
||||
*/
|
||||
BLIS_EXPORT_BLAS void cblas_sgemm_compute(enum CBLAS_ORDER Order,
|
||||
f77_int TransA, f77_int TransB,
|
||||
const f77_int M, const f77_int N, const f77_int K,
|
||||
const float* A, f77_int lda, const float* B, f77_int ldb,
|
||||
float beta, float* C, f77_int ldc);
|
||||
|
||||
/**
|
||||
* cblas_dgemm_compute computes the matrix-matrix product where one or both the
|
||||
* input matrices are packed and adds this to the scalar-matrix product. This
|
||||
* operation is defined as:
|
||||
* C := Mat(A) * Mat(B) + beta*C,
|
||||
* where,
|
||||
* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
|
||||
* beta is a scalar,
|
||||
* A, B and C are matrices:
|
||||
* Mat(A) is an mxk matrix, or a packed matrix buffer,
|
||||
* Mat(B) is a kxn matrix, or a packed matrix buffer,
|
||||
* C is an mxn matrix.
|
||||
*
|
||||
* @note In case both the matrices are to be packed, the user must ensure that
|
||||
* only one matrix is packed with alpha scalar and the other with a unit-scalar,
|
||||
* during the packing process
|
||||
*
|
||||
* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor.
|
||||
* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
|
||||
* if transa = CblasNoTrans, then Mat(A) = A;
|
||||
* if transa = CblasTrans, then Mat(A) = \f$A^T\f$;
|
||||
* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$;
|
||||
* if transa = CblasPacked, then A matrix is packed and lda is ignored.
|
||||
* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
|
||||
* if transb = CblasNoTrans, then Mat(B) = B;
|
||||
* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
|
||||
* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$;
|
||||
* if transb = CblasPacked, then B matrix is packed and ldb is ignored.
|
||||
* @param[in] M Specifies the number of rows of the matrix Mat(A) and of the matrix C.
|
||||
* @param[in] N Specifies the number of columns of the matrix Mat(B) and of the matrix C.
|
||||
* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
|
||||
* @param[in] A The array is double matrix A or a buffer with packed matrix A.
|
||||
* @param[in] lda Specifies the leading dimension of A.
|
||||
* @param[in] B The array is double matrix B or a buffer with packed matrix B.
|
||||
* @param[in] ldb Specifies the leading dimension of B.
|
||||
* @param[in] beta Specifies the scalar beta.
|
||||
* @param[in,out] C The array is double matrix C.
|
||||
* @param[in] ldc Specifies the leading dimension of C.
|
||||
* @return None
|
||||
*/
|
||||
BLIS_EXPORT_BLAS void cblas_dgemm_compute(enum CBLAS_ORDER Order,
|
||||
f77_int TransA, f77_int TransB,
|
||||
const f77_int M, const f77_int N, const f77_int K,
|
||||
const double* A, f77_int lda, const double* B, f77_int ldb,
|
||||
double beta, double* C, f77_int ldc);
|
||||
/** @}*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
172
frame/compat/cblas/src/cblas_dgemm_compute.c
Normal file
172
frame/compat/cblas/src/cblas_dgemm_compute.c
Normal file
@@ -0,0 +1,172 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
|
||||
/*
 * cblas_dgemm_compute
 *
 * CBLAS front end for double-precision gemm_compute. It translates the
 * CBLAS order/transpose selectors into the character codes and strides
 * expected by F77_dgemm_compute and then calls it.
 *
 *   Order            CblasColMajor or CblasRowMajor.
 *   TransA, TransB   CblasNoTrans, CblasTrans, CblasConjTrans or
 *                    CblasPacked. ConjTrans is mapped like Trans, and
 *                    CblasPacked is passed on as 'P'.
 *   M, N, K          Problem dimensions.
 *   A, lda / B, ldb  Input matrices (or packed buffers) and leading dims.
 *   beta             Scalar applied to C.
 *   C, ldc           Output matrix and its leading dimension.
 *
 * Invalid selectors are reported through cblas_xerbla() with the 1-based
 * position of the offending argument, and the routine returns without
 * computing anything.
 *
 * CBLAS_CallFromC and RowMajorStrg are reset to 0 on every exit path,
 * including successful ones, so their state does not leak into later
 * calls.
 */
BLIS_EXPORT_BLAS void cblas_dgemm_compute( enum CBLAS_ORDER Order,
                                           f77_int TransA,
                                           f77_int TransB,
                                           const f77_int M, const f77_int N,
                                           const f77_int K,
                                           const double* A, f77_int lda,
                                           const double* B, f77_int ldb,
                                           double beta,
                                           double* C, f77_int ldc )
{
    char TA, TB;
#ifdef F77_CHAR
    F77_CHAR F77_TA, F77_TB;
#else
#define F77_TA &TA
#define F77_TB &TB
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
    F77_INT F77_ldc=ldc;
#else
#define F77_M M
#define F77_N N
#define F77_K K
#define F77_lda lda
#define F77_ldb ldb
#define F77_ldc ldc
#endif

    extern int CBLAS_CallFromC;
    extern int RowMajorStrg;
    RowMajorStrg = 0;
    CBLAS_CallFromC = 1;

    if ( Order == CblasColMajor ) // CblasColMajor
    {
        if      ( TransA == CblasTrans )     TA='T';
        else if ( TransA == CblasConjTrans ) TA='T';
        else if ( TransA == CblasNoTrans )   TA='N';
        else if ( TransA == CblasPacked )    TA='P';
        else
        {
            cblas_xerbla(2, "cblas_dgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( TransB == CblasTrans )     TB='T';
        else if ( TransB == CblasConjTrans ) TB='T';
        else if ( TransB == CblasNoTrans )   TB='N';
        else if ( TransB == CblasPacked )    TB='P';
        else
        {
            cblas_xerbla(3, "cblas_dgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        /* Column-major: unit row strides, leading dims as column strides. */
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int rs_c = 1;

        F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc);
    }
    else if ( Order == CblasRowMajor ) // CblasRowMajor
    {
        RowMajorStrg = 1;

        // If Row Major, and A is not already reordered
        // then toggle the transA parameter and interchange the strides.
        if      ( TransA == CblasPacked )    TA='P';
        else if ( TransA == CblasTrans )     TA='N';
        else if ( TransA == CblasNoTrans )   TA='T';
        else if ( TransA == CblasConjTrans ) TA='N';
        else
        {
            cblas_xerbla(2, "cblas_dgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        // If Row Major, and B is not already reordered
        // then toggle the transB parameter and interchange the strides.
        if      ( TransB == CblasPacked )    TB='P';
        else if ( TransB == CblasTrans )     TB='N';
        else if ( TransB == CblasNoTrans )   TB='T';
        else if ( TransB == CblasConjTrans ) TB='N';
        else
        {
            /* TransB is the third argument of this routine. */
            cblas_xerbla(3, "cblas_dgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        /* A and B keep unit row strides alongside the toggled transpose
           codes; C is passed as row-major (row stride ldc, unit column
           stride). */
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int cs_c = 1;

        F77_dgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c );
    }
    else
    {
        cblas_xerbla(1, "cblas_dgemm_compute",
                     "Illegal Order setting, %d\n", Order);
    }

    /* Clear the CBLAS bookkeeping flags on the normal exit path as well. */
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
}
|
||||
#endif
|
||||
157
frame/compat/cblas/src/cblas_dgemm_pack.c
Normal file
157
frame/compat/cblas/src/cblas_dgemm_pack.c
Normal file
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
|
||||
BLIS_EXPORT_BLAS void cblas_dgemm_pack( enum CBLAS_ORDER Order,
|
||||
enum CBLAS_IDENTIFIER Identifier,
|
||||
enum CBLAS_TRANSPOSE Trans,
|
||||
const f77_int M,
|
||||
const f77_int N,
|
||||
const f77_int K,
|
||||
const double alpha,
|
||||
const double* src, const f77_int ld,
|
||||
double* dest )
|
||||
{
|
||||
char TR;
|
||||
char ID;
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_CHAR F77_TR;
|
||||
F77_CHAR F77_ID;
|
||||
#else
|
||||
#define F77_TR &TR
|
||||
#define F77_ID &ID
|
||||
#endif
|
||||
|
||||
#ifdef F77_INT
|
||||
F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld;
|
||||
#else
|
||||
|
||||
#define F77_M M
|
||||
#define F77_N N
|
||||
#define F77_K K
|
||||
#define F77_ld ld
|
||||
|
||||
#endif
|
||||
|
||||
extern int CBLAS_CallFromC;
|
||||
extern int RowMajorStrg;
|
||||
RowMajorStrg = 0;
|
||||
|
||||
CBLAS_CallFromC = 1;
|
||||
|
||||
if ( Order == CblasColMajor ) // CblasColMajor
|
||||
{
|
||||
if ( Trans == CblasNoTrans ) TR = 'N';
|
||||
else if ( Trans == CblasTrans ) TR = 'T';
|
||||
else if ( Trans == CblasConjTrans ) TR = 'T';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_dgemm_pack","Illegal Trans setting, %d\n", Trans);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if ( Identifier == CblasAMatrix ) ID = 'A';
|
||||
else if ( Identifier == CblasBMatrix ) ID = 'B';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_dgemm_pack","Illegal Identifier setting, %d\n", Identifier);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_TR = C2F_CHAR(&TR);
|
||||
F77_ID = C2F_CHAR(&ID);
|
||||
#endif
|
||||
F77_dgemm_pack( F77_ID,
|
||||
F77_TR,
|
||||
&F77_M,
|
||||
&F77_N,
|
||||
&F77_K,
|
||||
&alpha,
|
||||
src, &F77_ld,
|
||||
dest );
|
||||
}
|
||||
else if ( Order == CblasRowMajor ) // CblasRowMajor
|
||||
{
|
||||
RowMajorStrg = 1;
|
||||
if ( Trans == CblasNoTrans ) TR = 'T';
|
||||
else if ( Trans == CblasTrans ) TR = 'N';
|
||||
else if ( Trans == CblasConjTrans ) TR = 'N';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_dgemm_pack","Invalid Trans setting, %d\n", Trans);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if ( Identifier == CblasAMatrix ) ID = 'A';
|
||||
else if ( Identifier == CblasBMatrix ) ID = 'B';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_dgemm_pack","Illegal Identifier setting, %d\n", Identifier);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_TR = C2F_CHAR(&TR);
|
||||
F77_ID = C2F_CHAR(&ID);
|
||||
#endif
|
||||
F77_dgemm_pack ( F77_ID,
|
||||
F77_TR,
|
||||
&F77_M,
|
||||
&F77_N,
|
||||
&F77_K,
|
||||
&alpha,
|
||||
src, &F77_ld,
|
||||
dest );
|
||||
}
|
||||
else cblas_xerbla(1, "cblas_dgemm_pack", "Invalid Order setting, %d\n", Order);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
83
frame/compat/cblas/src/cblas_dgemm_pack_get_size.c
Normal file
83
frame/compat/cblas/src/cblas_dgemm_pack_get_size.c
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
|
||||
f77_int cblas_dgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier,
|
||||
const f77_int M,
|
||||
const f77_int N,
|
||||
const f77_int K )
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 );
|
||||
|
||||
char ID;
|
||||
f77_int tbytes = 0;
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_CHAR F77_ID;
|
||||
#else
|
||||
#define F77_ID &ID
|
||||
#endif
|
||||
|
||||
#ifdef F77_INT
|
||||
F77_INT F77_M=M, F77_N=N, F77_K=K;
|
||||
#else
|
||||
#define F77_M M
|
||||
#define F77_N N
|
||||
#define F77_K K
|
||||
#endif
|
||||
|
||||
if (Identifier == CblasAMatrix ) ID = 'A';
|
||||
else if (Identifier == CblasBMatrix ) ID = 'B';
|
||||
else
|
||||
{
|
||||
cblas_xerbla( 1, "cblas_dgemm_pack_get_size",
|
||||
"Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier );
|
||||
AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_ID = C2F_CHAR( &ID );
|
||||
#endif
|
||||
tbytes = F77_dgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K );
|
||||
|
||||
AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
|
||||
return tbytes;
|
||||
}
|
||||
#endif
|
||||
@@ -202,6 +202,14 @@
|
||||
#define F77_cgemm_batch cgemm_batch
|
||||
#define F77_zgemm_batch zgemm_batch
|
||||
|
||||
// -- Pack-Compute APIs --
// Names used by the CBLAS pack/compute wrappers. Each F77_* name expands to
// the matching *_blis_impl symbol for the single (s) and double (d)
// precision get-size, pack and compute routines.
|
||||
#define F77_sgemm_pack_get_size sgemm_pack_get_size_blis_impl
|
||||
#define F77_dgemm_pack_get_size dgemm_pack_get_size_blis_impl
|
||||
#define F77_sgemm_pack sgemm_pack_blis_impl
|
||||
#define F77_dgemm_pack dgemm_pack_blis_impl
|
||||
#define F77_sgemm_compute sgemm_compute_blis_impl
|
||||
#define F77_dgemm_compute dgemm_compute_blis_impl
|
||||
|
||||
// (BLIS_ENABLE_NO_UNDERSCORE_API) ends
|
||||
#else
|
||||
/*
|
||||
@@ -389,6 +397,14 @@
|
||||
#define F77_dgemm_batch dgemm_batch_
|
||||
#define F77_cgemm_batch cgemm_batch_
|
||||
#define F77_zgemm_batch zgemm_batch_
|
||||
|
||||
// -- Pack-Compute APIs --
// Names used by the CBLAS pack/compute wrappers. Each F77_* name expands to
// the matching *_blis_impl symbol for the single (s) and double (d)
// precision get-size, pack and compute routines.
// NOTE(review): unlike the neighbouring entries in this branch (for example
// dgemm_batch_), these targets carry no trailing underscore -- presumably
// intentional because they name *_blis_impl symbols directly; confirm.
|
||||
#define F77_sgemm_pack_get_size sgemm_pack_get_size_blis_impl
|
||||
#define F77_dgemm_pack_get_size dgemm_pack_get_size_blis_impl
|
||||
#define F77_sgemm_pack sgemm_pack_blis_impl
|
||||
#define F77_dgemm_pack dgemm_pack_blis_impl
|
||||
#define F77_sgemm_compute sgemm_compute_blis_impl
|
||||
#define F77_dgemm_compute dgemm_compute_blis_impl
|
||||
#endif
|
||||
|
||||
#endif /* CBLAS_F77_H */
|
||||
|
||||
171
frame/compat/cblas/src/cblas_sgemm_compute.c
Normal file
171
frame/compat/cblas/src/cblas_sgemm_compute.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
|
||||
/*
 * cblas_sgemm_compute()
 *
 * CBLAS-style entry point for the single-precision GEMM "compute" step of
 * the pack/compute extension API. The CBLAS arguments are translated into
 * the character flags and stride arguments taken by F77_sgemm_compute(),
 * and the call is forwarded.
 *
 * Order          - CblasColMajor or CblasRowMajor. Any other value is
 *                  reported through cblas_xerbla() as argument 1.
 * TransA, TransB - CblasNoTrans, CblasTrans, CblasConjTrans or CblasPacked.
 *                  CblasConjTrans is mapped to the same flag as CblasTrans.
 *                  CblasPacked is forwarded as 'P'. Illegal values are
 *                  reported as argument 2 (TransA) and argument 3 (TransB).
 * M, N, K        - problem dimensions, forwarded unchanged.
 * A, lda         - A operand and its leading dimension.
 * B, ldb         - B operand and its leading dimension.
 * beta           - scalar forwarded by address to the compute routine.
 * C, ldc         - output matrix and its leading dimension.
 *
 * The globals CBLAS_CallFromC and RowMajorStrg are set while the call is in
 * progress and are cleared again before returning on every path, including
 * the successful one, so no state leaks into later BLAS/CBLAS calls.
 */
BLIS_EXPORT_BLAS void cblas_sgemm_compute( enum CBLAS_ORDER Order,
                                           f77_int TransA,
                                           f77_int TransB,
                                           const f77_int M,
                                           const f77_int N,
                                           const f77_int K,
                                           const float* A, f77_int lda,
                                           const float* B, f77_int ldb,
                                           float beta,
                                           float* C, f77_int ldc)
{
    char TA, TB;
#ifdef F77_CHAR
    F77_CHAR F77_TA, F77_TB;
#else
    #define F77_TA &TA
    #define F77_TB &TB
#endif

#ifdef F77_INT
    F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
    F77_INT F77_ldc=ldc;
#else
    #define F77_M M
    #define F77_N N
    #define F77_K K
    #define F77_lda lda
    #define F77_ldb ldb
    #define F77_ldc ldc
#endif

    extern int CBLAS_CallFromC;
    extern int RowMajorStrg;
    RowMajorStrg = 0;
    CBLAS_CallFromC = 1;

    if( Order == CblasColMajor ) // CblasColMajor
    {
        // Column-major: transpose flags map straight through.
        if      ( TransA == CblasTrans )     TA='T';
        else if ( TransA == CblasConjTrans ) TA='T';
        else if ( TransA == CblasNoTrans )   TA='N';
        else if ( TransA == CblasPacked )    TA='P';
        else
        {
            cblas_xerbla(2, "cblas_sgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( TransB == CblasTrans )     TB='T';
        else if ( TransB == CblasConjTrans ) TB='T';
        else if ( TransB == CblasNoTrans )   TB='N';
        else if ( TransB == CblasPacked )    TB='P';
        else
        {
            cblas_xerbla(3, "cblas_sgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        // Unit row strides; the leading dimensions act as column strides.
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int rs_c = 1;

        F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &rs_c, &F77_ldc);
    }
    else if ( Order == CblasRowMajor ) // CblasRowMajor
    {
        RowMajorStrg = 1;

        // Row-major: an operand that is not already packed has its
        // transpose flag toggled (N <-> T); a packed operand stays 'P'.
        if      ( TransA == CblasPacked )    TA='P';
        else if ( TransA == CblasTrans )     TA='N';
        else if ( TransA == CblasNoTrans )   TA='T';
        else if ( TransA == CblasConjTrans ) TA='N';
        else
        {
            cblas_xerbla(2, "cblas_sgemm_compute",
                         "Illegal TransA setting, %d\n", TransA);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

        if      ( TransB == CblasPacked )    TB='P';
        else if ( TransB == CblasTrans )     TB='N';
        else if ( TransB == CblasNoTrans )   TB='T';
        else if ( TransB == CblasConjTrans ) TB='N';
        else
        {
            // TransB is the third argument of this routine.
            cblas_xerbla(3, "cblas_sgemm_compute",
                         "Illegal TransB setting, %d\n", TransB);
            CBLAS_CallFromC = 0;
            RowMajorStrg = 0;
            return;
        }

#ifdef F77_CHAR
        F77_TA = C2F_CHAR(&TA);
        F77_TB = C2F_CHAR(&TB);
#endif

        // A and B keep unit row strides; C is described with row stride
        // ldc and unit column stride.
        f77_int rs_a = 1;
        f77_int rs_b = 1;
        f77_int cs_c = 1;

        F77_sgemm_compute( F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, A, &rs_a, &F77_lda,
                           B, &rs_b, &F77_ldb, &beta, C, &F77_ldc, &cs_c);
    }
    else
    {
        cblas_xerbla(1, "cblas_sgemm_compute",
                     "Illegal Order setting, %d\n", Order);
    }

    // Always restore the global CBLAS state before leaving.
    CBLAS_CallFromC = 0;
    RowMajorStrg = 0;
    return;
}
|
||||
#endif
|
||||
157
frame/compat/cblas/src/cblas_sgemm_pack.c
Normal file
157
frame/compat/cblas/src/cblas_sgemm_pack.c
Normal file
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
|
||||
BLIS_EXPORT_BLAS void cblas_sgemm_pack( enum CBLAS_ORDER Order,
|
||||
enum CBLAS_IDENTIFIER Identifier,
|
||||
enum CBLAS_TRANSPOSE Trans,
|
||||
const f77_int M,
|
||||
const f77_int N,
|
||||
const f77_int K,
|
||||
const float alpha,
|
||||
const float* src, const f77_int ld,
|
||||
float* dest )
|
||||
{
|
||||
char TR;
|
||||
char ID;
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_CHAR F77_TR;
|
||||
F77_CHAR F77_ID;
|
||||
#else
|
||||
#define F77_TR &TR
|
||||
#define F77_ID &ID
|
||||
#endif
|
||||
|
||||
#ifdef F77_INT
|
||||
F77_INT F77_M=M, F77_N=N, F77_K=K, F77_ld=ld;
|
||||
#else
|
||||
|
||||
#define F77_M M
|
||||
#define F77_N N
|
||||
#define F77_K K
|
||||
#define F77_ld ld
|
||||
|
||||
#endif
|
||||
|
||||
extern int CBLAS_CallFromC;
|
||||
extern int RowMajorStrg;
|
||||
RowMajorStrg = 0;
|
||||
|
||||
CBLAS_CallFromC = 1;
|
||||
|
||||
if ( Order == CblasColMajor ) // CblasColMajor
|
||||
{
|
||||
if ( Trans == CblasNoTrans ) TR = 'N';
|
||||
else if ( Trans == CblasTrans ) TR = 'T';
|
||||
else if ( Trans == CblasConjTrans ) TR = 'T';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_sgemm_pack","Illegal Trans setting, %d\n", Trans);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if ( Identifier == CblasAMatrix ) ID = 'A';
|
||||
else if ( Identifier == CblasBMatrix ) ID = 'B';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_sgemm_pack","Illegal Identifier setting, %d\n", Identifier);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_TR = C2F_CHAR(&TR);
|
||||
F77_ID = C2F_CHAR(&ID);
|
||||
#endif
|
||||
F77_sgemm_pack( F77_ID,
|
||||
F77_TR,
|
||||
&F77_M,
|
||||
&F77_N,
|
||||
&F77_K,
|
||||
&alpha,
|
||||
src, &F77_ld,
|
||||
dest );
|
||||
}
|
||||
else if ( Order == CblasRowMajor ) // CblasRowMajor
|
||||
{
|
||||
RowMajorStrg = 1;
|
||||
if ( Trans == CblasNoTrans ) TR = 'T';
|
||||
else if ( Trans == CblasTrans ) TR = 'N';
|
||||
else if ( Trans == CblasConjTrans ) TR = 'N';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_sgemm_pack","Invalid Trans setting, %d\n", Trans);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if ( Identifier == CblasAMatrix ) ID = 'A';
|
||||
else if ( Identifier == CblasBMatrix ) ID = 'B';
|
||||
else
|
||||
{
|
||||
cblas_xerbla(3, "cblas_sgemm_pack","Illegal Identifier setting, %d\n", Identifier);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_TR = C2F_CHAR(&TR);
|
||||
F77_ID = C2F_CHAR(&ID);
|
||||
#endif
|
||||
F77_sgemm_pack ( F77_ID,
|
||||
F77_TR,
|
||||
&F77_M,
|
||||
&F77_N,
|
||||
&F77_K,
|
||||
&alpha,
|
||||
src, &F77_ld,
|
||||
dest );
|
||||
}
|
||||
else cblas_xerbla(1, "cblas_sgemm_pack", "Invalid Order setting, %d\n", Order);
|
||||
CBLAS_CallFromC = 0;
|
||||
RowMajorStrg = 0;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
83
frame/compat/cblas/src/cblas_sgemm_pack_get_size.c
Normal file
83
frame/compat/cblas/src/cblas_sgemm_pack_get_size.c
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
|
||||
f77_int cblas_sgemm_pack_get_size( enum CBLAS_IDENTIFIER Identifier,
|
||||
const f77_int M,
|
||||
const f77_int N,
|
||||
const f77_int K )
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_1 );
|
||||
|
||||
char ID;
|
||||
f77_int tbytes = 0;
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_CHAR F77_ID;
|
||||
#else
|
||||
#define F77_ID &ID
|
||||
#endif
|
||||
|
||||
#ifdef F77_INT
|
||||
F77_INT F77_M=M, F77_N=N, F77_K=K;
|
||||
#else
|
||||
#define F77_M M
|
||||
#define F77_N N
|
||||
#define F77_K K
|
||||
#endif
|
||||
|
||||
if ( Identifier == CblasAMatrix ) ID = 'A';
|
||||
else if ( Identifier == CblasBMatrix ) ID = 'B';
|
||||
else
|
||||
{
|
||||
cblas_xerbla( 1, "cblas_sgemm_pack_get_size",
|
||||
"Illegal CBLAS_IDENTIFIER setting, %d\n", Identifier );
|
||||
AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef F77_CHAR
|
||||
F77_ID = C2F_CHAR( &ID );
|
||||
#endif
|
||||
tbytes = F77_sgemm_pack_get_size ( F77_ID, &F77_M, &F77_N, &F77_K );
|
||||
|
||||
AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_1 );
|
||||
return tbytes;
|
||||
}
|
||||
#endif
|
||||
@@ -1,4 +1,4 @@
|
||||
##Copyright (C) 2020, Advanced Micro Devices, Inc.##
|
||||
##Copyright (C) 2020-23, Advanced Micro Devices, Inc. All rights reserved. ##
|
||||
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
@@ -23,8 +23,5 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bla_trmv_check.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_trsm_check.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_trsv_check.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm3m_check.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bla_gemm_compute_check.h
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
87
frame/compat/check/bla_gemm_compute_check.h
Normal file
87
frame/compat/check/bla_gemm_compute_check.h
Normal file
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
 * bla_gemm_compute_check()
 *
 * Argument validation for the BLAS-style gemm_compute entry points. The
 * macro expands to a brace-enclosed block, so it must be used inside a
 * function returning void: when a check fails it reports the error through
 * PASTE_XERBLA() and executes `return;` in the enclosing function.
 *
 * dt_str, op_str - strings concatenated (and upper-cased) to form the
 *                  routine name passed to PASTE_XERBLA().
 * transa, transb - pointers to the transpose flags; accepted values are
 *                  "N", "C", "T" and "P" (packed), compared by PASTE_LSAME().
 * m, n, k        - pointers to the problem dimensions.
 * lda, ldb       - pointers to the leading dimensions of A and B.
 * rs_c, cs_c     - pointers to the row and column strides of C.
 *
 * Only the first failing check is reported. The info codes are:
 *    1  illegal transa              2  illegal transb
 *    3  m < 0                       4  n < 0
 *    5  k < 0                       7  lda too small (skipped if A packed)
 *    9  ldb too small (skipped if B packed)
 *   12  C strides too small for the unit-stride direction in use
 */
#define bla_gemm_compute_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, rs_c, cs_c ) \
{ \
	f77_int info = 0; \
	f77_int nota,  notb; \
	f77_int conja, conjb; \
	f77_int ta,    tb; \
	f77_int packa, packb; \
	f77_int nrowa, nrowb; \
\
	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb  = PASTE_LSAME( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTE_LSAME( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb    = PASTE_LSAME( transb, "T", (ftnlen)1, (ftnlen)1 ); \
	packa = PASTE_LSAME( transa, "P", (ftnlen)1, (ftnlen)1 ); \
	packb = PASTE_LSAME( transb, "P", (ftnlen)1, (ftnlen)1 ); \
\
	/* Row counts of A and B as stored, used for the lda/ldb checks. */ \
	if ( nota || packa ) { nrowa = *m; } \
	else                 { nrowa = *k; } \
	if ( notb || packb ) { nrowb = *k; } \
	else                 { nrowb = *n; } \
\
	if      ( !nota && !conja && !ta && !packa ) \
		info = 1; \
	else if ( !notb && !conjb && !tb && !packb ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( !packa && *lda < bli_max( 1, nrowa ) ) /* lda is ignored when A is packed. */ \
		info = 7; \
	else if ( !packb && *ldb < bli_max( 1, nrowb ) ) /* ldb is ignored when B is packed. */ \
		info = 9; \
	else if ( ( *rs_c == 1 && *cs_c < bli_max( 1, *m ) ) || ( *cs_c == 1 && *rs_c < bli_max( 1, *n ) ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		/* Bounded write: a routine name longer than the buffer is */ \
		/* truncated instead of overrunning func_str. */ \
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}
|
||||
@@ -287,6 +287,8 @@
|
||||
#define dgemm_batch_ dgemm_batch
|
||||
#define cgemm_batch_ cgemm_batch
|
||||
#define zgemm_batch_ zgemm_batch
|
||||
#define sgemm_compute_ sgemm_compute
|
||||
#define dgemm_compute_ dgemm_compute
|
||||
#define saxpby_ saxpby
|
||||
#define daxpby_ daxpby
|
||||
#define caxpby_ caxpby
|
||||
@@ -391,6 +393,7 @@
|
||||
#define dgbmv DGBMV
|
||||
#define dgemm DGEMM
|
||||
#define dgemm_batch DGEMM_BATCH
|
||||
#define dgemm_compute DGEMM_COMPUTE
|
||||
#define dgemmt DGEMMT
|
||||
#define dgemv DGEMV
|
||||
#define dger DGER
|
||||
@@ -464,6 +467,7 @@
|
||||
#define sgbmv SGBMV
|
||||
#define sgemm SGEMM
|
||||
#define sgemm_batch SGEMM_BATCH
|
||||
#define sgemm_compute SGEMM_COMPUTE
|
||||
#define sgemmt SGEMMT
|
||||
#define sgemv SGEMV
|
||||
#define sger SGER
|
||||
|
||||
@@ -470,7 +470,8 @@ typedef enum
|
||||
BLIS_NO_TRANSPOSE = 0x0,
|
||||
BLIS_TRANSPOSE = BLIS_BITVAL_TRANS,
|
||||
BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
|
||||
BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS
|
||||
BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS,
|
||||
BLIS_PACKED = BLIS_BITVAL_PACKED_UNSPEC
|
||||
} trans_t;
|
||||
|
||||
typedef enum
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
target_sources("${PROJECT_NAME}"
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_openmp.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_compute_decor_single.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_openmp.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_pthreads.c
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bli_l3_decor_single.c
|
||||
|
||||
67
frame/thread/bli_l3_compute_decor.h
Normal file
67
frame/thread/bli_l3_compute_decor.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_COMPUTE_DECOR_H
|
||||
#define BLIS_L3_COMPUTE_DECOR_H
|
||||
|
||||
// Level-3 compute internal function type.
|
||||
typedef err_t (*l3computeint_t)
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Level-3 compute thread decorator prototype.
|
||||
err_t bli_l3_compute_thread_decorator
|
||||
(
|
||||
l3computeint_t func,
|
||||
opid_t family,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
#include "bli_l3_compute_decor_single.h"
|
||||
#include "bli_l3_compute_decor_openmp.h"
|
||||
// #include "bli_l3_compute_decor_pthreads.h"
|
||||
|
||||
#endif
|
||||
133
frame/thread/bli_l3_compute_decor_openmp.c
Normal file
133
frame/thread/bli_l3_compute_decor_openmp.c
Normal file
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// @note: Presently MT is not supported, so n_threads have been explicitly
|
||||
// initialized to 1 while initializing. Thus, even if BLIS is built with OpenMP
|
||||
// support, the compute APIs work as an ST implementation.
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
// Stub thread entry point: the argument is ignored and NULL is returned.
void* bli_l3_compute_thread_entry( void* data_void )
{
	( void )data_void;

	return NULL;
}
|
||||
|
||||
err_t bli_l3_compute_thread_decorator
|
||||
(
|
||||
l3computeint_t func,
|
||||
opid_t family,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// Query the total number of threads from the rntm_t object.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_pba_rntm_set_pba( rntm );
|
||||
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_sup_thrinfo_free()
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
44
frame/thread/bli_l3_compute_decor_openmp.h
Normal file
44
frame/thread/bli_l3_compute_decor_openmp.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Include guard for the OpenMP flavor of the level-3 compute thread
// decorator header. The guard macro is specific to the compute decorator
// so that it cannot collide with the guard of any other decorator header
// (e.g. BLIS_L3_SUP_DECOR_OPENMP_H); a shared guard name would cause the
// contents of whichever header is included second to be skipped silently.
#ifndef BLIS_L3_COMPUTE_DECOR_OPENMP_H
#define BLIS_L3_COMPUTE_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

#endif // BLIS_ENABLE_OPENMP

#endif // BLIS_L3_COMPUTE_DECOR_OPENMP_H
|
||||
|
||||
87
frame/thread/bli_l3_compute_decor_single.c
Normal file
87
frame/thread/bli_l3_compute_decor_single.c
Normal file
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
err_t bli_l3_compute_thread_decorator
|
||||
(
|
||||
l3computeint_t func,
|
||||
opid_t family,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
const dim_t n_threads = 1;
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
bli_pba_rntm_set_pba( rntm );
|
||||
|
||||
{
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
const dim_t tid = 0;
|
||||
|
||||
// This optimization allows us to use one of the global thrinfo_t
|
||||
// objects for single-threaded execution rather than grow one from
|
||||
// scratch. The key is that bli_thrinfo_sup_grow(), which is called
|
||||
// from within the variants, will immediately return if it detects
|
||||
// that the thrinfo_t* passed into it is either
|
||||
// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
|
||||
thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
|
||||
|
||||
( void )tid;
|
||||
|
||||
func
|
||||
(
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
43
frame/thread/bli_l3_compute_decor_single.h
Normal file
43
frame/thread/bli_l3_compute_decor_single.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_COMPUTE_DECOR_SINGLE_H
|
||||
#define BLIS_L3_COMPUTE_DECOR_SINGLE_H
|
||||
|
||||
// Definitions specific to situations when multithreading is disabled.
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -54,7 +54,11 @@ void bli_pack_full_thread_decorator
|
||||
/* Ensure n_threads is always greater than or equal to 1 */
|
||||
/* Passing BLIS_IC_NT and BLIS_JC_NT for pack can lead to n_threads */
|
||||
/* becoming negative. In that case, packing is done using 1 thread */
|
||||
n_threads = ( n_threads > 0 ) ? n_threads : 1;
|
||||
// n_threads = ( n_threads > 0 ) ? n_threads : 1;
|
||||
|
||||
// Explicitly setting n_threads = 1 to force packing with only a single
|
||||
// thread.
|
||||
n_threads = 1;
|
||||
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
|
||||
@@ -60,6 +60,9 @@
|
||||
// Include the pack full thread decorator and related definitions and prototypes
|
||||
// for the pack code path.
|
||||
#include "bli_pack_full_decor.h"
|
||||
// Include the level-3 thread decorator and related definitions and prototypes
|
||||
// for the compute code path.
|
||||
#include "bli_l3_compute_decor.h"
|
||||
|
||||
// Initialization-related prototypes.
|
||||
void bli_thread_init( void );
|
||||
|
||||
Reference in New Issue
Block a user