AOCL DTL - Added thread and execution time details in logs

-- Added number of threads used in DTL logs
    -- Added support for timestamps in DTL traces
    -- Added time taken by API at BLAS layer in the DTL logs
    -- Added GFLOPS achieved in DTL logs
    -- Added support to enable/disable execution time and
       GFLOPS printing for individual APIs. We may not want
       it for all APIs. It will also help us migrate APIs
       to execution time and GFLOPS logs in stages.
    -- Updated GEMM bench to match new logs
    -- Refactored aocldtl_blis.c to remove code duplication.
    -- Clean up logs generation and reading to use spaces
       consistently to separate various fields.
    -- Updated AOCL_gettid() to return correct thread id
       when using pthreads.

AMD-Internal: [CPUPL-1691]
Change-Id: Iddb8a3be2a5cd624a07ccdbf5ae0695799d8ae8e
This commit is contained in:
Dipal M Zambare
2021-07-07 21:48:05 +05:30
parent a7f600b3a4
commit 8f310c3384
16 changed files with 773 additions and 944 deletions

View File

@@ -5,7 +5,7 @@
* These functions are invoked though macros by
* end user.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc. All rights Reserved.
* Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
*
*=======================================================================*/
#include "blis.h"
@@ -23,10 +23,23 @@
#endif
#endif
/*
* Client should provide this function, it should return
* number of threads used by the API
*/
extern dim_t AOCL_get_requested_threads_count(void);
/* By default the trace level is set to ALL. The user can configure this
parameter at run time using a command-line argument. */
uint32 gui32TraceLogLevel = AOCL_DTL_TRACE_LEVEL;
/*
* Time elapsed in the function will be logged from main thread only,
* we will save the main thread id. This will be compared with the id
* of the logging thread.
*/
AOCL_TID gtidMainThreadID = -1;
/* The user can configure the file name in which he wants to dump the data */
#if AOCL_DTL_TRACE_ENABLE
/* The file name for storing traced log added manually in the code */
@@ -117,6 +130,9 @@ void DTL_Initialize(
}
#endif
/* Save Id for main thread */
gtidMainThreadID = AOCL_gettid();
} /* DTL_Initialize */
#endif
@@ -162,6 +178,7 @@ void DTL_Uninitialize(void)
* pi8FunctionName - Function Name
* ui32LineNumber - Line number
* pi8Message - Message to be printed
*
* Output Parameter(s) : None
* Return parameter(s) : None
*==================================================================*/
@@ -176,6 +193,8 @@ void DTL_Trace(
{
uint8 i = 0;
AOCL_FAL_FILE *pOutFile = NULL;
uint64 u64EventTime = AOCL_getTimestamp();
dim_t u64RequestedThreadsCount = AOCL_get_requested_threads_count();
bli_init_auto();
@@ -226,7 +245,6 @@ void DTL_Trace(
level set while initialization */
if (ui8LogLevel <= gui32TraceLogLevel)
{
/* Indent as per level if is function call trace */
if ((ui8LogLevel >= AOCL_DTL_LEVEL_TRACE_1) &&
(ui8LogLevel <= AOCL_DTL_LEVEL_TRACE_8))
@@ -242,26 +260,39 @@ void DTL_Trace(
switch (ui8LogType)
{
case TRACE_TYPE_FENTRY:
fprintf(pOutFile, "In %s()...\n", pi8FunctionName);
fprintf(pOutFile, "nt=%ld,ts=%ld: In %s()...\n",
u64RequestedThreadsCount,
u64EventTime,
pi8FunctionName);
break;
case TRACE_TYPE_FEXIT:
if (pi8Message == NULL)
{ /* Function returned successfully */
fprintf(pOutFile, "Out of %s()\n", pi8FunctionName);
fprintf(pOutFile, "ts=%ld: Out of %s()\n",
u64EventTime,
pi8FunctionName);
}
else
{ /* Function failed to complete, use message to get error */
fprintf(pOutFile, "Out of %s() with error %s\n", pi8FunctionName, pi8Message);
fprintf(pOutFile, "ts=%ld: Out of %s() with error %s\n",
u64EventTime,
pi8FunctionName,
pi8Message);
}
break;
case TRACE_TYPE_LOG:
fprintf(pOutFile, "%s:%d:%s\n", pi8FileName, ui32LineNumber, pi8Message);
fprintf(pOutFile, "%s %s",
pi8FileName,
pi8Message
);
break;
case TRACE_TYPE_RAW:
fprintf(pOutFile, "%s\n", pi8Message);
fprintf(pOutFile, "%s\n",
pi8Message);
break;
}
fflush(pOutFile);
@@ -407,6 +438,72 @@ void DTL_DumpData(
} /* DTL_DumpData */
#endif
#if (AOCL_DTL_TRACE_ENABLE || AOCL_DTL_LOG_ENABLE)
/*
 * Record the start time of an API call for the calling thread.
 *
 * The timestamp is stored in the calling thread's node of gpLogFileList so
 * that AOCL_DTL_get_time_spent() can later compute the elapsed time.
 *
 * Automatic duration calculation is currently supported from the main
 * thread only (in other words, at the BLAS interface); calls from any
 * other thread return without doing anything.
 */
void AOCL_DTL_start_perf_timer(void)
{
    AOCL_TID current_thread = AOCL_gettid();

    /* Only the thread recorded in gtidMainThreadID is timed. */
    if (current_thread != gtidMainThreadID) {
        return;
    }

    AOCL_FLIST_Node *pFileNode = AOCL_FLIST_GetNode(gpLogFileList, current_thread);

    if (NULL == pFileNode) {
        /* It might be the first call from the current thread, try to create
           a new trace file (and list node) for this thread. */
        AOCL_FAL_FILE *pOutFile = AOCL_FLIST_AddFile(pchDTL_LOG_FILE, &gpLogFileList, current_thread);

        if (NULL == pOutFile) {
            AOCL_DEBUGPRINT("File does not exists to dump the trace data \n");
            return;
        }

        pFileNode = AOCL_FLIST_GetNode(gpLogFileList, current_thread);

        /* Do not dereference a failed lookup; without a node there is
           nowhere to save the timestamp. */
        if (NULL == pFileNode) {
            return;
        }
    }

    pFileNode->u64SavedTimeStamp = AOCL_getTimestamp();

    /* NOTE(review): flushing stdout here looks like a debugging leftover;
       kept to preserve existing behavior - confirm whether it is needed. */
    fflush(stdout);
}
/*
 * Return the time elapsed since the last AOCL_DTL_start_perf_timer() call
 * made by the calling thread, in the units of AOCL_getTimestamp().
 *
 * Returns 0 when:
 *   - the caller is not the main thread (duration calculation is currently
 *     supported from the main thread only, i.e. at the BLAS interface);
 *   - no list node exists yet for this thread, meaning no timer was started;
 *   - the saved timestamp is later than the current one.
 */
uint64 AOCL_DTL_get_time_spent(void)
{
    AOCL_TID current_thread = AOCL_gettid();

    /* Only the thread recorded in gtidMainThreadID is timed. */
    if (current_thread != gtidMainThreadID) {
        return 0;
    }

    uint64 u64CurrentTimeStamp = AOCL_getTimestamp();
    AOCL_FLIST_Node *pFileNode = AOCL_FLIST_GetNode(gpLogFileList, current_thread);

    if (NULL == pFileNode) {
        /* It might be the first call from the current thread, try to create
           a new trace file for this thread so later calls find it. */
        AOCL_FAL_FILE *pOutFile = AOCL_FLIST_AddFile(pchDTL_LOG_FILE, &gpLogFileList, current_thread);

        if (NULL == pOutFile) {
            AOCL_DEBUGPRINT("File does not exists to dump the trace data \n");
        }

        /* No timer was started for this thread. A freshly added node is
           stamped with a timestamp taken after u64CurrentTimeStamp, so
           subtracting it would wrap around; report zero instead. */
        return 0;
    }

    /* Both values are unsigned: avoid wrap-around if the saved timestamp
       is ahead of the current one (e.g. the wall clock was adjusted). */
    if (u64CurrentTimeStamp < pFileNode->u64SavedTimeStamp) {
        return 0;
    }

    return (u64CurrentTimeStamp - pFileNode->u64SavedTimeStamp);
}
#endif
/* This is enabled by passing ETRACE_ENABLE=1 to make */
#ifdef AOCL_DTL_AUTO_TRACE_ENABLE

View File

@@ -1,12 +1,12 @@
/*===================================================================
* File Name : aocldtl.h
*
*
* Description : This is main interface file for the end user
* It provides defination for all macros to be
* It provides defination for all macros to be
* used by user to add debug/trace information.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
*
* Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
#ifndef _AOCLDTL_H_
@@ -47,7 +47,7 @@
#endif
#if AOCL_DTL_TRACE_ENABLE
/* Exit macro to trace the flow of control The parameter LogLevel specifies
/* Exit macro to trace the flow of control The parameter LogLevel specifies
log level String will preferably contains the function name in which this
macro is invoked */
#define AOCL_DTL_TRACE_EXIT(LogLevel) \
@@ -72,8 +72,8 @@
#endif
#if AOCL_DTL_DUMP_ENABLE
/* Macro to Dump the DATA The parameters Buffer contains the data to be
dumped BufferSize specifies the no. of bytes to be dumped DataType
/* Macro to Dump the DATA The parameters Buffer contains the data to be
dumped BufferSize specifies the no. of bytes to be dumped DataType
specifies the data type of Buffer */
#define AOCL_DTL_DUMP(LogLevel, Buffer, BufferSize, DataType, String, OutputType) \
/* Call the Dump function to Dump the DATA */ \
@@ -103,6 +103,19 @@
#define AOCL_DTL_LOG(LogLevel, Message)
#endif
#if AOCL_DTL_LOG_ENABLE
void AOCL_DTL_start_perf_timer(void);
uint64 AOCL_DTL_get_time_spent(void);
/* Macro to log the Data */
#define AOCL_DTL_START_PERF_TIMER() \
AOCL_DTL_start_perf_timer()
#else
/* Dummy macro definition if the AOCL_DTL_LOG_ENABLE macro is not enabled */
#define AOCL_DTL_START_PERF_TIMER()
#endif
/* Macro to initialize the prerequisite for debuging */
#ifdef AOCL_DTL_INITIALIZE_ENABLE
#define AOCL_DTL_INITIALIZE(CURRENT_LOG_LEVEL) \

File diff suppressed because it is too large Load Diff

View File

@@ -14,22 +14,29 @@
#include "blis.h"
#if AOCL_DTL_LOG_ENABLE
dim_t AOCL_get_requested_threads_count(void);
void AOCL_DTL_log_gemm_sizes(int8 loglevel,
char dt,
char dt_type,
const f77_char transa,
const f77_char transb,
const f77_int m,
const f77_int n,
const f77_int k,
const void* alpha,
const void *alpha,
const f77_int lda,
const f77_int ldb,
const void* beta,
const void *beta,
const f77_int ldc,
const char* filename,
const char* functionn_name,
const char *filename,
const char *function_name,
int line);
void AOCL_DTL_log_gemm_stats(int8 loglevel,
const f77_int m,
const f77_int n,
const f77_int k);
void AOCL_DTL_log_trsm_sizes(int8 loglevel,
char dt,
f77_char side,
@@ -376,9 +383,13 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
const char* function_name,
int line);
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc) \
AOCL_DTL_log_gemm_sizes(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc, __FILE__, __FUNCTION__, __LINE__);
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k) \
AOCL_DTL_log_gemm_stats(loglevel, m, n, k);
#define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb) \
AOCL_DTL_log_trsm_sizes(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb, __FILE__, __FUNCTION__, __LINE__);
@@ -487,6 +498,8 @@ void AOCL_DTL_log_trmm_sizes(int8 loglevel,
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc)
#define AOCL_DTL_LOG_GEMM_STATS(loglevel, m, n, k)
#define AOCL_DTL_LOG_TRSM_INPUTS(loglevel, dt, side, uploa, transa, diaga, m, n, alpha, lda, ldb)
#define AOCL_DTL_LOG_GEMMT_INPUTS(loglevel, dt, uplo, transa, transb, n, k, alpha, lda, ldb, beta, ldc)

View File

@@ -1,12 +1,12 @@
/*===================================================================
* File Name : aoclflist.c
*
* Description : Linked list of open files assocaited with
*
* Description : Linked list of open files assocaited with
* each thread. This is used to log the data
* to correct file as per the current thread id.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
*
*
*==================================================================*/
#include "aocltpdef.h"
@@ -16,7 +16,7 @@
#include "aoclos.h"
/* Disable instrumentation for following function, since they are called from
/* Disable instrumentation for following function, since they are called from
* Auto Generated execution trace handlers. */
Bool AOCL_FLIST_IsEmpty(
AOCL_FLIST_Node *plist) __attribute__((no_instrument_function));
@@ -45,6 +45,35 @@ Bool AOCL_FLIST_IsEmpty(AOCL_FLIST_Node *plist)
} /* AOCL_FLIST_IsEmpty */
/*
 * Find the list node whose thread id equals `tid`.
 *
 * plist - head of the per-thread file list
 * tid   - thread id to search for
 *
 * Returns the matching node, or NULL when the list is empty or no node
 * matches. A matching node is returned even if its file pointer is NULL;
 * in that case only a debug message is emitted.
 */
AOCL_FLIST_Node * AOCL_FLIST_GetNode(AOCL_FLIST_Node *plist, AOCL_TID tid)
{
    AOCL_FLIST_Node *node;

    if (AOCL_FLIST_IsEmpty(plist) == 1)
    {
        return NULL;
    }

    /* Walk every node until the thread id matches. */
    for (node = plist; node != NULL; node = node->pNext)
    {
        if (node->tid != tid)
        {
            continue;
        }

        if (node->fp == NULL)
        {
            AOCL_DEBUGPRINT("Could not get saved time stamp for thread = %d", tid);
        }

        return node;
    }

    return NULL;
} /* AOCL_FLIST_GetNode */
AOCL_FAL_FILE *AOCL_FLIST_GetFile(AOCL_FLIST_Node *plist, AOCL_TID tid)
{
AOCL_FLIST_Node *temp;
@@ -89,7 +118,7 @@ AOCL_FAL_FILE *AOCL_FLIST_AddFile(const int8 *pchFilePrefix, AOCL_FLIST_Node **p
}
/* We don't have exiting file, lets try to open new one */
sprintf(pchFileName, "P%d_T%d_%s", AOCL_getpid(), tid, pchFilePrefix);
sprintf(pchFileName, "P%d_T%u_%s", AOCL_getpid(), tid, pchFilePrefix);
file = AOCL_FAL_Open(pchFileName, "wb");
if (file == NULL)
@@ -108,6 +137,7 @@ AOCL_FAL_FILE *AOCL_FLIST_AddFile(const int8 *pchFilePrefix, AOCL_FLIST_Node **p
newNode->pNext = NULL;
newNode->tid = tid;
newNode->u64SavedTimeStamp = AOCL_getTimestamp();
newNode->fp = file;
if (AOCL_FLIST_IsEmpty(*plist) == 1)

View File

@@ -1,12 +1,12 @@
/*===================================================================
* File Name : aoclflist.h
*
* Description : Linked list of open files assocaited with
*
* Description : Linked list of open files assocaited with
* each thread. This is used to log the data
* to correct file as per the current thread id.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
*
*
*==================================================================*/
#ifndef _AOCL_FLIST_H_
@@ -19,12 +19,17 @@ typedef struct AOCL_FLIST_Node_t
{
AOCL_TID tid;
AOCL_FAL_FILE *fp;
uint64 u64SavedTimeStamp;
struct AOCL_FLIST_Node_t *pNext;
} AOCL_FLIST_Node;
Bool AOCL_FLIST_IsEmpty(
AOCL_FLIST_Node *plist);
AOCL_FLIST_Node * AOCL_FLIST_GetNode(
AOCL_FLIST_Node *plist,
AOCL_TID tid);
AOCL_FAL_FILE *AOCL_FLIST_GetFile(
AOCL_FLIST_Node *plist,
AOCL_TID tid);

View File

@@ -19,7 +19,7 @@
#include <omp.h>
#endif
// BLIS TODO: This is workaround to check if BLIS is built with
// BLIS TODO: This is workaround to check if BLIS is built with
// openmp support. Ideally we don't want any library
// specific code in dtl.
#include <blis.h>
@@ -36,19 +36,23 @@
*/
uint32 AOCL_gettid(void) __attribute__((no_instrument_function));
AOCL_TID AOCL_gettid(void) __attribute__((no_instrument_function));
pid_t AOCL_getpid(void) __attribute__((no_instrument_function));
uint64 AOCL_getTimestamp(void) __attribute__((no_instrument_function));
uint32 AOCL_gettid(void)
AOCL_TID AOCL_gettid(void)
{
#ifdef BLIS_ENABLE_OPENMP
return omp_get_thread_num();
#else
return 0; // will not work for pthread-based parallelization
#ifdef BLIS_ENABLE_PTHREADS
return pthread_self();
#else
return 0;
#endif
#endif
}
pid_t AOCL_getpid(void)
@@ -63,7 +67,7 @@ uint64 AOCL_getTimestamp(void)
/* The C11 way */
if (clock_gettime(CLOCK_REALTIME, &tms))
{
return -1;
return -1;
}
/* seconds, multiplied with 1 million */
@@ -73,13 +77,13 @@ uint64 AOCL_getTimestamp(void)
/* round up if necessary */
if (tms.tv_nsec % 1000 >= 500)
{
++micros;
++micros;
}
return micros;
}
#else /* Non linux support */
uint32 AOCL_gettid(void)
AOCL_TID AOCL_gettid(void)
{
/* stub for other os's */
return 0;

View File

@@ -1,11 +1,11 @@
/*===================================================================
* File Name : aocltpdef.h
*
*
* Description : Abstraction for various datatypes used by DTL.
*
* Copyright (C) 2020, Advanced Micro Devices, Inc
*
* Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
*
*==================================================================*/
#ifndef AOCL_TYPEDEF_H_
#define AOCL_TYPEDEF_H_

View File

@@ -57,15 +57,15 @@
#define AOCL_MATRIX_INITIALISATION
#define BUFFER_SIZE 256
/* For BLIS since logs are collected at BLAS interfaces
* we disable cblas interfaces for this benchmark application
*/
#ifdef BLIS_ENABLE_CBLAS
//#define CBLAS
#endif
#ifdef BLIS_ENABLE_CBLAS
//#define CBLAS
#endif
int main( int argc, char** argv )
{
@@ -110,26 +110,36 @@ int main( int argc, char** argv )
exit(1);
}
fprintf(fout, "Dt m\t n\t k\t lda\t ldb\t ldc\t rs_a rs_b rs_c transa transb \
alphaR\t alphaI\t betaR\t betaI\t gflops\n");
fprintf(fout, "Dt transa transb m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n");
// Following variables are needed for scanf to read inputs properly
// however they are not used in bench.
char api_name[BUFFER_SIZE]; // to store function name, line no present in logs
char dummy_buffer[BUFFER_SIZE];
// Variables extracted from the logs which are used by bench
char stor_scheme, transA_c, transB_c;
double alpha_r, beta_r, alpha_i, beta_i;
dim_t m_trans, n_trans;
char tmp[256]; // to store function name, line no present in logs.
dim_t rs_a, rs_b, rs_c;
dim_t cs_a, cs_b, cs_c;
inc_t lda, ldb, ldc;
stor_scheme = 'C'; // since logs are collected at BLAS APIs
stor_scheme = 'C'; // By default set it to Column Major
while (fscanf(fin, "%s %c %ld %ld %ld %ld %ld %ld %ld %ld %ld %c %c %lf %lf %lf %lf\n",
tmp, &dt_ch, &m, &n, &k, &cs_a, &cs_b, &cs_c, &rs_a, &rs_b, &rs_c,
&transA_c, &transB_c, &alpha_r, &alpha_i, &beta_r, &beta_i) == 17)
//{S, D, C, Z} transa, transb, m, n, k, alpha_real, alpha_imag, lda ldb
// beta_real, beta_imag, ldc,
//
// number of threads, execution time, gflops ---> ignored by bench
while (fscanf(fin, "%s %c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld[^\n]",
api_name, &dt_ch, &transA_c, &transB_c, &m, &n, &k, &alpha_r, &alpha_i,
&lda, &ldb, &beta_r, &beta_i, &ldc) == 14)
{
if(cs_a==1 && cs_b==1 && cs_c==1) stor_scheme = 'R';
if(rs_a==1 && rs_b==1 && rs_c==1) stor_scheme = 'C';
// Discard any extra data on current line in the input file.
fgets(dummy_buffer, BUFFER_SIZE, fin );
// At BLAS level only column major order is supported.
stor_scheme = 'C';
if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
else if (dt_ch == 'Z' || dt_ch == 'z') dt = BLIS_DCOMPLEX;
@@ -164,10 +174,7 @@ int main( int argc, char** argv )
if( (stor_scheme == 'C') || (stor_scheme == 'c') )
{
// Column storage
lda = cs_a; ldb = cs_b; ldc = cs_c;
// leading dimension should be greater than number of rows
// leading dimension should be greater than number of rows
// if ((m > lda) || (k > ldb) || (m > ldc)) continue;
// Since this bench app is run on logs generated by AOCL trace logs
// - we have relaxed the checks on the input parameters.
@@ -190,14 +197,12 @@ int main( int argc, char** argv )
}
else if( (stor_scheme == 'r') || (stor_scheme == 'R') )
{
// Row-major order
lda = rs_a; ldb = rs_b; ldc = rs_c;
//leading dimension should be greater than number of columns
//if ((k > lda) || (n > ldb) || (n > ldc)) continue;
// Since this bench app is run on logs generated by AOCL trace logs
// - we have relaxed the checks on the input parameters.
// if A is transpose - A(k x lda), lda >= max(1,m)
// if A is transpose - A(k x lda), lda >= max(1,m)
// if A is non-transpose - A (m x lda), lda >= max(1,k)
// if B is transpose - B (n x ldb), ldb >= max(1,k)
// if B is non-transpose - B (k x ldb ), ldb >= max(1,n)
@@ -228,7 +233,7 @@ int main( int argc, char** argv )
}
#endif
#endif
#ifdef AOCL_MATRIX_INITIALISATION
bli_randm( &a );
bli_randm( &b );
@@ -474,9 +479,8 @@ int main( int argc, char** argv )
(unsigned long)n,
(unsigned long)k, gflops);
fprintf (fout, "%c %ld\t %ld\t %ld\t %ld\t %ld\t %ld\t %ld %ld %ld %c %c %lf\t %lf\t %lf\t %lf\t %6.3f\n", \
dt_ch, m, n, k, lda, ldb, ldc, rs_a, rs_b, rs_c, \
transA_c, transB_c, alpha_r, alpha_i, beta_r, beta_i, gflops);
fprintf (fout, "%c %c %c %ld %ld %ld %lf %lf %ld %ld %lf %lf %ld %6.3f\n", \
dt_ch, transA_c, transB_c, m, n, k, alpha_r, alpha_i, lda, ldb, beta_r, beta_i, ldc, gflops);
fflush(fout);

View File

@@ -1,18 +1,32 @@
bli_gemm_ex:125: D 173 23 1 173 174 174 1 1 1 t n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 173 23 1 1 1 1 1 23 23 t n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 173 23 1 1 1 1 1 23 23 n t -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 83 23 1 83 84 84 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 41 2 1 41 42 42 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 77 8 1 77 78 78 1 1 1 n t -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 77 8 1 77 78 78 1 1 1 n n -2.000000 0.000000 3.000000 0.000000
bli_gemm_ex:125: D 41 5 1 41 42 42 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 41 5 1 41 42 42 1 1 1 t n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 65 8 1 65 66 66 1 1 1 n n -3.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 53 8 1 53 54 54 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 68 8 1 68 69 69 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 41 5 1 41 42 42 1 1 1 n t -1.000000 0.000000 2.000000 0.000000
bli_gemm_ex:125: D 41 5 1 41 42 42 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 53 5 1 53 54 54 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 95 14 1 95 96 96 1 1 1 t n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 110 17 1 1 1 1 1 17 17 n n -1.000000 0.000000 1.000000 0.000000
bli_gemm_ex:125: D 95 14 1 95 96 96 1 1 1 n n -1.000000 0.000000 1.000000 0.000000
dgemm_ D N N 1000 3000 2000 0.900000 0.000000 4000 5000 -1.100000 0.000000 6000 nt=4 1542.854 ms 7.778 GFLOPS
dgemm_ D N N 100 100 100 0.900000 0.000000 104 104 -1.100000 0.000000 104 nt=4 0.307 ms 6.515 GFLOPS
dgemm_ D N N 500 500 500 0.900000 0.000000 504 504 -1.100000 0.000000 504 nt=4 32.442 ms 7.706 GFLOPS
dgemm_ D N N 900 900 900 0.900000 0.000000 904 904 -1.100000 0.000000 904 nt=4 172.170 ms 8.468 GFLOPS
dgemm_ D N N 1300 1300 1300 0.900000 0.000000 1304 1304 -1.100000 0.000000 1304 nt=4 655.381 ms 6.704 GFLOPS
dgemm_ D N T 1700 1700 1700 0.900000 0.000000 1704 1704 -1.100000 0.000000 1704 nt=4 1302.928 ms 7.541 GFLOPS
dgemm_ D T N 2100 2100 2100 0.900000 0.000000 2104 2104 -1.100000 0.000000 2104 nt=4 3278.541 ms 5.649 GFLOPS
dgemm_ D T T 2500 2500 2500 0.900000 0.000000 2504 2504 -1.100000 0.000000 2504 nt=4 5292.842 ms 5.904 GFLOPS
zgemm_ Z N N 1000 3000 2000 0.900000 0.000000 4000 5000 -1.100000 0.000000 6000 nt=4 300.940 ms 159.500 GFLOPS
zgemm_ Z N N 100 100 100 0.900000 0.000000 104 104 -1.100000 0.000000 104 nt=4 0.748 ms 10.695 GFLOPS
zgemm_ Z N N 500 500 500 0.900000 0.000000 504 504 -1.100000 0.000000 504 nt=4 8.618 ms 116.036 GFLOPS
zgemm_ Z N N 900 900 900 0.900000 0.000000 904 904 -1.100000 0.000000 904 nt=4 42.717 ms 136.526 GFLOPS
zgemm_ Z N N 1300 1300 1300 0.900000 0.000000 1304 1304 -1.100000 0.000000 1304 nt=4 124.652 ms 141.001 GFLOPS
zgemm_ Z N T 1700 1700 1700 0.900000 0.000000 1704 1704 -1.100000 0.000000 1704 nt=4 277.029 ms 141.877 GFLOPS
zgemm_ Z T N 2100 2100 2100 0.900000 0.000000 2104 2104 -1.100000 0.000000 2104 nt=4 494.360 ms 149.866 GFLOPS
zgemm_ Z T T 2500 2500 2500 0.900000 0.000000 2504 2504 -1.100000 0.000000 2504 nt=4 803.699 ms 155.531 GFLOPS
cgemm_ C N N 1000 3000 2000 0.900000 0.000000 4000 5000 -1.100000 0.000000 6000 nt=4 135.321 ms 354.712 GFLOPS
cgemm_ C N N 100 100 100 0.900000 0.000000 104 104 -1.100000 0.000000 104 nt=4 0.429 ms 18.648 GFLOPS
cgemm_ C N N 500 500 500 0.900000 0.000000 504 504 -1.100000 0.000000 504 nt=4 5.045 ms 198.216 GFLOPS
cgemm_ C N N 900 900 900 0.900000 0.000000 904 904 -1.100000 0.000000 904 nt=4 20.003 ms 291.556 GFLOPS
cgemm_ C N N 1300 1300 1300 0.900000 0.000000 1304 1304 -1.100000 0.000000 1304 nt=4 56.253 ms 312.446 GFLOPS
cgemm_ C N T 1700 1700 1700 0.900000 0.000000 1704 1704 -1.100000 0.000000 1704 nt=4 116.948 ms 336.081 GFLOPS
cgemm_ C T N 2100 2100 2100 0.900000 0.000000 2104 2104 -1.100000 0.000000 2104 nt=4 207.581 ms 356.911 GFLOPS
cgemm_ C T T 2500 2500 2500 0.900000 0.000000 2504 2504 -1.100000 0.000000 2504 nt=4 346.031 ms 361.239 GFLOPS
sgemm_ S N N 1000 3000 2000 0.900000 0.000000 4000 5000 -1.100000 0.000000 6000 nt=4 1024.360 ms 11.715 GFLOPS
sgemm_ S N N 100 100 100 0.900000 0.000000 104 104 -1.100000 0.000000 104 nt=4 0.362 ms 5.525 GFLOPS
sgemm_ S N N 500 500 500 0.900000 0.000000 504 504 -1.100000 0.000000 504 nt=4 1.688 ms 148.104 GFLOPS
sgemm_ S N N 900 900 900 0.900000 0.000000 904 904 -1.100000 0.000000 904 nt=4 147.791 ms 9.865 GFLOPS
sgemm_ S N N 1300 1300 1300 0.900000 0.000000 1304 1304 -1.100000 0.000000 1304 nt=4 451.156 ms 9.739 GFLOPS
sgemm_ S N T 1700 1700 1700 0.900000 0.000000 1704 1704 -1.100000 0.000000 1704 nt=4 873.577 ms 11.248 GFLOPS
sgemm_ S T N 2100 2100 2100 0.900000 0.000000 2104 2104 -1.100000 0.000000 2104 nt=4 1699.278 ms 10.900 GFLOPS
sgemm_ S T T 2500 2500 2500 0.900000 0.000000 2504 2504 -1.100000 0.000000 2504 nt=4 2651.917 ms 11.784 GFLOPS

View File

@@ -1,21 +0,0 @@
Dt n incx incy gflops
isamax_:183: S 100 1 29 0.043
isamax_:183: S 200 1 65 0.065
isamax_:183: S 300 1 185 0.078
isamax_:183: S 400 1 86 0.261
isamax_:183: S 500 1 271 0.279
idamax_:183: D 100 1 64 0.099
idamax_:183: D 200 1 175 0.131
idamax_:183: D 300 1 102 0.148
idamax_:183: D 400 1 249 0.157
idamax_:183: D 500 1 197 0.165
icamax_:183: C 100 1 1 0.185
icamax_:183: C 200 1 108 0.242
icamax_:183: C 300 1 76 0.271
icamax_:183: C 400 1 178 0.283
icamax_:183: C 500 1 403 0.304
izamax_:183: Z 100 1 51 0.178
izamax_:183: Z 200 1 175 0.232
izamax_:183: Z 300 1 240 0.260
izamax_:183: Z 400 1 108 0.293
izamax_:183: Z 500 1 411 0.294

View File

@@ -46,7 +46,6 @@ err_t bli_gemmsup
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2);
// AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_2, alpha, a, b, beta, c);
// Return early if small matrix handling is disabled at configure-time.
#ifdef BLIS_DISABLE_SUP_HANDLING

View File

@@ -46,7 +46,6 @@ err_t bli_gemmsup_ref
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
// AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_3, alpha, a, b, beta, c);
// This function implements the default gemmsup handler. If you are a
// BLIS developer and wish to use a different gemmsup handler, please
// register a different function pointer in the context in your

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -48,7 +48,6 @@ void bli_gemm_front
)
{
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
// AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_3, alpha, a, b, beta, c);
bli_init_once();
obj_t a_local;

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -54,8 +54,7 @@ void bli_gemm_int
gemm_var_oft f;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
// AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_4, alpha, a, b, beta, c);
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

View File

@@ -65,9 +65,12 @@ void PASTEF77(ch,blasname) \
inc_t rs_b, cs_b; \
inc_t rs_c, cs_c; \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
/* Initialize BLIS. */ \
bli_init_auto(); \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
@@ -118,6 +121,7 @@ void PASTEF77(ch,blasname) \
NULL \
); \
\
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
@@ -142,18 +146,20 @@ void PASTEF77(ch,blasname) \
ftype* c, const f77_int* ldc \
) \
{ \
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, (void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
\
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, n0, k0; \
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \
\
dim_t m0_a, n0_a; \
dim_t m0_b, n0_b; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(ch), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
@@ -217,6 +223,7 @@ void PASTEF77(ch,blasname) \
NULL \
); \
} \
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
return; \
} \
else if( m0 == 1 ) \
@@ -249,6 +256,7 @@ void PASTEF77(ch,blasname) \
NULL \
); \
} \
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
return; \
} \
\
@@ -284,7 +292,8 @@ void PASTEF77(ch,blasname) \
NULL \
); \
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
@@ -306,15 +315,19 @@ void dgemm_
double* c, const f77_int* ldc
)
{
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *transa, *transb, *m, *n, *k, (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO)
/* Initialize BLIS. */
bli_init_auto();
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
@@ -358,7 +371,8 @@ void dgemm_
(double*)beta,
c, *ldc
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS */
bli_finalize_auto();
@@ -395,6 +409,9 @@ void dgemm_
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
return;
}
else if (m0 == 1)
@@ -427,6 +444,7 @@ void dgemm_
((void*)0)
);
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
return;
}
@@ -478,8 +496,9 @@ void dgemm_
NULL,
NULL
);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
@@ -519,7 +538,8 @@ void dgemm_
if (status == BLIS_SUCCESS)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
@@ -532,7 +552,8 @@ void dgemm_
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if (status == BLIS_SUCCESS)
{
return;
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
return;
}
// fall back on native path when dgemm is not handled in sup path.
@@ -550,7 +571,8 @@ void dgemm_
/* NULL */
/* ); */
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
} // end of dgemm_
@@ -569,15 +591,16 @@ void zgemm_
dcomplex* c, const f77_int* ldc
)
{
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *transb, *m, *n, *k, (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO)
/* Initialize BLIS. */
bli_init_auto();
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
@@ -655,11 +678,12 @@ void zgemm_
NULL
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
bli_finalize_auto();
return;
}
// The code below will be called when number of threads = 1.
#if ENABLE_INDUCED_METHOD
@@ -686,7 +710,8 @@ void zgemm_
//sqp algo is found better for n > 40
if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
}
@@ -699,17 +724,20 @@ void zgemm_
err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if(status==BLIS_SUCCESS)
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
}
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
}// end of zgemm_
@@ -738,15 +766,16 @@ void dzgemm_
)
{
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', *transa, *transb, *m, *n, *k, (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
trans_t blis_transa;
trans_t blis_transb;
dim_t m0, n0, k0;
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO)
/* Initialize BLIS. */
bli_init_auto();
/* Initialize BLIS. */
bli_init_auto();
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
(void*)alpha, *lda, *ldb, (void*)beta, *ldc);
/* Perform BLAS parameter checking. */
PASTEBLACHK(gemm)
@@ -808,7 +837,8 @@ void dzgemm_
// fall back on native path when zgemm is not handled in sup path.
bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
bli_finalize_auto();
}// end of dzgemm_