DTL Log update

* DTL Log update
Updated the logs with the requested thread count (nt) and the thread count selected by AOCL Dynamic (dynamic_nt) for axpy, scal and dgemv.
Modified bench_gemv.c to be able to process the modified DTL logs.

* Updated DTL log for copy routine with actual nt and dynamic nt

* Refactor OpenMP pragmas and clean up code

Removed unnecessary nested OpenMP pragma and cleaned up function end comment.

* Fixed DTL log for sequential build

* Added thread logging in bla_gemv_check for invalid inputs

---------

Co-authored-by: Smyth, Edward <Edward.Smyth@amd.com>
This commit is contained in:
Varaganti, Kiran
2025-09-22 11:32:00 +05:30
committed by GitHub
parent 03685d1ad9
commit 807de2a990
13 changed files with 1541 additions and 1442 deletions

View File

@@ -82,6 +82,17 @@ void DTL_get_complex_parts(char dt_type,
}
}
/*
 * Emits the thread information for the current call as raw trace text.
 *
 * loglevel    - trace level forwarded unchanged to DTL_Trace().
 * num_threads - thread count reported under the "dynamic_nt" label.
 *
 * The value reported under the "nt" label is read from
 * AOCL_get_requested_threads_count(). The text carries no trailing newline
 * and is passed to DTL_Trace() with TRACE_TYPE_RAW and no file/function/line
 * information.
 */
void AOCL_DTL_log_num_threads(int8 loglevel,
                              dim_t num_threads
                             )
{
    char buffer[256];

    // Bounded formatting: the output can never run past the local buffer.
    // The explicit casts keep the arguments in agreement with the conversion
    // specifiers regardless of how wide dim_t is in this build.
    snprintf(buffer, sizeof(buffer), " nt=%lld dynamic_nt=%lld",
             (long long)AOCL_get_requested_threads_count(),
             (long long)num_threads);

    DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
}
// Level-3 Logging
void AOCL_DTL_log_gemm_sizes(int8 loglevel,
@@ -706,9 +717,18 @@ void AOCL_DTL_log_gemv_sizes(int8 loglevel,
DTL_get_complex_parts(dt_type, beta, &beta_real, &beta_imag);
// {S, D,C, Z} { transa, m, n, alpha, lda, incx, beta, incy}
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld\n", tolower(dt_type),
if (dt_type == 'd' || dt_type == 'D' )
{
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld", tolower(dt_type),
transa, (dim_t)m, (dim_t)n, alpha_real, alpha_imag,
(dim_t)lda, (dim_t)incx, beta_real, beta_imag, (dim_t)incy);
}
else
{
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld\n", tolower(dt_type),
transa, (dim_t)m, (dim_t)n, alpha_real, alpha_imag,
(dim_t)lda, (dim_t)incx, beta_real, beta_imag, (dim_t)incy);
}
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
}
@@ -1499,7 +1519,7 @@ void AOCL_DTL_log_axpy_sizes(int8 loglevel,
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
// {S, D, C, Z} {n, alpha_real, alpha_imag, incx, incy}
sprintf(buffer, "%c %ld %lf %lf %ld %ld\n", tolower(dt_type),
sprintf(buffer, "%c %ld %lf %lf %ld %ld", tolower(dt_type),
(dim_t)n, alpha_real, alpha_imag, (dim_t)incx, (dim_t)incy);
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
@@ -1516,7 +1536,7 @@ void AOCL_DTL_log_copy_sizes(int8 loglevel,
{
char buffer[256];
// {S, D, C, Z} {n, incx, incy}
sprintf(buffer, "%c %ld %ld %ld\n", tolower(dt_type), (dim_t)n, (dim_t)incx, (dim_t)incy);
sprintf(buffer, "%c %ld %ld %ld", tolower(dt_type), (dim_t)n, (dim_t)incx, (dim_t)incy);
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
}
@@ -1599,8 +1619,8 @@ void AOCL_DTL_log_scal_sizes(int8 loglevel,
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
// {S, D, C, Z} { alpha, n, incx}
sprintf(buffer, "%c %lf %lf %ld %ld\n", tolower(dt_type),
alpha_real, alpha_imag, (dim_t)n, (dim_t)incx);
sprintf(buffer, "%c %lf %lf %ld %ld", tolower(dt_type),
alpha_real, alpha_imag, (dim_t)n, (dim_t)incx);
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
}

View File

@@ -46,6 +46,10 @@
#if AOCL_DTL_LOG_ENABLE
dim_t AOCL_get_requested_threads_count(void);
// Logs the thread information for the current call as raw trace text:
// the value of AOCL_get_requested_threads_count() under the "nt" label and
// num_threads under the "dynamic_nt" label.
void AOCL_DTL_log_num_threads(int8 loglevel,
dim_t num_threads
);
// Level-3 Logging
void AOCL_DTL_log_gemm_sizes(int8 loglevel,
@@ -760,6 +764,11 @@ void AOCL_DTL_log_matcopy2_sizes(int8 loglevel,
const char* function_name,
int line);
// Logs the thread count via AOCL_DTL_log_num_threads() when logging is
// enabled (tlIsLoggingEnabled is non-zero); otherwise does nothing.
// The expansion is wrapped in braces so that the internal `if` can never
// capture an `else` that follows the macro at the call site. The macro can
// be written with or without a trailing semicolon.
#define AOCL_DTL_LOG_NUM_THREADS(loglevel, num_threads) \
    { if (tlIsLoggingEnabled) \
        AOCL_DTL_log_num_threads(loglevel, num_threads); }
// Level-3 Macros
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc) \
@@ -1059,6 +1068,8 @@ void AOCL_DTL_log_matcopy2_sizes(int8 loglevel,
#else // AOCL_DTL_LOG_ENABLE
#define AOCL_DTL_LOG_NUM_THREADS(loglevel, num_threads)
// Level-3 Macros
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc)

View File

@@ -48,6 +48,7 @@
#define AOCL_MATRIX_INITIALISATION
#define BUFFER_SIZE 256
//#define BLIS_ENABLE_CBLAS
@@ -100,6 +101,11 @@ int main( int argc, char** argv )
exit(1);
}
if (argc > 3)
{
n_repeats = atoi(argv[3]);
}
fprintf(fout, "Func Dt transa m n alphaR alphaI lda incx betaR betaI incy gflops\n");
char transA;
@@ -109,14 +115,21 @@ int main( int argc, char** argv )
inc_t lda;
inc_t incx;
inc_t incy;
char tmp[256]; // to store function name, line no present in logs.
//char tmp[256]; // to store function name, line no present in logs.
// Following variables are needed for scanf to read inputs properly
// however they are not used in bench.
char api_name[BUFFER_SIZE]; // to store function name, line no present in logs
char dummy_buffer[BUFFER_SIZE];
// {S,D,C,Z} {transa m n alpha lda, incx, beta, incy}
while (fscanf(fin, "%s %c %c " INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS "\n",
tmp, &dt_ch, &transA, &m, &n, &alpha_r, &alpha_i, &lda,\
while (fscanf(fin, "%s %c %c " INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]",
api_name, &dt_ch, &transA, &m, &n, &alpha_r, &alpha_i, &lda,\
&incx, &beta_r, &beta_i, &incy) == 12)
{
// Discard any extra data on current line in the input file.
fgets(dummy_buffer, BUFFER_SIZE, fin );
if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
else if (dt_ch == 'Z' || dt_ch == 'z') dt = BLIS_DCOMPLEX;
else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT;
@@ -398,7 +411,7 @@ int main( int argc, char** argv )
gflops);
fprintf (fout, "%s %c %c" INT_FS INT_FS " %lf %lf" INT_FS INT_FS " %lf %lf " INT_FS " %6.3f\n",
tmp, dt_ch, transA, m, n, alpha_r, alpha_i, lda,\
api_name, dt_ch, transA, m, n, alpha_r, alpha_i, lda,\
incx, beta_r, beta_i, incy, gflops);
fflush(fout);

View File

@@ -1,3 +1,4 @@
dgemv_blis_impl d N 8 5 0.000000 0.000000 9 1 0.000000 0.000000 2 nt=192 dynamic_nt=1
dgemv_:173: D N 1 14 -1.000000 0.000000 10000 1 1.000000 0.000000 1
dgemv_:173: D N 1 14 -1.000000 0.000000 30000 1 1.000000 0.000000 1
dgemv_:173: D N 1 2 -1.000000 0.000000 100 1 1.000000 0.000000 1

View File

@@ -247,6 +247,7 @@ void bli_dgemv_unf_var1
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
@@ -333,6 +334,7 @@ void bli_dgemv_unf_var1
gemv_kr_ptr = bli_dgemv_t_zen_int; // DGEMV
scalv_kr_ptr = bli_dscalv_zen_int; // DSCALV
copyv_kr_ptr = bli_dcopyv_zen_int; // DCOPYV
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
fast_path_thresh = 13000;
#endif
@@ -383,10 +385,11 @@ void bli_dgemv_unf_var1
);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
// If alpha is zero, the GEMV operation is reduced to y := beta * y, thus,
// y is only scaled by beta and returned.
if( bli_deq0( *alpha ) )
@@ -401,6 +404,7 @@ void bli_dgemv_unf_var1
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
@@ -491,6 +495,8 @@ void bli_dgemv_unf_var1
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
#if defined(BLIS_ENABLE_OPENMP)
}
else
@@ -568,6 +574,7 @@ void bli_dgemv_unf_var1
cntx
);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
}
#endif
}
@@ -587,6 +594,7 @@ void bli_dgemv_unf_var1
y, incy,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
}
// If x was packed into x_temp, free the memory.

View File

@@ -342,7 +342,7 @@ void bli_dgemv_unf_var2 (
y, incy,
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
@@ -477,7 +477,8 @@ void bli_dgemv_unf_var2 (
if( bli_deq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
@@ -521,7 +522,7 @@ void bli_dgemv_unf_var2 (
// Return the buffer to pool
bli_pba_release(&rntm , &mem_bufY);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}

File diff suppressed because it is too large Load Diff

View File

@@ -74,6 +74,7 @@ void PASTEF77S(ch,blasname) \
*/ \
if ((*n) <= 0 || PASTEMAC(ch, eq0)(*alpha)) \
{ \
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
return; \
} \
@@ -104,6 +105,7 @@ void PASTEF77S(ch,blasname) \
NULL \
); \
\
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
@@ -144,6 +146,7 @@ void saxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(s, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
@@ -276,6 +279,7 @@ void daxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(d, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
@@ -378,6 +382,13 @@ void daxpy_blis_impl
axpyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_AXPYV_KER, cntx);
}
/*
Initializing the number of thread to one
to avoid compiler warnings
*/
dim_t nt = 1;
#ifdef BLIS_ENABLE_OPENMP
#ifdef AOCL_DYNAMIC
/* Invoking the fast-path, if the size is ideal for such execution */
@@ -392,16 +403,11 @@ void daxpy_blis_impl
y0, incy0,
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
#endif
/*
Initializing the number of thread to one
to avoid compiler warnings
*/
dim_t nt = 1;
/*
For the given problem size and architecture, the function
@@ -431,6 +437,7 @@ void daxpy_blis_impl
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
@@ -487,6 +494,7 @@ void daxpy_blis_impl
}
#endif // BLIS_ENABLE_OPENMP
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -527,6 +535,7 @@ void caxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(c, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
@@ -606,6 +615,7 @@ void caxpy_blis_impl
);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -646,6 +656,7 @@ void zaxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(z, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
@@ -729,13 +740,13 @@ void zaxpy_blis_impl
axpyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx);
}
#ifdef BLIS_ENABLE_OPENMP
/*
Initializing the number of thread to one
to avoid compiler warnings
*/
/*
Initializing the number of thread to one
to avoid compiler warnings
*/
dim_t nt = 1;
#ifdef BLIS_ENABLE_OPENMP
/*
For the given problem size and architecture, the function
returns the optimum number of threads with AOCL dynamic enabled
@@ -764,6 +775,7 @@ void zaxpy_blis_impl
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
@@ -776,8 +788,8 @@ void zaxpy_blis_impl
thrinfo_t thread;
/* The factor by which the size should be a multiple during thread partition.
The main loop of the kernel can handle 32 elements at a time hence 32 is
selected for block_size. */
The main loop of the kernel can handle 32 elements at a time hence 32 is
selected for block_size. */
dim_t block_size = 32;
// Get the thread ID
@@ -819,7 +831,7 @@ void zaxpy_blis_impl
);
}
#endif // BLIS_ENABLE_OPENMP
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here

View File

@@ -81,7 +81,8 @@ void PASTEF77S(ch,blasname) \
); \
\
\
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
@@ -203,10 +204,12 @@ void scopy_blis_impl
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
}
#ifdef BLIS_ENABLE_BLAS
void scopy_
(
@@ -336,6 +339,13 @@ void dcopy_blis_impl
copyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
}
/*
Initializing the number of thread to one
to avoid compiler warnings
*/
dim_t nt = 1;
#ifdef BLIS_ENABLE_OPENMP
#ifdef AOCL_DYNAMIC
@@ -351,19 +361,13 @@ void dcopy_blis_impl
y0, incy0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
return;
}
#endif
/*
Initializing the number of thread to one
to avoid compiler warnings
*/
dim_t nt = 1;
/*
For the given problem size and architecture, the function
returns the optimum number of threads with AOCL dynamic enabled
@@ -394,7 +398,7 @@ void dcopy_blis_impl
y0, incy0,
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
@@ -450,13 +454,13 @@ void dcopy_blis_impl
}
#endif // BLIS_ENABLE_OPENMP
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
}
#ifdef BLIS_ENABLE_BLAS
} // end of function
#ifdef BLIS_ENABLE_BLAS
void dcopy_
(
const f77_int* n,
@@ -565,13 +569,13 @@ void zcopy_blis_impl
copyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_COPYV_KER, cntx);
}
#ifdef BLIS_ENABLE_OPENMP
/*
Initializing the number of thread to one
to avoid compiler warnings
*/
dim_t nt = 1;
#ifdef BLIS_ENABLE_OPENMP
/*
For the given problem size and architecture, the function
returns the optimum number of threads with AOCL dynamic enabled
@@ -603,6 +607,7 @@ void zcopy_blis_impl
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
@@ -611,46 +616,47 @@ void zcopy_blis_impl
_Pragma("omp parallel num_threads(nt)")
{
dim_t start, length;
dim_t start, length;
// Get the thread ID
dim_t thread_id = omp_get_thread_num();
// Get the thread ID
dim_t thread_id = omp_get_thread_num();
// Get the actual number of threads spawned
dim_t nt_use = omp_get_num_threads();
/*
Calculate the compute range for the current thread
based on the actual number of threads spawned
*/
bli_thread_vector_partition
(
n0,
nt_use,
&start, &length,
thread_id
);
// Get the actual number of threads spawned
dim_t nt_use = omp_get_num_threads();
/*
Calculate the compute range for the current thread
based on the actual number of threads spawned
*/
bli_thread_vector_partition
(
n0,
nt_use,
&start, &length,
thread_id
);
// Adjust the local pointer for computation
dcomplex *x_thread_local = x0 + (start * incx0);
dcomplex *y_thread_local = y0 + (start * incy0);
// Adjust the local pointer for computation
dcomplex *x_thread_local = x0 + (start * incx0);
dcomplex *y_thread_local = y0 + (start * incy0);
// Invoke the function based on the kernel function pointer
copyv_ker_ptr
(
BLIS_NO_CONJUGATE,
length,
x_thread_local, incx0,
y_thread_local, incy0,
cntx
);
// Invoke the function based on the kernel function pointer
copyv_ker_ptr
(
BLIS_NO_CONJUGATE,
length,
x_thread_local, incx0,
y_thread_local, incy0,
cntx
);
}
#endif
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
}
#ifdef BLIS_ENABLE_BLAS
void zcopy_
(

View File

@@ -216,6 +216,7 @@ void dgemv_blis_impl
if ( *m == 0 || *n == 0 || \
( PASTEMAC(d,eq0)( *alpha ) && PASTEMAC(d,eq1)( *beta ) ) )
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -257,6 +258,7 @@ void dgemv_blis_impl
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -310,7 +312,7 @@ void dgemv_blis_impl
NULL,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -338,7 +340,7 @@ void dgemv_blis_impl
y0, incy0,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here

View File

@@ -80,6 +80,7 @@ void PASTEF772S(chx,cha,blasname) \
\
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
{ \
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
@@ -105,6 +106,7 @@ void PASTEF772S(chx,cha,blasname) \
NULL \
); \
\
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
@@ -145,7 +147,8 @@ void sscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
return;
@@ -195,6 +198,7 @@ void sscal_blis_impl
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -234,7 +238,8 @@ void dscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
return;
@@ -292,7 +297,6 @@ void dscal_blis_impl
// Query the function pointer using the context
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
}
#ifdef BLIS_ENABLE_OPENMP
@@ -316,7 +320,8 @@ void dscal_blis_impl
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
return;
@@ -391,8 +396,8 @@ void dscal_blis_impl
cntx
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
#endif
@@ -432,7 +437,8 @@ void zdscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
return;
@@ -502,8 +508,8 @@ void zdscal_blis_impl
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
return;
@@ -568,7 +574,8 @@ void zdscal_blis_impl
);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
#endif
@@ -609,7 +616,8 @@ void cscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
return;
@@ -661,11 +669,12 @@ void cscal_blis_impl
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1)
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
}
#ifdef BLIS_ENABLE_BLAS
void cscal_
(
@@ -702,7 +711,8 @@ void zscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha))
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
return;
@@ -754,8 +764,8 @@ void zscal_blis_impl
x0, incx0,
cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1)
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
}

View File

@@ -65,6 +65,7 @@
\
PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
\
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
\
/* Finalize BLIS. */ \

View File

@@ -1179,6 +1179,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Mdiv
y, incy,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return;
}
@@ -1224,7 +1225,8 @@ void bli_dgemv_m_zen4_int_40x8_mt_Mdiv
cntx
);
}
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
} // end of function
/*
* Multi-threaded GEMV M-kernel with division along N dimension
@@ -1290,6 +1292,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
y, incy,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return;
}
@@ -1342,6 +1345,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return;
}
@@ -1363,6 +1367,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
(
transa, conjx, m, n, alpha, a, rs_a, cs_a, x, incx, beta, y, incy, cntx
);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return;
}
@@ -1428,6 +1433,8 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
{
bli_pba_release(&rntm, &local_mem_buf);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
}
#endif
@@ -1449,18 +1456,18 @@ void bli_dgemv_n_zen4_int (
cntx_t* cntx
)
{
void (*ker_ft) ( trans_t,
conj_t,
dim_t,
dim_t,
double*,
void (*ker_ft) ( trans_t,
conj_t,
dim_t,
dim_t,
double*,
double*,
inc_t,
inc_t,
double*,
inc_t,
double*,
double*,
inc_t,
inc_t,
double*,
inc_t,
double*,
double*,
inc_t, cntx_t* ) = NULL;
// If AOCL_DYNAMIC is enabled, call ST kernels for small sizes.
@@ -1479,6 +1486,7 @@ void bli_dgemv_n_zen4_int (
{
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
}
else
#endif
@@ -1502,6 +1510,7 @@ void bli_dgemv_n_zen4_int (
{
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
#endif
}
@@ -1510,7 +1519,12 @@ void bli_dgemv_n_zen4_int (
if ( incy != 1 || transa != BLIS_NO_TRANSPOSE)
{
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
// AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
// NOTE: The logging call above is intentionally disabled because it can
// result in the thread count being logged twice for a single call.
// TODO: Fix the duplicate logging and re-enable the call.
}
// Call the function pointer
ker_ft
(
transa,