mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
DTL Log update
* DTL Log update: Updates logs with nt and AOCL Dynamic selected nt for axpy, scal and dgemv. Modified bench_gemv.c to be able to process the modified DTL logs. * Updated DTL log for copy routine with actual nt and dynamic nt. * Refactor OpenMP pragmas and clean up code: Removed unnecessary nested OpenMP pragma and cleaned up function end comment. * Fixed DTL log for sequential build. * Added thread logging in bla_gemv_check for invalid inputs. --------- Co-authored-by: Smyth, Edward <Edward.Smyth@amd.com>
This commit is contained in:
@@ -82,6 +82,17 @@ void DTL_get_complex_parts(char dt_type,
|
||||
}
|
||||
}
|
||||
|
||||
void AOCL_DTL_log_num_threads(int8 loglevel,
|
||||
dim_t num_threads
|
||||
)
|
||||
{
|
||||
char buffer[256];
|
||||
|
||||
sprintf(buffer, " nt=%ld dynamic_nt=%ld", AOCL_get_requested_threads_count(), num_threads);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_RAW, NULL, NULL, 0, buffer);
|
||||
}
|
||||
|
||||
// Level-3 Logging
|
||||
|
||||
void AOCL_DTL_log_gemm_sizes(int8 loglevel,
|
||||
@@ -706,9 +717,18 @@ void AOCL_DTL_log_gemv_sizes(int8 loglevel,
|
||||
DTL_get_complex_parts(dt_type, beta, &beta_real, &beta_imag);
|
||||
|
||||
// {S, D,C, Z} { transa, m, n, alpha, lda, incx, beta, incy}
|
||||
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld\n", tolower(dt_type),
|
||||
if (dt_type == 'd' || dt_type == 'D' )
|
||||
{
|
||||
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld", tolower(dt_type),
|
||||
transa, (dim_t)m, (dim_t)n, alpha_real, alpha_imag,
|
||||
(dim_t)lda, (dim_t)incx, beta_real, beta_imag, (dim_t)incy);
|
||||
}
|
||||
else
|
||||
{
|
||||
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld\n", tolower(dt_type),
|
||||
transa, (dim_t)m, (dim_t)n, alpha_real, alpha_imag,
|
||||
(dim_t)lda, (dim_t)incx, beta_real, beta_imag, (dim_t)incy);
|
||||
}
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
|
||||
}
|
||||
@@ -1499,7 +1519,7 @@ void AOCL_DTL_log_axpy_sizes(int8 loglevel,
|
||||
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
|
||||
|
||||
// {S, D, C, Z} {n, alpha_real, alpha_imag, incx, incy}
|
||||
sprintf(buffer, "%c %ld %lf %lf %ld %ld\n", tolower(dt_type),
|
||||
sprintf(buffer, "%c %ld %lf %lf %ld %ld", tolower(dt_type),
|
||||
(dim_t)n, alpha_real, alpha_imag, (dim_t)incx, (dim_t)incy);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
|
||||
@@ -1516,7 +1536,7 @@ void AOCL_DTL_log_copy_sizes(int8 loglevel,
|
||||
{
|
||||
char buffer[256];
|
||||
// {S, D, C, Z} {n, incx, incy}
|
||||
sprintf(buffer, "%c %ld %ld %ld\n", tolower(dt_type), (dim_t)n, (dim_t)incx, (dim_t)incy);
|
||||
sprintf(buffer, "%c %ld %ld %ld", tolower(dt_type), (dim_t)n, (dim_t)incx, (dim_t)incy);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
|
||||
}
|
||||
@@ -1599,8 +1619,8 @@ void AOCL_DTL_log_scal_sizes(int8 loglevel,
|
||||
DTL_get_complex_parts(dt_type, alpha, &alpha_real, &alpha_imag);
|
||||
|
||||
// {S, D, C, Z} { alpha, n, incx}
|
||||
sprintf(buffer, "%c %lf %lf %ld %ld\n", tolower(dt_type),
|
||||
alpha_real, alpha_imag, (dim_t)n, (dim_t)incx);
|
||||
sprintf(buffer, "%c %lf %lf %ld %ld", tolower(dt_type),
|
||||
alpha_real, alpha_imag, (dim_t)n, (dim_t)incx);
|
||||
|
||||
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
|
||||
}
|
||||
|
||||
@@ -46,6 +46,10 @@
|
||||
#if AOCL_DTL_LOG_ENABLE
|
||||
dim_t AOCL_get_requested_threads_count(void);
|
||||
|
||||
void AOCL_DTL_log_num_threads(int8 loglevel,
|
||||
dim_t num_threads
|
||||
);
|
||||
|
||||
// Level-3 Logging
|
||||
|
||||
void AOCL_DTL_log_gemm_sizes(int8 loglevel,
|
||||
@@ -760,6 +764,11 @@ void AOCL_DTL_log_matcopy2_sizes(int8 loglevel,
|
||||
const char* function_name,
|
||||
int line);
|
||||
|
||||
|
||||
/*
 * Calls AOCL_DTL_log_num_threads(loglevel, num_threads) when
 * tlIsLoggingEnabled is true; otherwise does nothing.
 *
 * The expansion is wrapped in do { } while (0) so that the internal `if`
 * cannot capture an `else` that follows the macro at a call site.
 * The trailing semicolon is kept on purpose: existing call sites invoke
 * this macro both with and without their own terminating semicolon.
 */
#define AOCL_DTL_LOG_NUM_THREADS(loglevel, num_threads)          \
    do {                                                         \
        if (tlIsLoggingEnabled)                                  \
            AOCL_DTL_log_num_threads(loglevel, num_threads);     \
    } while (0);
|
||||
|
||||
// Level-3 Macros
|
||||
|
||||
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc) \
|
||||
@@ -1059,6 +1068,8 @@ void AOCL_DTL_log_matcopy2_sizes(int8 loglevel,
|
||||
|
||||
#else // AOCL_DTL_LOG_ENABLE
|
||||
|
||||
/* Logging disabled (AOCL_DTL_LOG_ENABLE is off): the thread-count logging
   macro expands to nothing, so its arguments are not evaluated. */
#define AOCL_DTL_LOG_NUM_THREADS(loglevel, num_threads)
|
||||
|
||||
// Level-3 Macros
|
||||
|
||||
#define AOCL_DTL_LOG_GEMM_INPUTS(loglevel, dt, transa, transb, m, n, k, alpha, lda, ldb, beta, ldc)
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
|
||||
|
||||
#define AOCL_MATRIX_INITIALISATION
|
||||
#define BUFFER_SIZE 256
|
||||
|
||||
//#define BLIS_ENABLE_CBLAS
|
||||
|
||||
@@ -100,6 +101,11 @@ int main( int argc, char** argv )
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (argc > 3)
|
||||
{
|
||||
n_repeats = atoi(argv[3]);
|
||||
}
|
||||
|
||||
fprintf(fout, "Func Dt transa m n alphaR alphaI lda incx betaR betaI incy gflops\n");
|
||||
|
||||
char transA;
|
||||
@@ -109,14 +115,21 @@ int main( int argc, char** argv )
|
||||
inc_t lda;
|
||||
inc_t incx;
|
||||
inc_t incy;
|
||||
char tmp[256]; // to store function name, line no present in logs.
|
||||
//char tmp[256]; // to store function name, line no present in logs.
|
||||
// Following variables are needed for scanf to read inputs properly
|
||||
// however they are not used in bench.
|
||||
char api_name[BUFFER_SIZE]; // to store function name, line no present in logs
|
||||
char dummy_buffer[BUFFER_SIZE];
|
||||
|
||||
|
||||
// {S,D,C,Z} {transa m n alpha lda, incx, beta, incy}
|
||||
while (fscanf(fin, "%s %c %c " INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS "\n",
|
||||
tmp, &dt_ch, &transA, &m, &n, &alpha_r, &alpha_i, &lda,\
|
||||
while (fscanf(fin, "%s %c %c " INT_FS INT_FS " %lf %lf " INT_FS INT_FS " %lf %lf " INT_FS"[^\n]",
|
||||
api_name, &dt_ch, &transA, &m, &n, &alpha_r, &alpha_i, &lda,\
|
||||
&incx, &beta_r, &beta_i, &incy) == 12)
|
||||
{
|
||||
// Discard any extra data on current line in the input file.
|
||||
fgets(dummy_buffer, BUFFER_SIZE, fin );
|
||||
|
||||
if (dt_ch == 'D' || dt_ch == 'd') dt = BLIS_DOUBLE;
|
||||
else if (dt_ch == 'Z' || dt_ch == 'z') dt = BLIS_DCOMPLEX;
|
||||
else if (dt_ch == 'S' || dt_ch == 's') dt = BLIS_FLOAT;
|
||||
@@ -398,7 +411,7 @@ int main( int argc, char** argv )
|
||||
gflops);
|
||||
|
||||
fprintf (fout, "%s %c %c" INT_FS INT_FS " %lf %lf" INT_FS INT_FS " %lf %lf " INT_FS " %6.3f\n",
|
||||
tmp, dt_ch, transA, m, n, alpha_r, alpha_i, lda,\
|
||||
api_name, dt_ch, transA, m, n, alpha_r, alpha_i, lda,\
|
||||
incx, beta_r, beta_i, incy, gflops);
|
||||
|
||||
fflush(fout);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
dgemv_blis_impl d N 8 5 0.000000 0.000000 9 1 0.000000 0.000000 2 nt=192 dynamic_nt=1
|
||||
dgemv_:173: D N 1 14 -1.000000 0.000000 10000 1 1.000000 0.000000 1
|
||||
dgemv_:173: D N 1 14 -1.000000 0.000000 30000 1 1.000000 0.000000 1
|
||||
dgemv_:173: D N 1 2 -1.000000 0.000000 100 1 1.000000 0.000000 1
|
||||
|
||||
@@ -247,6 +247,7 @@ void bli_dgemv_unf_var1
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
@@ -333,6 +334,7 @@ void bli_dgemv_unf_var1
|
||||
gemv_kr_ptr = bli_dgemv_t_zen_int; // DGEMV
|
||||
scalv_kr_ptr = bli_dscalv_zen_int; // DSCALV
|
||||
copyv_kr_ptr = bli_dcopyv_zen_int; // DCOPYV
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP) && defined(AOCL_DYNAMIC)
|
||||
fast_path_thresh = 13000;
|
||||
#endif
|
||||
@@ -383,10 +385,11 @@ void bli_dgemv_unf_var1
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// If alpha is zero, the GEMV operation is reduced to y := beta * y, thus,
|
||||
// y is only scaled by beta and returned.
|
||||
if( bli_deq0( *alpha ) )
|
||||
@@ -401,6 +404,7 @@ void bli_dgemv_unf_var1
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
@@ -491,6 +495,8 @@ void bli_dgemv_unf_var1
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
|
||||
#if defined(BLIS_ENABLE_OPENMP)
|
||||
}
|
||||
else
|
||||
@@ -568,6 +574,7 @@ void bli_dgemv_unf_var1
|
||||
cntx
|
||||
);
|
||||
}
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -587,6 +594,7 @@ void bli_dgemv_unf_var1
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
}
|
||||
|
||||
// If x was packed into x_temp, free the memory.
|
||||
|
||||
@@ -342,7 +342,7 @@ void bli_dgemv_unf_var2 (
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
|
||||
@@ -477,7 +477,8 @@ void bli_dgemv_unf_var2 (
|
||||
|
||||
if( bli_deq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -521,7 +522,7 @@ void bli_dgemv_unf_var2 (
|
||||
// Return the buffer to pool
|
||||
bli_pba_release(&rntm , &mem_bufY);
|
||||
}
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -74,6 +74,7 @@ void PASTEF77S(ch,blasname) \
|
||||
*/ \
|
||||
if ((*n) <= 0 || PASTEMAC(ch, eq0)(*alpha)) \
|
||||
{ \
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
return; \
|
||||
} \
|
||||
@@ -104,6 +105,7 @@ void PASTEF77S(ch,blasname) \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
@@ -144,6 +146,7 @@ void saxpy_blis_impl
|
||||
*/
|
||||
if ((*n) <= 0 || PASTEMAC(s, eq0)(*alpha))
|
||||
{
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
@@ -276,6 +279,7 @@ void daxpy_blis_impl
|
||||
*/
|
||||
if ((*n) <= 0 || PASTEMAC(d, eq0)(*alpha))
|
||||
{
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
@@ -378,6 +382,13 @@ void daxpy_blis_impl
|
||||
axpyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_AXPYV_KER, cntx);
|
||||
}
|
||||
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
to avoid compiler warnings
|
||||
*/
|
||||
dim_t nt = 1;
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
#ifdef AOCL_DYNAMIC
|
||||
/* Invoking the fast-path, if the size is ideal for such execution */
|
||||
@@ -392,16 +403,11 @@ void daxpy_blis_impl
|
||||
y0, incy0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
to avoid compiler warnings
|
||||
*/
|
||||
dim_t nt = 1;
|
||||
|
||||
/*
|
||||
For the given problem size and architecture, the function
|
||||
@@ -431,6 +437,7 @@ void daxpy_blis_impl
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
|
||||
@@ -487,6 +494,7 @@ void daxpy_blis_impl
|
||||
}
|
||||
#endif // BLIS_ENABLE_OPENMP
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
@@ -527,6 +535,7 @@ void caxpy_blis_impl
|
||||
*/
|
||||
if ((*n) <= 0 || PASTEMAC(c, eq0)(*alpha))
|
||||
{
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
@@ -606,6 +615,7 @@ void caxpy_blis_impl
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
@@ -646,6 +656,7 @@ void zaxpy_blis_impl
|
||||
*/
|
||||
if ((*n) <= 0 || PASTEMAC(z, eq0)(*alpha))
|
||||
{
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
}
|
||||
@@ -729,13 +740,13 @@ void zaxpy_blis_impl
|
||||
axpyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx);
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
to avoid compiler warnings
|
||||
*/
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
to avoid compiler warnings
|
||||
*/
|
||||
dim_t nt = 1;
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
/*
|
||||
For the given problem size and architecture, the function
|
||||
returns the optimum number of threads with AOCL dynamic enabled
|
||||
@@ -764,6 +775,7 @@ void zaxpy_blis_impl
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
|
||||
@@ -776,8 +788,8 @@ void zaxpy_blis_impl
|
||||
thrinfo_t thread;
|
||||
|
||||
/* The factor by which the size should be a multiple during thread partition.
|
||||
The main loop of the kernel can handle 32 elements at a time hence 32 is
|
||||
selected for block_size. */
|
||||
The main loop of the kernel can handle 32 elements at a time hence 32 is
|
||||
selected for block_size. */
|
||||
dim_t block_size = 32;
|
||||
|
||||
// Get the thread ID
|
||||
@@ -819,7 +831,7 @@ void zaxpy_blis_impl
|
||||
);
|
||||
}
|
||||
#endif // BLIS_ENABLE_OPENMP
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
|
||||
@@ -81,7 +81,8 @@ void PASTEF77S(ch,blasname) \
|
||||
); \
|
||||
\
|
||||
\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
\
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
@@ -203,10 +204,12 @@ void scopy_blis_impl
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void scopy_
|
||||
(
|
||||
@@ -336,6 +339,13 @@ void dcopy_blis_impl
|
||||
copyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_COPYV_KER, cntx);
|
||||
}
|
||||
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
to avoid compiler warnings
|
||||
*/
|
||||
dim_t nt = 1;
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
#ifdef AOCL_DYNAMIC
|
||||
@@ -351,19 +361,13 @@ void dcopy_blis_impl
|
||||
y0, incy0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
to avoid compiler warnings
|
||||
*/
|
||||
dim_t nt = 1;
|
||||
|
||||
/*
|
||||
For the given problem size and architecture, the function
|
||||
returns the optimum number of threads with AOCL dynamic enabled
|
||||
@@ -394,7 +398,7 @@ void dcopy_blis_impl
|
||||
y0, incy0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
|
||||
@@ -450,13 +454,13 @@ void dcopy_blis_impl
|
||||
}
|
||||
|
||||
#endif // BLIS_ENABLE_OPENMP
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
}
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
} // end of function
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void dcopy_
|
||||
(
|
||||
const f77_int* n,
|
||||
@@ -565,13 +569,13 @@ void zcopy_blis_impl
|
||||
copyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_COPYV_KER, cntx);
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
/*
|
||||
Initializing the number of thread to one
|
||||
to avoid compiler warnings
|
||||
*/
|
||||
dim_t nt = 1;
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
/*
|
||||
For the given problem size and architecture, the function
|
||||
returns the optimum number of threads with AOCL dynamic enabled
|
||||
@@ -603,6 +607,7 @@ void zcopy_blis_impl
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
return;
|
||||
|
||||
@@ -611,46 +616,47 @@ void zcopy_blis_impl
|
||||
|
||||
_Pragma("omp parallel num_threads(nt)")
|
||||
{
|
||||
dim_t start, length;
|
||||
dim_t start, length;
|
||||
|
||||
// Get the thread ID
|
||||
dim_t thread_id = omp_get_thread_num();
|
||||
// Get the thread ID
|
||||
dim_t thread_id = omp_get_thread_num();
|
||||
|
||||
// Get the actual number of threads spawned
|
||||
dim_t nt_use = omp_get_num_threads();
|
||||
/*
|
||||
Calculate the compute range for the current thread
|
||||
based on the actual number of threads spawned
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
(
|
||||
n0,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
);
|
||||
// Get the actual number of threads spawned
|
||||
dim_t nt_use = omp_get_num_threads();
|
||||
/*
|
||||
Calculate the compute range for the current thread
|
||||
based on the actual number of threads spawned
|
||||
*/
|
||||
bli_thread_vector_partition
|
||||
(
|
||||
n0,
|
||||
nt_use,
|
||||
&start, &length,
|
||||
thread_id
|
||||
);
|
||||
|
||||
// Adjust the local pointer for computation
|
||||
dcomplex *x_thread_local = x0 + (start * incx0);
|
||||
dcomplex *y_thread_local = y0 + (start * incy0);
|
||||
// Adjust the local pointer for computation
|
||||
dcomplex *x_thread_local = x0 + (start * incx0);
|
||||
dcomplex *y_thread_local = y0 + (start * incy0);
|
||||
|
||||
// Invoke the function based on the kernel function pointer
|
||||
copyv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
length,
|
||||
x_thread_local, incx0,
|
||||
y_thread_local, incy0,
|
||||
cntx
|
||||
);
|
||||
// Invoke the function based on the kernel function pointer
|
||||
copyv_ker_ptr
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
length,
|
||||
x_thread_local, incx0,
|
||||
y_thread_local, incy0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void zcopy_
|
||||
(
|
||||
|
||||
@@ -216,6 +216,7 @@ void dgemv_blis_impl
|
||||
if ( *m == 0 || *n == 0 || \
|
||||
( PASTEMAC(d,eq0)( *alpha ) && PASTEMAC(d,eq1)( *beta ) ) )
|
||||
{
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
@@ -257,6 +258,7 @@ void dgemv_blis_impl
|
||||
this quirky behavior; it will scale y by beta, as one would expect. */
|
||||
if ( m_y > 0 && n_x == 0 )
|
||||
{
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
@@ -310,7 +312,7 @@ void dgemv_blis_impl
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
@@ -338,7 +340,7 @@ void dgemv_blis_impl
|
||||
y0, incy0,
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
|
||||
@@ -80,6 +80,7 @@ void PASTEF772S(chx,cha,blasname) \
|
||||
\
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
|
||||
{ \
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
@@ -105,6 +106,7 @@ void PASTEF772S(chx,cha,blasname) \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
|
||||
/* Finalize BLIS. */ \
|
||||
bli_finalize_auto(); \
|
||||
@@ -145,7 +147,8 @@ void sscal_blis_impl
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
return;
|
||||
@@ -195,6 +198,7 @@ void sscal_blis_impl
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
@@ -234,7 +238,8 @@ void dscal_blis_impl
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
return;
|
||||
@@ -292,7 +297,6 @@ void dscal_blis_impl
|
||||
|
||||
// Query the function pointer using the context
|
||||
scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
|
||||
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
@@ -316,7 +320,8 @@ void dscal_blis_impl
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
return;
|
||||
@@ -391,8 +396,8 @@ void dscal_blis_impl
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
#endif
|
||||
@@ -432,7 +437,8 @@ void zdscal_blis_impl
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
return;
|
||||
@@ -502,8 +508,8 @@ void zdscal_blis_impl
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
return;
|
||||
@@ -568,7 +574,8 @@ void zdscal_blis_impl
|
||||
);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
#endif
|
||||
@@ -609,7 +616,8 @@ void cscal_blis_impl
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
return;
|
||||
@@ -661,11 +669,12 @@ void cscal_blis_impl
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1)
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
void cscal_
|
||||
(
|
||||
@@ -702,7 +711,8 @@ void zscal_blis_impl
|
||||
*/
|
||||
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha))
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
return;
|
||||
@@ -754,8 +764,8 @@ void zscal_blis_impl
|
||||
x0, incx0,
|
||||
cntx
|
||||
);
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1)
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
|
||||
/* Finalize BLIS. */
|
||||
// Call to bli_finalize_auto() is not needed here
|
||||
}
|
||||
|
||||
@@ -65,6 +65,7 @@
|
||||
\
|
||||
PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
|
||||
\
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
|
||||
\
|
||||
/* Finalize BLIS. */ \
|
||||
|
||||
@@ -1179,6 +1179,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Mdiv
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return;
|
||||
}
|
||||
@@ -1224,7 +1225,8 @@ void bli_dgemv_m_zen4_int_40x8_mt_Mdiv
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
} // end of function
|
||||
|
||||
/*
|
||||
* Multi-threaded GEMV M-kernel with division along N dimension
|
||||
@@ -1290,6 +1292,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return;
|
||||
}
|
||||
@@ -1342,6 +1345,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
|
||||
NULL
|
||||
);
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return;
|
||||
}
|
||||
@@ -1363,6 +1367,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
|
||||
(
|
||||
transa, conjx, m, n, alpha, a, rs_a, cs_a, x, incx, beta, y, incy, cntx
|
||||
);
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1428,6 +1433,8 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
|
||||
{
|
||||
bli_pba_release(&rntm, &local_mem_buf);
|
||||
}
|
||||
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1449,18 +1456,18 @@ void bli_dgemv_n_zen4_int (
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
void (*ker_ft) ( trans_t,
|
||||
conj_t,
|
||||
dim_t,
|
||||
dim_t,
|
||||
double*,
|
||||
void (*ker_ft) ( trans_t,
|
||||
conj_t,
|
||||
dim_t,
|
||||
dim_t,
|
||||
double*,
|
||||
double*,
|
||||
inc_t,
|
||||
inc_t,
|
||||
double*,
|
||||
inc_t,
|
||||
double*,
|
||||
double*,
|
||||
inc_t,
|
||||
inc_t,
|
||||
double*,
|
||||
inc_t,
|
||||
double*,
|
||||
double*,
|
||||
inc_t, cntx_t* ) = NULL;
|
||||
|
||||
// If AOCL_DYNAMIC is enabled, call ST kernels for small sizes.
|
||||
@@ -1479,6 +1486,7 @@ void bli_dgemv_n_zen4_int (
|
||||
{
|
||||
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
|
||||
}
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@@ -1502,6 +1510,7 @@ void bli_dgemv_n_zen4_int (
|
||||
{
|
||||
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
|
||||
}
|
||||
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1510,7 +1519,12 @@ void bli_dgemv_n_zen4_int (
|
||||
if ( incy != 1 || transa != BLIS_NO_TRANSPOSE)
|
||||
{
|
||||
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
|
||||
// AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
|
||||
// I am commenting out the above line because
|
||||
// it ends up calling twice sometimes.
|
||||
// Need to fix it later !!
|
||||
}
|
||||
// Call the function pointer
|
||||
ker_ft
|
||||
(
|
||||
transa,
|
||||
|
||||
Reference in New Issue
Block a user