Fix DTL dynamic thread logging in BLAS operations (#230)

- Remove redundant AOCL_DTL_LOG_NUM_THREADS calls from early return paths
- Update thread count logging to use AOCL_get_requested_threads_count() for early exits
- Clean up duplicate DTL logging in gemv_unf_var1 and gemv_unf_var2 implementations
- Remove thread count logging from bli_dgemv_n_zen4_int kernel variants
- Simplify aocldtl_blis.c AOCL_DTL_log_gemv_sizes by removing redundant conditional
- Standardize DTL trace exit patterns across axpy, scal, and gemv operations
- Remove commented-out DTL logging code in zen4 gemv kernel

This patch ensures thread count is logged only once per operation and uses
the correct API (AOCL_get_requested_threads_count) for early exit scenarios
where the actual execution thread count may differ from requested threads.
This commit is contained in:
Varaganti, Kiran
2025-10-24 18:04:00 +05:30
committed by KR, Chandrashekara
parent 7341b12c46
commit b729473839
9 changed files with 30 additions and 56 deletions

View File

@@ -717,18 +717,10 @@ void AOCL_DTL_log_gemv_sizes(int8 loglevel,
DTL_get_complex_parts(dt_type, beta, &beta_real, &beta_imag);
// {S, D,C, Z} { transa, m, n, alpha, lda, incx, beta, incy}
if (dt_type == 'd' || dt_type == 'D' )
{
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld", tolower(dt_type),
transa, (dim_t)m, (dim_t)n, alpha_real, alpha_imag,
(dim_t)lda, (dim_t)incx, beta_real, beta_imag, (dim_t)incy);
}
else
{
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld\n", tolower(dt_type),
transa, (dim_t)m, (dim_t)n, alpha_real, alpha_imag,
(dim_t)lda, (dim_t)incx, beta_real, beta_imag, (dim_t)incy);
}
sprintf(buffer, "%c %c %ld %ld %lf %lf %ld %ld %lf %lf %ld\n", tolower(dt_type),
transa, (dim_t)m, (dim_t)n, alpha_real, alpha_imag,
(dim_t)lda, (dim_t)incx, beta_real, beta_imag, (dim_t)incy);
DTL_Trace(loglevel, TRACE_TYPE_LOG, function_name, function_name, line, buffer);
}

View File

@@ -247,7 +247,6 @@ void bli_dgemv_unf_var1
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
@@ -385,7 +384,6 @@ void bli_dgemv_unf_var1
);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
}
@@ -404,7 +402,6 @@ void bli_dgemv_unf_var1
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
@@ -495,8 +492,6 @@ void bli_dgemv_unf_var1
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
#if defined(BLIS_ENABLE_OPENMP)
}
else
@@ -574,7 +569,6 @@ void bli_dgemv_unf_var1
cntx
);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
}
#endif
}
@@ -594,7 +588,7 @@ void bli_dgemv_unf_var1
y, incy,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
}
// If x was packed into x_temp, free the memory.

View File

@@ -342,8 +342,8 @@ void bli_dgemv_unf_var2 (
y, incy,
cntx
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
default:
@@ -477,7 +477,6 @@ void bli_dgemv_unf_var2 (
if( bli_deq0( *alpha ) )
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
@@ -522,7 +521,7 @@ void bli_dgemv_unf_var2 (
// Return the buffer to pool
bli_pba_release(&rntm , &mem_bufY);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}

View File

@@ -146,7 +146,7 @@ void saxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(s, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
@@ -280,7 +280,7 @@ void daxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(d, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
@@ -549,7 +549,7 @@ void caxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(c, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}
@@ -670,7 +670,7 @@ void zaxpy_blis_impl
*/
if ((*n) <= 0 || PASTEMAC(z, eq0)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
return;
}

View File

@@ -216,7 +216,6 @@ void dgemv_blis_impl
if ( *m == 0 || *n == 0 || \
( PASTEMAC(d,eq0)( *alpha ) && PASTEMAC(d,eq1)( *beta ) ) )
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -258,7 +257,6 @@ void dgemv_blis_impl
this quirky behavior; it will scale y by beta, as one would expect. */
if ( m_y > 0 && n_x == 0 )
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -312,7 +310,6 @@ void dgemv_blis_impl
NULL,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -340,7 +337,7 @@ void dgemv_blis_impl
y0, incy0,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here

View File

@@ -80,10 +80,10 @@ void PASTEF772S(chx,cha,blasname) \
\
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(chau, eq1)(*alpha)) \
{ \
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count()); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
return ; \
} \
\
@@ -106,7 +106,7 @@ void PASTEF772S(chx,cha,blasname) \
NULL \
); \
\
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
@@ -147,7 +147,7 @@ void sscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(s, eq1)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -238,7 +238,7 @@ void dscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -437,7 +437,7 @@ void zdscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(d, eq1)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -616,7 +616,7 @@ void cscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(c, eq1)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here
@@ -711,7 +711,7 @@ void zscal_blis_impl
*/
if ((n0 <= 0) || (alpha == NULL) || (incx0 <= 0) || PASTEMAC(z, eq1)(*alpha))
{
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, AOCL_get_requested_threads_count());
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
/* Finalize BLIS. */
// Call to bli_finalize_auto() is not needed here

View File

@@ -65,7 +65,6 @@
\
PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
\
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1); \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
\
/* Finalize BLIS. */ \

View File

@@ -341,10 +341,10 @@ BLIS_EXPORT_BLIS void bli_daxpyv_zen4_int
_mm_storel_pd(y0, y_vec);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return;
}
__m512d xv[8], yv[8], alphav;
// Broadcast the alpha scalar to all elements of a vector register.

View File

@@ -1179,7 +1179,6 @@ void bli_dgemv_m_zen4_int_40x8_mt_Mdiv
y, incy,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return;
}
@@ -1225,7 +1224,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Mdiv
cntx
);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
} // end of function
/*
@@ -1292,8 +1291,8 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
y, incy,
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
return;
}
@@ -1345,7 +1344,6 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
NULL
);
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
return;
}
@@ -1434,7 +1432,7 @@ void bli_dgemv_m_zen4_int_40x8_mt_Ndiv
bli_pba_release(&rntm, &local_mem_buf);
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, nt);
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
}
#endif
@@ -1486,7 +1484,7 @@ void bli_dgemv_n_zen4_int (
{
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
}
else
#endif
@@ -1510,7 +1508,6 @@ void bli_dgemv_n_zen4_int (
{
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
}
AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
#endif
}
@@ -1519,10 +1516,6 @@ void bli_dgemv_n_zen4_int (
if ( incy != 1 || transa != BLIS_NO_TRANSPOSE)
{
ker_ft = bli_dgemv_n_zen4_int_32x8_st;
// AOCL_DTL_LOG_NUM_THREADS(AOCL_DTL_LEVEL_TRACE_1, 1);
// I am commenting out the above line because
// it ends up calling twice sometimes.
// Need to fix it later !!
}
// Call the function pointer
ker_ft