Finer control of code path options (#67)

Add macros that allow specific code paths to be enabled or disabled,
controlled by options to configure and cmake. This expands on the
existing functionality for enabling/disabling SUP handling in GEMM and/or
TRSM, and replaces the hard-coded #defines in include files that enabled
the small matrix paths.

All options are enabled by default for all BLIS sub-configs but many of them
are currently only implemented in AMD specific framework code variants.

AMD-Internal: [CPUPL-6906]
---------

Co-authored-by: Varaganti, Kiran <Kiran.Varaganti@amd.com>
This commit is contained in:
Smyth, Edward
2025-07-08 10:59:23 +01:00
committed by GitHub
parent 9b02201b5b
commit 969ceb7413
18 changed files with 304 additions and 43 deletions

View File

@@ -277,7 +277,12 @@ option(ENABLE_BLAS "BLAS compatiblity layer" ON)
# Toggle for the CBLAS compatibility layer; OFF by default.
option(ENABLE_CBLAS "CBLAS compatibility layer" OFF)
option(ENABLE_MIXED_DT "Mixed datatype support" ON)
option(ENABLE_MIXED_DT_EXTRA_MEM "Mixed datatype optimization requiring extra memory" ON)
option(ENABLE_SUP_HANDLING "Small matrix handling" ON)
option(ENABLE_MNK1_MATRIX "M, N or K = 1 matrix handling" ON)
option(ENABLE_TINY_MATRIX "Tiny matrix handling" ON)
option(ENABLE_SMALL_MATRIX "Small matrix handling" ON)
option(ENABLE_SUP_HANDLING "SUP matrix handling" ON)
option(ENABLE_SMALL_MATRIX_TRSM "TRSM Small matrix handling" ON)
option(ENABLE_TRSM_PREINVERSION "Enable TRSM preinversion" ON)
if(WIN32)
set(ENABLE_MEMKIND "no" CACHE STRING "libmemkind for manage memory pools")
set_property(CACHE ENABLE_MEMKIND PROPERTY STRINGS "no")
@@ -292,7 +297,6 @@ else()
during CMake invokation: auto, yes, no")
endif()
endif()
option(ENABLE_TRSM_PREINVERSION "Enable TRSM preinversion" ON)
option(ENABLE_AOCL_DYNAMIC "Dynamic selection of number of threads" ON)
set(FORCE_VERSION "no" CACHE STRING "Force configure to use an arbitrary version string")
if(WIN32)
@@ -608,14 +612,46 @@ else()
set(ENABLE_MIXED_DT_EXTRA_MEM_01 0)
set(ENABLE_MIXED_DT_01 0)
endif()
cmake_print_variables(ENABLE_SUP_HANDLING)
if(ENABLE_SUP_HANDLING)
cmake_print_variables(ENABLE_MNK1_MATRIX)
if(ENABLE_MNK1_MATRIX)
message(" M, N or K = 1 matrix handling is enabled.")
set(ENABLE_MNK1_MATRIX_01 1)
else()
message(" M, N or K = 1 matrix handling is disabled.")
set(ENABLE_MNK1_MATRIX_01 0)
endif()
cmake_print_variables(ENABLE_TINY_MATRIX)
if(ENABLE_TINY_MATRIX)
message(" Tiny matrix handling is enabled.")
set(ENABLE_TINY_MATRIX_01 1)
else()
message(" Tiny matrix handling is disabled.")
set(ENABLE_TINY_MATRIX_01 0)
endif()
cmake_print_variables(ENABLE_SMALL_MATRIX)
if(ENABLE_SMALL_MATRIX)
message(" Small matrix handling is enabled.")
set(ENABLE_SUP_HANDLING_01 1)
set(ENABLE_SMALL_MATRIX_01 1)
else()
message(" Small matrix handling is disabled.")
set(ENABLE_SMALL_MATRIX_01 0)
endif()
cmake_print_variables(ENABLE_SUP_HANDLING)
if(ENABLE_SUP_HANDLING)
message(" SUP matrix handling is enabled.")
set(ENABLE_SUP_HANDLING_01 1)
else()
message(" SUP matrix handling is disabled.")
set(ENABLE_SUP_HANDLING_01 0)
endif()
# Report the ENABLE_SMALL_MATRIX_TRSM option and translate it into the 0/1
# value ENABLE_SMALL_MATRIX_TRSM_01 that is substituted into the generated
# configuration header. The test must be on the TRSM-specific option (not
# ENABLE_SMALL_MATRIX) so that the GEMM and TRSM small-matrix paths can be
# switched on and off independently of each other.
cmake_print_variables(ENABLE_SMALL_MATRIX_TRSM)
if(ENABLE_SMALL_MATRIX_TRSM)
    message(" TRSM Small matrix handling is enabled.")
    set(ENABLE_SMALL_MATRIX_TRSM_01 1)
else()
    message(" TRSM Small matrix handling is disabled.")
    set(ENABLE_SMALL_MATRIX_TRSM_01 0)
endif()
cmake_print_variables(ENABLE_TRSM_PREINVERSION)
if(ENABLE_TRSM_PREINVERSION)
message(" trsm diagonal element pre-inversion is enabled.")

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -162,12 +162,36 @@
#endif
#endif
#if @enable_mnk1_matrix@
#define BLIS_ENABLE_MNK1_MATRIX
#else
#define BLIS_DISABLE_MNK1_MATRIX
#endif
#if @enable_tiny_matrix@
#define BLIS_ENABLE_TINY_MATRIX
#else
#define BLIS_DISABLE_TINY_MATRIX
#endif
#if @enable_small_matrix@
#define BLIS_ENABLE_SMALL_MATRIX
#else
#define BLIS_DISABLE_SMALL_MATRIX
#endif
#if @enable_sup_handling@
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif
#if @enable_small_matrix_trsm@
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
#else
#define BLIS_DISABLE_SMALL_MATRIX_TRSM
#endif
#if @enable_memkind@
#define BLIS_ENABLE_MEMKIND
#else

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020 - 2024, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2020 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -160,12 +160,36 @@ ${KERNEL_LIST_DEFINES}
#endif
#endif
#if ${ENABLE_MNK1_MATRIX_01}
#define BLIS_ENABLE_MNK1_MATRIX
#else
#define BLIS_DISABLE_MNK1_MATRIX
#endif
#if ${ENABLE_TINY_MATRIX_01}
#define BLIS_ENABLE_TINY_MATRIX
#else
#define BLIS_DISABLE_TINY_MATRIX
#endif
#if ${ENABLE_SMALL_MATRIX_01}
#define BLIS_ENABLE_SMALL_MATRIX
#else
#define BLIS_DISABLE_SMALL_MATRIX
#endif
#if ${ENABLE_SUP_HANDLING_01}
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif
#if ${ENABLE_SMALL_MATRIX_TRSM_01}
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
#else
#define BLIS_DISABLE_SMALL_MATRIX_TRSM
#endif
#if ${ENABLE_MEMKIND_01}
#define BLIS_ENABLE_MEMKIND
#else

View File

@@ -45,9 +45,6 @@
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160

View File

@@ -43,9 +43,6 @@
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160

View File

@@ -44,9 +44,6 @@
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160

View File

@@ -42,9 +42,6 @@
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160

View File

@@ -42,9 +42,6 @@
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160

View File

@@ -43,9 +43,6 @@
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160

View File

@@ -43,9 +43,6 @@
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160

117
configure vendored
View File

@@ -260,15 +260,58 @@ print_usage()
echo " only be enabled when mixed domain/precision support is"
echo " enabled."
echo " "
echo " --disable-sup-handling, --enable-sup-handling"
echo " --disable-mnk1-matrix, --enable-mnk1-matrix"
echo " "
echo " Disable (enabled by default) handling of matrix problem"
echo " where M, N or K = 1 via separate code branches. When disabled,"
echo " these operations will be performed by gemm rather than gemv"
echo " or other optimized implementations."
echo " "
echo " --disable-tiny-matrix, --enable-tiny-matrix"
echo " "
echo " Disable (enabled by default) handling of tiny"
echo " matrix problems via tiny code branches. When disabled,"
echo " these tiny level-3 operations will be performed by"
echo " the conventional implementation, which is optimized for"
echo " medium and large problems. Note that what qualifies as"
echo " \"tiny\" depends on thresholds that may vary by sub-"
echo " configuration."
echo " Currently only of relevance on configs that include"
echo " AMD Zen sub-configs"
echo " "
echo " --disable-small-matrix, --enable-small-matrix"
echo " "
echo " Disable (enabled by default) handling of small/skinny"
echo " matrix problems via separate code branches. When disabled,"
echo " matrix problems via small code branches. When disabled,"
echo " these small/skinny level-3 operations will be performed by"
echo " the conventional implementation, which is optimized for"
echo " medium and large problems. Note that what qualifies as"
echo " \"small\" depends on thresholds that may vary by sub-"
echo " configuration."
echo " Currently only of relevance on configs that include"
echo " AMD Zen sub-configs"
echo " "
echo " --disable-sup-handling, --enable-sup-handling"
echo " "
echo " Disable (enabled by default) handling of small/skinny"
echo " matrix problems via SUP code branches. When disabled,"
echo " these small/skinny level-3 operations will be performed by"
echo " the conventional implementation, which is optimized for"
echo " medium and large problems. Note that what qualifies as"
echo " \"SUP\" depends on thresholds that may vary by sub-"
echo " configuration."
echo " "
echo " --disable-small-matrix-trsm, --enable-small-matrix-trsm"
echo " "
echo " Disable (enabled by default) handling of small/skinny"
echo " TRSM problems via small code branches. When disabled,"
echo " these small/skinny level-3 operations will be performed by"
echo " the conventional implementation, which is optimized for"
echo " medium and large problems. Note that what qualifies as"
echo " \"small\" depends on thresholds that may vary by sub-"
echo " configuration."
echo " Currently only of relevance on configs that include"
echo " AMD Zen sub-configs"
echo " "
echo " -a NAME --enable-addon=NAME"
echo " "
@@ -2108,9 +2151,15 @@ main()
enable_cblas='no'
enable_mixed_dt='yes'
enable_mixed_dt_extra_mem='yes'
enable_mnk1_matrix='yes'
enable_tiny_matrix='yes'
enable_small_matrix='yes'
enable_sup_handling='yes'
enable_memkind='' # The default memkind value is determined later on.
enable_small_matrix_trsm='yes'
enable_trsm_preinversion='yes'
enable_memkind='' # The default memkind value is determined later on.
enable_aocl_dynamic='yes'
force_version='no'
complex_return='default'
@@ -2319,12 +2368,36 @@ main()
disable-mixed-dt-extra-mem)
enable_mixed_dt_extra_mem='no'
;;
enable-mnk1-matrix)
enable_mnk1_matrix='yes'
;;
disable-mnk1-matrix)
enable_mnk1_matrix='no'
;;
enable-tiny-matrix)
enable_tiny_matrix='yes'
;;
disable-tiny-matrix)
enable_tiny_matrix='no'
;;
enable-small-matrix)
enable_small_matrix='yes'
;;
disable-small-matrix)
enable_small_matrix='no'
;;
enable-sup-handling)
enable_sup_handling='yes'
;;
disable-sup-handling)
enable_sup_handling='no'
;;
enable-small-matrix-trsm)
enable_small_matrix_trsm='yes'
;;
disable-small-matrix-trsm)
enable_small_matrix_trsm='no'
;;
with-memkind)
enable_memkind='yes'
;;
@@ -3213,13 +3286,41 @@ main()
enable_mixed_dt_extra_mem_01=0
enable_mixed_dt_01=0
fi
if [ "x${enable_sup_handling}" = "xyes" ]; then
if [ "x${enable_mnk1_matrix}" = "xyes" ]; then
echo "${script_name}: M,N,K=1 matrix handling is enabled."
enable_mnk1_matrix_01=1
else
echo "${script_name}: M,N,K=1 matrix handling is disabled."
enable_mnk1_matrix_01=0
fi
if [ "x${enable_tiny_matrix}" = "xyes" ]; then
echo "${script_name}: tiny matrix handling is enabled."
enable_tiny_matrix_01=1
else
echo "${script_name}: tiny matrix handling is disabled."
enable_tiny_matrix_01=0
fi
if [ "x${enable_small_matrix}" = "xyes" ]; then
echo "${script_name}: small matrix handling is enabled."
enable_sup_handling_01=1
enable_small_matrix_01=1
else
echo "${script_name}: small matrix handling is disabled."
enable_small_matrix_01=0
fi
if [ "x${enable_sup_handling}" = "xyes" ]; then
echo "${script_name}: SUP matrix handling is enabled."
enable_sup_handling_01=1
else
echo "${script_name}: SUP matrix handling is disabled."
enable_sup_handling_01=0
fi
if [ "x${enable_small_matrix_trsm}" = "xyes" ]; then
echo "${script_name}: TRSM small matrix handling is enabled."
enable_small_matrix_trsm_01=1
else
echo "${script_name}: TRSM small matrix handling is disabled."
enable_small_matrix_trsm_01=0
fi
if [ "x${enable_trsm_preinversion}" = "xyes" ]; then
echo "${script_name}: trsm diagonal element pre-inversion is enabled."
enable_trsm_preinversion_01=1
@@ -3586,9 +3687,13 @@ main()
| sed -e "s/@enable_cblas@/${enable_cblas_01}/g" \
| sed -e "s/@enable_mixed_dt@/${enable_mixed_dt_01}/g" \
| sed -e "s/@enable_mixed_dt_extra_mem@/${enable_mixed_dt_extra_mem_01}/g" \
| sed -e "s/@enable_mnk1_matrix@/${enable_mnk1_matrix_01}/g" \
| sed -e "s/@enable_tiny_matrix@/${enable_tiny_matrix_01}/g" \
| sed -e "s/@enable_small_matrix@/${enable_small_matrix_01}/g" \
| sed -e "s/@enable_sup_handling@/${enable_sup_handling_01}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \
| sed -e "s/@enable_small_matrix_trsm@/${enable_small_matrix_trsm_01}/g" \
| sed -e "s/@enable_trsm_preinversion@/${enable_trsm_preinversion_01}/g" \
| sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \
| sed -e "s/@enable_aocl_dynamic@/${enable_aocl_dynamic_01}/g" \
| sed -e "s/@enable_pragma_omp_simd@/${enable_pragma_omp_simd_01}/g" \
| sed -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \

View File

@@ -53,4 +53,4 @@ err_t PASTEMAC( ch, tfuncname ) \
); \
GENTFUNC( scomplex, c, gemm_tiny )
GENTFUNC( dcomplex, z, gemm_tiny )
GENTFUNC( dcomplex, z, gemm_tiny )

View File

@@ -285,6 +285,7 @@ void PASTEF77S(ch,blasname) \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
\
IF_BLIS_ENABLE_MNK1_MATRIX(\
if( n0 == 1 ) \
{ \
if(bli_is_notrans(blis_transa)) \
@@ -357,6 +358,7 @@ void PASTEF77S(ch,blasname) \
bli_finalize_auto(); \
return; \
} \
) /* End of IF_BLIS_ENABLE_MNK1_MATRIX */ \
\
const num_t dt = PASTEMAC(ch,type); \
\

View File

@@ -304,6 +304,7 @@ void PASTEF77S(ch,blasname) \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
\
IF_BLIS_ENABLE_MNK1_MATRIX(\
if( n0 == 1 ) \
{ \
if(bli_is_notrans(blis_transa)) \
@@ -380,6 +381,7 @@ void PASTEF77S(ch,blasname) \
bli_finalize_auto(); \
return; \
} \
) /* End of IF_BLIS_ENABLE_MNK1_MATRIX */ \
\
const num_t dt = PASTEMAC(ch,type); \
\
@@ -534,6 +536,8 @@ void dgemm_blis_impl
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
#ifdef BLIS_ENABLE_MNK1_MATRIX
/* Call GEMV when m == 1 or n == 1 with the context set
to an uninitialized void pointer i.e. ((void *)0)*/
if (n0 == 1)
@@ -615,6 +619,8 @@ void dgemm_blis_impl
return;
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
@@ -665,6 +671,8 @@ void dgemm_blis_impl
return;
}
#ifdef BLIS_ENABLE_MNK1_MATRIX
/*
Invoking the API for input sizes with k = 1.
- The API is single-threaded.
@@ -715,6 +723,9 @@ void dgemm_blis_impl
}
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
#ifdef BLIS_ENABLE_TINY_MATRIX
/**
*Early check for tiny sizes.
*if inputs are in range of tiny gemm kernel,
@@ -746,6 +757,7 @@ void dgemm_blis_impl
bli_finalize_auto();
return;
}
#endif // End of BLIS_ENABLE_TINY_MATRIX
const num_t dt = BLIS_DOUBLE;
@@ -859,8 +871,9 @@ void dgemm_blis_impl
}
}
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
#endif // End of BLIS_ENABLE_SMALL_MATRIX
#ifdef BLIS_ENABLE_SUP_HANDLING
err_t sup_status = BLIS_FAILURE;
sup_status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if ( sup_status == BLIS_SUCCESS )
@@ -871,6 +884,7 @@ void dgemm_blis_impl
bli_finalize_auto();
return;
}
#endif // End of BLIS_ENABLE_SUP_HANDLING
// fall back on native path when dgemm is not handled in sup path.
//bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
@@ -1012,6 +1026,8 @@ void zgemm_blis_impl
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
#ifdef BLIS_ENABLE_MNK1_MATRIX
/* Call GEMV when m == 1 or n == 1 with the context set
to an uninitialized void pointer i.e. ((void *)0)*/
if (n0 == 1)
@@ -1093,6 +1109,8 @@ void zgemm_blis_impl
return;
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
@@ -1143,6 +1161,8 @@ void zgemm_blis_impl
return;
}
#ifdef BLIS_ENABLE_MNK1_MATRIX
/*
Invoking the API for input sizes with k = 1.
- The API is single-threaded.
@@ -1240,6 +1260,11 @@ void zgemm_blis_impl
}
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
#ifdef BLIS_ENABLE_TINY_MATRIX
// May also be used in small path below
bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
// Tiny gemm dispatch
@@ -1271,6 +1296,8 @@ void zgemm_blis_impl
}
#endif
#endif // End of BLIS_ENABLE_TINY_MATRIX
const num_t dt = BLIS_DCOMPLEX;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
@@ -1297,6 +1324,11 @@ void zgemm_blis_impl
#ifdef BLIS_ENABLE_SMALL_MATRIX
/* Check if we have already defined this above */
#ifndef BLIS_ENABLE_TINY_MATRIX
bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
#endif
/* Query the architecture ID */
arch_t arch_id = bli_arch_query_id();
@@ -1389,8 +1421,9 @@ void zgemm_blis_impl
}
}
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
#endif // End of BLIS_ENABLE_SMALL_MATRIX
#ifdef BLIS_ENABLE_SUP_HANDLING
err_t sup_status = BLIS_FAILURE;
sup_status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if ( sup_status == BLIS_SUCCESS )
@@ -1401,6 +1434,7 @@ void zgemm_blis_impl
bli_finalize_auto();
return;
}
#endif // End of BLIS_ENABLE_SUP_HANDLING
// fall back on native path when zgemm is not handled in sup path.
//bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
@@ -1555,6 +1589,8 @@ void cgemm_blis_impl
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
#ifdef BLIS_ENABLE_MNK1_MATRIX
/* Call GEMV when m == 1 or n == 1 with the context set
to an uninitialized void pointer i.e. ((void *)0)*/
if (n0 == 1)
@@ -1636,6 +1672,8 @@ void cgemm_blis_impl
return;
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
@@ -1686,6 +1724,8 @@ void cgemm_blis_impl
return;
}
#ifdef BLIS_ENABLE_MNK1_MATRIX
/*
Invoking the API for input sizes with k = 1.
- The API is single-threaded.
@@ -1719,6 +1759,10 @@ void cgemm_blis_impl
}
#endif
#endif // End of BLIS_ENABLE_MNK1_MATRIX
#ifdef BLIS_ENABLE_TINY_MATRIX
bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel cgemm is invoked.
// Tiny gemm dispatch
@@ -1750,6 +1794,8 @@ void cgemm_blis_impl
}
#endif
#endif // End of BLIS_ENABLE_TINY_MATRIX
const num_t dt = BLIS_SCOMPLEX;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
@@ -1774,6 +1820,7 @@ void cgemm_blis_impl
bli_obj_set_conjtrans( blis_transa, &ao );
bli_obj_set_conjtrans( blis_transb, &bo );
#ifdef BLIS_ENABLE_SUP_HANDLING
err_t sup_status = BLIS_FAILURE;
sup_status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
if ( sup_status == BLIS_SUCCESS )
@@ -1784,6 +1831,7 @@ void cgemm_blis_impl
bli_finalize_auto();
return;
}
#endif // End of BLIS_ENABLE_SUP_HANDLING
// fall back on native path when cgemm is not handled in sup path.
//bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);

View File

@@ -317,6 +317,8 @@ void dgemv_blis_impl
return;
}
#ifdef BLIS_ENABLE_TINY_MATRIX
/**
* DGEMV Tiny Path
* If the matrix dimensions are within 8x8 then calculate the result
@@ -341,6 +343,8 @@ void dgemv_blis_impl
return;
}
#endif // End of BLIS_ENABLE_TINY_MATRIX
/* Call variants based on transpose value. */
if((bli_does_notrans(blis_transa) && bli_is_col_stored( rs_a, cs_a ))
|| (bli_does_trans(blis_transa) && bli_is_row_stored( rs_a, cs_a )))

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -247,6 +247,7 @@ void PASTEF77S(ch,blasname) \
/* If Transpose(A) uplo = higher then uplo = lower */ \
/* ----------------------------------------------------------- */ \
\
IF_BLIS_ENABLE_MNK1_MATRIX(\
if( n0 == 1 ) \
{ \
if( blis_side == BLIS_LEFT ) \
@@ -375,6 +376,7 @@ void PASTEF77S(ch,blasname) \
return; \
} \
} \
) /* End of IF_BLIS_ENABLE_MNK1_MATRIX */ \
\
const struc_t struca = BLIS_TRIANGULAR; \
\

View File

@@ -308,6 +308,7 @@ void PASTEF77S(ch,blasname) \
/* If Transpose(A) uplo = higher then uplo = lower */ \
/* ----------------------------------------------------------- */ \
\
IF_BLIS_ENABLE_MNK1_MATRIX(\
if( n0 == 1 ) \
{ \
if( blis_side == BLIS_LEFT ) \
@@ -442,6 +443,7 @@ void PASTEF77S(ch,blasname) \
return; \
} \
} \
) /* End of IF_BLIS_ENABLE_MNK1_MATRIX */ \
\
const struc_t struca = BLIS_TRIANGULAR; \
\
@@ -588,6 +590,8 @@ void strsm_blis_impl
return;
}
#ifdef BLIS_ENABLE_MNK1_MATRIX
if( n0 == 1 )
{
if( blis_side == BLIS_LEFT )
@@ -732,6 +736,9 @@ void strsm_blis_impl
return;
}
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
const struc_t struca = BLIS_TRIANGULAR;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
@@ -789,7 +796,7 @@ void strsm_blis_impl
}
} // bli_cpuid_is_avx2fma3_supported
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
#endif // End of BLIS_ENABLE_SMALL_MATRIX_TRSM
//bli_trsmnat
//(
@@ -932,6 +939,8 @@ void dtrsm_blis_impl
return;
}
#ifdef BLIS_ENABLE_MNK1_MATRIX
if( n0 == 1 )
{
if( blis_side == BLIS_LEFT )
@@ -1077,6 +1086,8 @@ void dtrsm_blis_impl
}
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
const struc_t struca = BLIS_TRIANGULAR;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
@@ -1287,7 +1298,7 @@ void dtrsm_blis_impl
}
} // bli_cpuid_is_avx2fma3_supported
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
#endif // End of BLIS_ENABLE_SMALL_MATRIX_TRSM
//bli_trsmnat
//(
@@ -1431,6 +1442,8 @@ void ztrsm_blis_impl
return;
}
#ifdef BLIS_ENABLE_MNK1_MATRIX
if( n0 == 1 )
{
if( blis_side == BLIS_LEFT )
@@ -1635,6 +1648,8 @@ void ztrsm_blis_impl
}
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
const struc_t struca = BLIS_TRIANGULAR;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
@@ -1810,7 +1825,7 @@ void ztrsm_blis_impl
}
} // bli_cpuid_is_avx2fma3_supported
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
#endif // End of BLIS_ENABLE_SMALL_MATRIX_TRSM
//bli_trsmnat
//(
@@ -1954,6 +1969,8 @@ void ctrsm_blis_impl
return;
}
#ifdef BLIS_ENABLE_MNK1_MATRIX
if( n0 == 1 )
{
if( blis_side == BLIS_LEFT )
@@ -2158,6 +2175,8 @@ void ctrsm_blis_impl
}
}
#endif // End of BLIS_ENABLE_MNK1_MATRIX
const struc_t struca = BLIS_TRIANGULAR;
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1;
@@ -2215,7 +2234,7 @@ void ctrsm_blis_impl
}
} // bli_cpuid_is_avx2fma3_supported
#endif //#ifdef BLIS_ENABLE_SMALL_MATRIX
#endif // End of BLIS_ENABLE_SMALL_MATRIX_TRSM
//bli_trsmnat
//(

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2025, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -269,3 +269,21 @@
#endif
// -- CODE PATH ENABLEMENT --------------------------------------------------

// Conditional-inclusion wrappers. IF_BLIS_ENABLE_<PATH>( code ) expands to
// its arguments when BLIS_ENABLE_<PATH> is defined and to nothing otherwise.
// Because they are ordinary function-like macros, they can be used inside
// multi-line macro definitions, where #ifdef/#endif directives cannot appear.

// M, N or K = 1 code paths.
#if defined(BLIS_ENABLE_MNK1_MATRIX)
  #define IF_BLIS_ENABLE_MNK1_MATRIX(...) __VA_ARGS__
#else
  #define IF_BLIS_ENABLE_MNK1_MATRIX(...)
#endif

// Tiny matrix code paths.
#if defined(BLIS_ENABLE_TINY_MATRIX)
  #define IF_BLIS_ENABLE_TINY_MATRIX(...) __VA_ARGS__
#else
  #define IF_BLIS_ENABLE_TINY_MATRIX(...)
#endif

// Small matrix code paths.
#if defined(BLIS_ENABLE_SMALL_MATRIX)
  #define IF_BLIS_ENABLE_SMALL_MATRIX(...) __VA_ARGS__
#else
  #define IF_BLIS_ENABLE_SMALL_MATRIX(...)
#endif