diff --git a/build/bli_config.h.in b/build/bli_config.h.in index a9681d62a..a3b48ce2c 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -61,8 +61,22 @@ #define BLIS_ENABLE_JRIR_RR #endif -#if @enable_packbuf_pools@ -#define BLIS_ENABLE_PACKBUF_POOLS +#if @enable_pba_pools@ +#define BLIS_ENABLE_PBA_POOLS +#else +#define BLIS_DISABLE_PBA_POOLS +#endif + +#if @enable_sba_pools@ +#define BLIS_ENABLE_SBA_POOLS +#else +#define BLIS_DISABLE_SBA_POOLS +#endif + +#if @enable_mem_tracing@ +#define BLIS_ENABLE_MEM_TRACING +#else +#define BLIS_DISABLE_MEM_TRACING #endif #if @int_type_size@ == 64 diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index dc45be16a..736688481 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -42,10 +42,23 @@ bli_amaxv bli_amaxv_check bli_amaxv_ex bli_amaxv_ex_qfp +bli_apool_alloc_block +bli_apool_array_elem +bli_apool_checkin_array +bli_apool_checkout_array +bli_apool_finalize +bli_apool_free_block +bli_apool_grow +bli_apool_init bli_arch_query_id bli_arch_set_id bli_arch_set_id_once bli_arch_string +bli_array_elem +bli_array_finalize +bli_array_init +bli_array_resize +bli_array_set_elem bli_asumv bli_asumv_check bli_asumv_ex @@ -129,6 +142,7 @@ bli_ccopym_unb_var1 bli_ccopyv bli_ccopyv_ex bli_ccpackm_blk_var1_md +bli_ccpackm_cxk_1e_md bli_ccpackm_cxk_1r_md bli_ccpackm_struc_cxk_md bli_ccxpbym_md @@ -151,6 +165,7 @@ bli_cdotxf_ex bli_cdotxv bli_cdotxv_ex bli_cdpackm_blk_var1_md +bli_cdpackm_cxk_1e_md bli_cdpackm_cxk_1r_md bli_cdpackm_struc_cxk_md bli_cdxpbym_md @@ -240,6 +255,7 @@ bli_check_valid_datatype bli_check_valid_diag bli_check_valid_error_level bli_check_valid_kc_mod_mult +bli_check_valid_malloc_buf bli_check_valid_mc_mod_mult bli_check_valid_nc_mod_mult bli_check_valid_packbuf @@ -453,6 +469,7 @@ bli_csgemm_ker_var2_md bli_cshiftd bli_cshiftd_ex bli_cspackm_blk_var1_md +bli_cspackm_cxk_1e_md bli_cspackm_cxk_1r_md bli_cspackm_struc_cxk_md bli_csqrtsc @@ -556,6 +573,7 @@ bli_czcopysc 
bli_czgemm_ker_var2_md bli_czipsc bli_czpackm_blk_var1_md +bli_czpackm_cxk_1e_md bli_czpackm_cxk_1r_md bli_czpackm_struc_cxk_md bli_czxpbym_md @@ -605,6 +623,7 @@ bli_dcopym_unb_var1 bli_dcopyv bli_dcopyv_ex bli_dcpackm_blk_var1_md +bli_dcpackm_cxk_1e_md bli_dcpackm_cxk_1r_md bli_dcpackm_struc_cxk_md bli_dcxpbym_md @@ -631,6 +650,7 @@ bli_ddotxv bli_ddotxv_ex bli_ddotxv_zen_int bli_ddpackm_blk_var1_md +bli_ddpackm_cxk_1e_md bli_ddpackm_cxk_1r_md bli_ddpackm_struc_cxk_md bli_ddxpbym_md @@ -838,6 +858,7 @@ bli_dsgemm_ker_var2_md bli_dshiftd bli_dshiftd_ex bli_dspackm_blk_var1_md +bli_dspackm_cxk_1e_md bli_dspackm_cxk_1r_md bli_dspackm_struc_cxk_md bli_dsqrtsc @@ -946,6 +967,7 @@ bli_dzcopysc bli_dzgemm_ker_var2_md bli_dzipsc bli_dzpackm_blk_var1_md +bli_dzpackm_cxk_1e_md bli_dzpackm_cxk_1r_md bli_dzpackm_struc_cxk_md bli_dzxpbym_md @@ -958,11 +980,16 @@ bli_error_finalize bli_error_init bli_error_init_msgs bli_error_string_for_code +bli_ffree_align +bli_ffree_noalign bli_finalize bli_finalize_apis bli_finalize_auto bli_finalize_once bli_find_area_trap_l +bli_fmalloc_align +bli_fmalloc_align_check +bli_fmalloc_noalign bli_fprintm bli_fprintm_check bli_fprintm_ex @@ -971,9 +998,7 @@ bli_fprintv bli_fprintv_check bli_fprintv_ex bli_fprintv_qfp -bli_free_align bli_free_intl -bli_free_noalign bli_free_pool bli_free_user bli_func_create @@ -1187,9 +1212,10 @@ bli_info_get_enable_blas bli_info_get_enable_cblas bli_info_get_enable_memkind bli_info_get_enable_openmp -bli_info_get_enable_packbuf_pools +bli_info_get_enable_pba_pools bli_info_get_enable_pthreads bli_info_get_enable_sandbox +bli_info_get_enable_sba_pools bli_info_get_enable_stay_auto_init bli_info_get_enable_threading bli_info_get_gemm_impl_string @@ -1264,7 +1290,7 @@ bli_l1v_xi_check bli_l1v_xy_check bli_l3_basic_check bli_l3_cntl_create_if -bli_l3_cntl_free_if +bli_l3_cntl_free bli_l3_determine_kc bli_l3_direct bli_l3_ind_oper_enable_only @@ -1289,17 +1315,13 @@ bli_l3_thrinfo_print_paths bli_lcm bli_lsame 
bli_machval -bli_malloc_align -bli_malloc_align_check bli_malloc_intl -bli_malloc_noalign bli_malloc_pool bli_malloc_user bli_mbool_create bli_mbool_free bli_mbool_init bli_membrk_acquire_m -bli_membrk_acquire_v bli_membrk_compute_pool_block_sizes bli_membrk_compute_pool_block_sizes_dt bli_membrk_finalize @@ -1307,9 +1329,10 @@ bli_membrk_finalize_pools bli_membrk_init bli_membrk_init_pools bli_membrk_pool_size +bli_membrk_query bli_membrk_release +bli_membrk_rntm_set_membrk bli_memsys_finalize -bli_memsys_global_membrk bli_memsys_init bli_mkherm bli_mkherm_check @@ -1448,11 +1471,16 @@ bli_prune_unref_mparts bli_pthread_barrier_destroy bli_pthread_barrier_init bli_pthread_barrier_wait +bli_pthread_cond_broadcast +bli_pthread_cond_destroy +bli_pthread_cond_init +bli_pthread_cond_wait bli_pthread_create bli_pthread_join bli_pthread_mutex_destroy bli_pthread_mutex_init bli_pthread_mutex_lock +bli_pthread_mutex_trylock bli_pthread_mutex_unlock bli_pthread_once bli_randm @@ -1505,6 +1533,14 @@ bli_saxpyv bli_saxpyv_ex bli_saxpyv_zen_int bli_saxpyv_zen_int10 +bli_sba_acquire +bli_sba_checkin_array +bli_sba_checkout_array +bli_sba_finalize +bli_sba_init +bli_sba_query +bli_sba_release +bli_sba_rntm_set_pool bli_scal2d bli_scal2d_check bli_scal2d_ex @@ -1523,7 +1559,6 @@ bli_scald_ex bli_scald_ex_qfp bli_scalm bli_scalm_check -bli_scalm_cntl_create_node bli_scalm_ex bli_scalm_ex_qfp bli_scalv @@ -1543,6 +1578,7 @@ bli_scopym_unb_var1 bli_scopyv bli_scopyv_ex bli_scpackm_blk_var1_md +bli_scpackm_cxk_1e_md bli_scpackm_cxk_1r_md bli_scpackm_struc_cxk_md bli_scxpbym_md @@ -1569,6 +1605,7 @@ bli_sdotxv bli_sdotxv_ex bli_sdotxv_zen_int bli_sdpackm_blk_var1_md +bli_sdpackm_cxk_1e_md bli_sdpackm_cxk_1r_md bli_sdpackm_struc_cxk_md bli_sdxpbym_md @@ -1780,6 +1817,7 @@ bli_ssgemm_ker_var2_md bli_sshiftd bli_sshiftd_ex bli_sspackm_blk_var1_md +bli_sspackm_cxk_1e_md bli_sspackm_cxk_1r_md bli_sspackm_struc_cxk_md bli_ssqrtsc @@ -1955,6 +1993,7 @@ bli_szcopysc bli_szgemm_ker_var2_md 
bli_szipsc bli_szpackm_blk_var1_md +bli_szpackm_cxk_1e_md bli_szpackm_cxk_1r_md bli_szpackm_struc_cxk_md bli_szxpbym_md @@ -1997,6 +2036,7 @@ bli_thread_set_ways bli_thread_set_ways_ bli_thrinfo_create bli_thrinfo_create_for_cntl +bli_thrinfo_free bli_thrinfo_grow bli_thrinfo_init bli_thrinfo_init_single @@ -2166,6 +2206,7 @@ bli_zcopym_unb_var1 bli_zcopyv bli_zcopyv_ex bli_zcpackm_blk_var1_md +bli_zcpackm_cxk_1e_md bli_zcpackm_cxk_1r_md bli_zcpackm_struc_cxk_md bli_zcxpbym_md @@ -2188,6 +2229,7 @@ bli_zdotxf_ex bli_zdotxv bli_zdotxv_ex bli_zdpackm_blk_var1_md +bli_zdpackm_cxk_1e_md bli_zdpackm_cxk_1r_md bli_zdpackm_struc_cxk_md bli_zdxpbym_md @@ -2377,6 +2419,7 @@ bli_zsgemm_ker_var2_md bli_zshiftd bli_zshiftd_ex bli_zspackm_blk_var1_md +bli_zspackm_cxk_1e_md bli_zspackm_cxk_1r_md bli_zspackm_struc_cxk_md bli_zsqrtsc @@ -2480,6 +2523,7 @@ bli_zzcopysc bli_zzgemm_ker_var2_md bli_zzipsc bli_zzpackm_blk_var1_md +bli_zzpackm_cxk_1e_md bli_zzpackm_cxk_1r_md bli_zzpackm_struc_cxk_md bli_zzxpbym_md diff --git a/configure b/configure index 5f87a2f0e..ba05cc2c8 100755 --- a/configure +++ b/configure @@ -148,20 +148,37 @@ print_usage() echo " --disable-threading is specified, threading will be" echo " disabled. The default is 'no'." echo " " - echo " --disable-packbuf-pools, --enable-packbuf-pools" + echo " --disable-pba-pools, --enable-pba-pools" + echo " --disable-sba-pools, --enable-sba-pools" echo " " - echo " Disable (enabled by default) use of internal memory" - echo " pools for managing packing buffers. When disabled," - echo " the function specified by BLIS_MALLOC_POOL is called" - echo " on-demand, whenever a packing buffer is needed, and" - echo " the buffer is released via the function specified by" - echo " BLIS_FREE_POOL() when the loop in which it was" - echo " allocated terminates. 
When enabled, the memory pools" - echo " minimize calls to both BLIS_MALLOC_POOL() and" - echo " BLIS_FREE_POOL(), especially in a multithreaded" - echo " environment, but does so through a mechanism that may" - echo " incur additional overhead in some (but not all)" - echo " situations." + echo " Disable (enabled by default) use of internal memory pools" + echo " within the packing block allocator (pba) and/or the small" + echo " block allocator (sba). The former is used to allocate" + echo " memory used to pack submatrices while the latter is used" + echo " to allocate control/thread tree nodes and thread" + echo " communicators. Both allocations take place in the context" + echo " of level-3 operations. When the pba is disabled, the" + echo " malloc()-like function specified by BLIS_MALLOC_POOL is" + echo " called on-demand whenever a packing block is needed, and" + echo " when the sba is disabled, the malloc()-like function" + echo " specified by BLIS_MALLOC_INTL is called whenever a small" + echo " block is needed, with the two allocators calling free()-" + echo " like functions BLIS_FREE_POOL and BLIS_FREE_INTL," + echo " respectively when blocks are released. When enabled," + echo " either or both pools are populated via the same functions" + echo " mentioned previously, and henceforth blocks are checked" + echo " out and in. The library quickly reaches a state in which" + echo " it no longer needs to call malloc() or free(), even" + echo " across many separate level-3 operation invocations." + echo " " + echo " " + echo " --enable-mem-tracing, --disable-mem-tracing" + echo " " + echo " Enable (disabled by default) output to stdout that traces" + echo " the allocation and freeing of memory, including the names" + echo " of the functions that triggered the allocation/freeing." + echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." + echo " Please use only for informational/debugging purposes."
echo " " echo " -i SIZE, --int-size=SIZE" echo " " @@ -1720,7 +1737,9 @@ main() enable_arg_max_hack='no' enable_static='yes' enable_shared='yes' - enable_packbuf_pools='yes' + enable_pba_pools='yes' + enable_sba_pools='yes' + enable_mem_tracing='no' int_type_size=0 blas_int_type_size=32 enable_blas='yes' @@ -1837,11 +1856,23 @@ main() disable-threading) threading_model='no' ;; - enable-packbuf-pools) - enable_packbuf_pools='yes' + enable-pba-pools) + enable_pba_pools='yes' ;; - disable-packbuf-pools) - enable_packbuf_pools='no' + disable-pba-pools) + enable_pba_pools='no' + ;; + enable-sba-pools) + enable_sba_pools='yes' + ;; + disable-sba-pools) + enable_sba_pools='no' + ;; + enable-mem-tracing) + enable_mem_tracing='yes' + ;; + disable-mem-tracing) + enable_mem_tracing='no' ;; enable-sandbox=*) sandbox_flag=1 @@ -2549,12 +2580,26 @@ main() fi # Convert 'yes' and 'no' flags to booleans. - if [ "x${enable_packbuf_pools}" = "xyes" ]; then - echo "${script_name}: internal memory pools for packing buffers are enabled." - enable_packbuf_pools_01=1 + if [ "x${enable_pba_pools}" = "xyes" ]; then + echo "${script_name}: internal memory pools for packing blocks are enabled." + enable_pba_pools_01=1 else - echo "${script_name}: internal memory pools for packing buffers are disabled." - enable_packbuf_pools_01=0 + echo "${script_name}: internal memory pools for packing blocks are disabled." + enable_pba_pools_01=0 + fi + if [ "x${enable_sba_pools}" = "xyes" ]; then + echo "${script_name}: internal memory pools for small blocks are enabled." + enable_sba_pools_01=1 + else + echo "${script_name}: internal memory pools for small blocks are disabled." + enable_sba_pools_01=0 + fi + if [ "x${enable_mem_tracing}" = "xyes" ]; then + echo "${script_name}: memory tracing output is enabled." + enable_mem_tracing_01=1 + else + echo "${script_name}: memory tracing output is disabled." 
+ enable_mem_tracing_01=0 fi if [ "x${has_memkind}" = "xyes" ]; then if [ "x${enable_memkind}" = "x" ]; then @@ -2809,7 +2854,9 @@ main() | sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \ | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ | sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \ - | sed -e "s/@enable_packbuf_pools@/${enable_packbuf_pools_01}/g" \ + | sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \ + | sed -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \ + | sed -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \ | sed -e "s/@int_type_size@/${int_type_size}/g" \ | sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \ | sed -e "s/@enable_blas@/${enable_blas_01}/g" \ diff --git a/frame/1m/bli_l1m.h b/frame/1m/bli_l1m.h index fec4ff964..1e782cc68 100644 --- a/frame/1m/bli_l1m.h +++ b/frame/1m/bli_l1m.h @@ -66,6 +66,3 @@ #include "bli_packm.h" #include "bli_unpackm.h" -// Other -#include "bli_scalm.h" - diff --git a/frame/1m/scalm/bli_scalm.h b/frame/1m/other/bli_scalm.h similarity index 100% rename from frame/1m/scalm/bli_scalm.h rename to frame/1m/other/bli_scalm.h diff --git a/frame/1m/scalm/bli_scalm_cntl.c b/frame/1m/other/bli_scalm_cntl.c similarity index 100% rename from frame/1m/scalm/bli_scalm_cntl.c rename to frame/1m/other/bli_scalm_cntl.c diff --git a/frame/1m/scalm/bli_scalm_cntl.h b/frame/1m/other/bli_scalm_cntl.h similarity index 100% rename from frame/1m/scalm/bli_scalm_cntl.h rename to frame/1m/other/bli_scalm_cntl.h diff --git a/frame/1m/scalm/other/bli_scalm_int.c b/frame/1m/other/bli_scalm_int.c similarity index 100% rename from frame/1m/scalm/other/bli_scalm_int.c rename to frame/1m/other/bli_scalm_int.c diff --git a/frame/1m/scalm/other/bli_scalm_int.h b/frame/1m/other/bli_scalm_int.h similarity index 100% rename from frame/1m/scalm/other/bli_scalm_int.h rename to frame/1m/other/bli_scalm_int.h diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index 084bad458..12083f3be 
100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -37,6 +37,7 @@ cntl_t* bli_packm_cntl_create_node ( + rntm_t* rntm, void* var_func, void* packm_var_func, bszid_t bmid_m, @@ -52,12 +53,12 @@ cntl_t* bli_packm_cntl_create_node cntl_t* cntl; packm_params_t* params; - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_packm_cntl_create_node(): " ); #endif // Allocate a packm_params_t struct. - params = bli_malloc_intl( sizeof( packm_params_t ) ); + params = bli_sba_acquire( rntm, sizeof( packm_params_t ) ); // Initialize the packm_params_t struct. params->size = sizeof( packm_params_t ); @@ -70,7 +71,7 @@ cntl_t* bli_packm_cntl_create_node params->pack_schema = pack_schema; params->pack_buf_type = pack_buf_type; - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_packm_cntl_create_node(): " ); #endif @@ -80,6 +81,7 @@ cntl_t* bli_packm_cntl_create_node // sync with the cntl_t tree. cntl = bli_cntl_create_node ( + rntm, BLIS_NOID, BLIS_NO_PART, var_func, diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 4d3468861..e7a8511c7 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -90,6 +90,7 @@ static packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) cntl_t* bli_packm_cntl_create_node ( + rntm_t* rntm, void* var_func, void* packm_var_func, bszid_t bmid_m, diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c index ceb8da2f3..4fb3fcd31 100644 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ b/frame/1m/packm/bli_packm_thrinfo.c @@ -34,32 +34,6 @@ #include "blis.h" -#if 0 -thrinfo_t* bli_packm_thrinfo_create - ( - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ) -{ - thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); - - bli_thrinfo_init - ( - thread, - ocomm, ocomm_id, - n_way, - work_id, - FALSE, - sub_node - ); - - return thread; -} -#endif 
- void bli_packm_thrinfo_init ( thrinfo_t* thread, @@ -95,14 +69,3 @@ void bli_packm_thrinfo_init_single ); } -#if 0 -void bli_packm_thrinfo_free - ( - thrinfo_t* thread - ) -{ - if ( thread != NULL && - thread != &BLIS_PACKM_SINGLE_THREADED ) - bli_free_intl( thread ); -} -#endif diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 36af5f7aa..ac607bbe2 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -36,6 +36,7 @@ cntl_t* bli_unpackm_cntl_create_node ( + rntm_t* rntm, void* var_func, void* unpackm_var_func, cntl_t* sub_node @@ -44,6 +45,10 @@ cntl_t* bli_unpackm_cntl_create_node cntl_t* cntl; unpackm_params_t* params; + // NOTE: If this function is ever called, figure out whether the + // bli_malloc_intl() below needs to be changed to bli_sba_acquire(). + bli_abort(); + // Allocate an unpackm_params_t struct. params = bli_malloc_intl( sizeof( unpackm_params_t ) ); @@ -57,6 +62,7 @@ cntl_t* bli_unpackm_cntl_create_node // sync with the cntl_t tree. cntl = bli_cntl_create_node ( + rntm, BLIS_NOID, BLIS_NO_PART, var_func, diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index cf8dce5fe..c258eafd7 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -47,6 +47,7 @@ typedef struct unpackm_params_s unpackm_params_t; cntl_t* bli_unpackm_cntl_create_node ( + rntm_t* rntm, void* var_func, void* unpackm_var_func, cntl_t* sub_node diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 89998664a..efdca53db 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -44,27 +44,11 @@ void bli_l3_cntl_create_if obj_t* a, obj_t* b, obj_t* c, + rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ) { - // This is part of a hack to support mixed domain in bli_gemm_front(). 
- // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. Notice that we do this even if the - // caller passed in a custom control tree; that's because we still need - // to reset the pack schema of a and b, which were modified by the - // operation's _front() function. However, in order for this to work, - // the level-3 thread entry function (or omp parallel region) must - // alias thread-local copies of objects a and b. - //pack_t schema_a = bli_obj_pack_schema( a ); - //pack_t schema_b = bli_obj_pack_schema( b ); - - //bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - //bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); - // If the control tree pointer is NULL, we construct a default // tree as a function of the operation family. if ( cntl_orig == NULL ) @@ -73,7 +57,7 @@ void bli_l3_cntl_create_if family == BLIS_HERK || family == BLIS_TRMM ) { - *cntl_use = bli_gemm_cntl_create( family, schema_a, schema_b ); + *cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b ); } else // if ( family == BLIS_TRSM ) { @@ -82,7 +66,7 @@ void bli_l3_cntl_create_if if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; else side = BLIS_RIGHT; - *cntl_use = bli_trsm_cntl_create( side, schema_a, schema_b ); + *cntl_use = bli_trsm_cntl_create( rntm, side, schema_a, schema_b ); } } else @@ -90,7 +74,7 @@ void bli_l3_cntl_create_if // If the user provided a control tree, create a copy and use it // instead (so that threads can use its local tree as a place to // cache things like pack mem_t entries). - *cntl_use = bli_cntl_copy( cntl_orig ); + *cntl_use = bli_cntl_copy( rntm, cntl_orig ); // Recursively set the family fields of the newly copied control tree // nodes. 
@@ -100,6 +84,7 @@ void bli_l3_cntl_create_if void bli_l3_cntl_free ( + rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ) @@ -115,11 +100,11 @@ void bli_l3_cntl_free family == BLIS_HERK || family == BLIS_TRMM ) { - bli_gemm_cntl_free( cntl_use, thread ); + bli_gemm_cntl_free( rntm, cntl_use, thread ); } else // if ( family == BLIS_TRSM ) { - bli_trsm_cntl_free( cntl_use, thread ); + bli_trsm_cntl_free( rntm, cntl_use, thread ); } } diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index 8642d8e2d..0c04f348c 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -46,12 +46,14 @@ void bli_l3_cntl_create_if obj_t* a, obj_t* b, obj_t* c, + rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ); void bli_l3_cntl_free ( + rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index 982af3d1d..bfb066bfb 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -45,7 +45,6 @@ void bli_l3_packm thrinfo_t* thread ) { - membrk_t* membrk; packbuf_t pack_buf_type; mem_t* cntl_mem_p; siz_t size_needed; @@ -70,9 +69,6 @@ void bli_l3_packm // return early. if ( size_needed == 0 ) return; - // Query the memory broker from the context. - membrk = bli_cntx_get_membrk( cntx ); - // Query the pack buffer type from the control tree node. pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); @@ -89,7 +85,7 @@ void bli_l3_packm if ( bli_thread_am_ochief( thread ) ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_packm(): acquiring mem pool block\n" ); #endif @@ -97,7 +93,7 @@ void bli_l3_packm // and saves the associated mem_t entry to local_mem_s. bli_membrk_acquire_m ( - membrk, + rntm, size_needed, pack_buf_type, &local_mem_s @@ -134,10 +130,14 @@ void bli_l3_packm // The chief thread releases the existing block associated with // the mem_t entry in the control tree, and then re-acquires a // new block, saving the associated mem_t entry to local_mem_s. 
- bli_membrk_release( cntl_mem_p ); + bli_membrk_release + ( + rntm, + cntl_mem_p + ); bli_membrk_acquire_m ( - membrk, + rntm, size_needed, pack_buf_type, &local_mem_s diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index e67f3c407..5a6228609 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -88,37 +88,11 @@ void bli_l3_thrinfo_init_single void bli_l3_thrinfo_free ( + rntm_t* rntm, thrinfo_t* thread ) { - if ( thread == NULL || - thread == &BLIS_PACKM_SINGLE_THREADED || - thread == &BLIS_GEMM_SINGLE_THREADED - ) return; - - thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); - - // Free the communicators, but only if the current thrinfo_t struct - // is marked as needing them to be freed. The most common example of - // thrinfo_t nodes NOT marked as needing their comms freed are those - // associated with packm thrinfo_t nodes. - if ( bli_thrinfo_needs_free_comm( thread ) ) - { - // The ochief always frees his communicator, and the ichief free its - // communicator if we are at the leaf node. - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( bli_thrinfo_ocomm( thread ) ); - } - - // Free all children of the current thrinfo_t. - bli_l3_thrinfo_free( thrinfo_sub_node ); - - #ifdef ENABLE_MEM_DEBUG - printf( "bli_l3_thrinfo_free(): " ); - #endif - - // Free the thrinfo_t struct. - bli_free_intl( thread ); + bli_thrinfo_free( rntm, thread ); } // ----------------------------------------------------------------------------- @@ -149,6 +123,7 @@ void bli_l3_thrinfo_create_root // Create the root thrinfo_t node. 
*thread = bli_thrinfo_create ( + rntm, gl_comm, gl_comm_id, xx_way, @@ -348,6 +323,7 @@ void bli_l3_thrinfo_print_paths void bli_l3_thrinfo_free_paths ( + rntm_t* rntm, thrinfo_t** threads ) { @@ -355,7 +331,7 @@ void bli_l3_thrinfo_free_paths dim_t i; for ( i = 0; i < n_threads; ++i ) - bli_l3_thrinfo_free( threads[i] ); + bli_l3_thrinfo_free( rntm, threads[i] ); bli_free_intl( threads ); } diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 773fa64c4..72a4f77e2 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -89,6 +89,7 @@ void bli_l3_thrinfo_init_single void bli_l3_thrinfo_free ( + rntm_t* rntm, thrinfo_t* thread ); @@ -112,6 +113,7 @@ void bli_l3_thrinfo_print_paths void bli_l3_thrinfo_free_paths ( + rntm_t* rntm, thrinfo_t** threads ); diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 103446006..67c71e798 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -37,21 +37,23 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family, - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + opid_t family, + pack_t schema_a, + pack_t schema_b ) { - return bli_gemmbp_cntl_create( family, schema_a, schema_b ); + return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family, - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + opid_t family, + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_fp; @@ -71,6 +73,7 @@ cntl_t* bli_gemmbp_cntl_create // Create two nodes for the macro-kernel. 
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( + rntm, // the thread's runtime structure family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -79,6 +82,7 @@ cntl_t* bli_gemmbp_cntl_create cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node ( + rntm, // the thread's runtime structure family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_fp, @@ -88,6 +92,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( + rntm, bli_gemm_packa, // pack the left-hand operand packa_fp, BLIS_MR, @@ -103,6 +108,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the m dimension by MC. cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node ( + rntm, family, BLIS_MC, bli_gemm_blk_var1, @@ -112,6 +118,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for packing matrix B. cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( + rntm, bli_gemm_packb, // pack the right-hand operand packb_fp, BLIS_KR, @@ -127,6 +134,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( + rntm, family, BLIS_KC, bli_gemm_blk_var3, @@ -136,6 +144,7 @@ cntl_t* bli_gemmbp_cntl_create // Create a node for partitioning the n dimension by NC. 
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( + rntm, family, BLIS_NC, bli_gemm_blk_var2, @@ -246,23 +255,25 @@ cntl_t* bli_gemmpb_cntl_create void bli_gemm_cntl_free ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ) { - bli_cntl_free( cntl, thread ); + bli_cntl_free( rntm, cntl, thread ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( + rntm_t* rntm, opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 8ca2104b0..babb245f7 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -34,18 +34,20 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family, - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + opid_t family, + pack_t schema_a, + pack_t schema_b ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family, - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + opid_t family, + pack_t schema_a, + pack_t schema_b ); #if 0 @@ -59,7 +61,8 @@ cntl_t* bli_gemmpb_cntl_create void bli_gemm_cntl_free ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ); @@ -67,6 +70,7 @@ void bli_gemm_cntl_free cntl_t* bli_gemm_cntl_create_node ( + rntm_t* rntm, opid_t family, bszid_t bszid, void* var_func, diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 6a2d568bb..8ae6f6da2 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -37,21 +37,23 @@ cntl_t* bli_trsm_cntl_create ( - side_t side, - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + side_t side, + pack_t schema_a, + pack_t schema_b ) { if ( bli_is_left( side ) ) - return bli_trsm_l_cntl_create( schema_a, schema_b ); + 
return bli_trsm_l_cntl_create( rntm, schema_a, schema_b ); else - return bli_trsm_r_cntl_create( schema_a, schema_b ); + return bli_trsm_r_cntl_create( rntm, schema_a, schema_b ); } cntl_t* bli_trsm_l_cntl_create ( - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p; @@ -70,6 +72,7 @@ cntl_t* bli_trsm_l_cntl_create // Create two nodes for the macro-kernel. cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + rntm, // the thread's runtime structure family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -78,6 +81,7 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + rntm, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, @@ -87,6 +91,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( + rntm, bli_trsm_packa, packa_fp, BLIS_MR, @@ -102,6 +107,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for partitioning the m dimension by MC. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + rntm, family, BLIS_MC, bli_trsm_blk_var1, @@ -111,6 +117,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for packing matrix B. cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( + rntm, bli_trsm_packb, packb_fp, BLIS_MR, @@ -126,6 +133,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + rntm, family, BLIS_KC, bli_trsm_blk_var3, @@ -135,6 +143,7 @@ cntl_t* bli_trsm_l_cntl_create // Create a node for partitioning the n dimension by NC. 
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + rntm, family, BLIS_NC, bli_trsm_blk_var2, @@ -146,8 +155,9 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + pack_t schema_a, + pack_t schema_b ) { // NOTE: trsm macrokernels are presently disabled for right-side execution. @@ -161,6 +171,7 @@ cntl_t* bli_trsm_r_cntl_create // Create two nodes for the macro-kernel. cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( + rntm, family, BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used @@ -169,6 +180,7 @@ cntl_t* bli_trsm_r_cntl_create cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( + rntm, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, @@ -178,6 +190,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( + rntm, bli_trsm_packa, packa_fp, BLIS_NR, @@ -193,6 +206,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the m dimension by MC. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( + rntm, family, BLIS_MC, bli_trsm_blk_var1, @@ -202,6 +216,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for packing matrix B. cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( + rntm, bli_trsm_packb, packb_fp, BLIS_MR, @@ -217,6 +232,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the k dimension by KC. cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( + rntm, family, BLIS_KC, bli_trsm_blk_var3, @@ -226,6 +242,7 @@ cntl_t* bli_trsm_r_cntl_create // Create a node for partitioning the n dimension by NC. 
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( + rntm, family, BLIS_NC, bli_trsm_blk_var2, @@ -237,23 +254,25 @@ cntl_t* bli_trsm_r_cntl_create void bli_trsm_cntl_free ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ) { - bli_cntl_free( cntl, thread ); + bli_cntl_free( rntm, cntl, thread ); } // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( + rntm_t* rntm, opid_t family, bszid_t bszid, void* var_func, cntl_t* sub_node ) { - return bli_cntl_create_node( family, bszid, var_func, NULL, sub_node ); + return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 25d50fe80..7769d2674 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -34,26 +34,30 @@ cntl_t* bli_trsm_cntl_create ( - side_t side, - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + side_t side, + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_l_cntl_create ( - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_r_cntl_create ( - pack_t schema_a, - pack_t schema_b + rntm_t* rntm, + pack_t schema_a, + pack_t schema_b ); void bli_trsm_cntl_free ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ); @@ -61,6 +65,7 @@ void bli_trsm_cntl_free cntl_t* bli_trsm_cntl_create_node ( + rntm_t* rntm, opid_t family, bszid_t bszid, void* var_func, diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c new file mode 100644 index 000000000..542c4275a --- /dev/null +++ b/frame/base/bli_apool.c @@ -0,0 +1,563 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_apool_init + ( + malloc_ft malloc_fp, + free_ft free_fp, + apool_t* restrict apool + ) +{ + // Query the mutex from the apool_t. + bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + + // Initialize the mutex. 
+ //*mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; + bli_pthread_mutex_init( mutex, NULL ); + + // We choose to start with: + // - an empty pool + // - an initial block_ptrs_len of 8 + // - a single element in each initial array_t (though this is moot with + // num_blocks = 0). + const siz_t num_blocks = 0; + siz_t block_ptrs_len = 8; + const siz_t num_elem = 1; + + // NOTE: Unlike in the bli_pool API, apool_t allocates block_ptrs as an + // array of array_t* instead of an array of pblk_t. Why? We don't need to + // track the size of each block, thus we don't need the block_size field + // of pblk_t. That leaves only the void* field, and since we know apool_t + // will always contain "blocks" that are really array_t structs, we can + // make block_ptrs an array of array_t*. + + // We formally set the block_size and align_size fields of the underlying + // pool, even though they won't be queried. (They are used from hard-coded + // values in bli_apool_alloc_block().) + const siz_t block_size = sizeof( array_t ); + const siz_t align_size = 64; + + // Query the underlying pool_t from the apool_t. + pool_t* restrict pool = bli_apool_pool( apool ); + + // Set the default array_t length of the apool_t. + bli_apool_set_def_array_len( num_elem, apool ); + + // ------------------------------------------------------------------------- + + // Make sure that block_ptrs_len is at least num_blocks. + block_ptrs_len = bli_max( block_ptrs_len, num_blocks ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_init(): allocating block_ptrs (length %d): ", + ( int )block_ptrs_len ); + #endif + + // Allocate the block_ptrs array. + array_t** restrict block_ptrs + = + bli_malloc_intl( block_ptrs_len * sizeof( array_t* ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_init(): allocating %d array_t.\n", ( int )num_blocks ); + fflush( stdout ); + #endif + + // Allocate and initialize each entry in the block_ptrs array. 
+ for ( dim_t i = 0; i < num_blocks; ++i ) + { + // Pass in num_elem so the function knows how many elements to + // initially have in each array_t. + bli_apool_alloc_block + ( + num_elem, + malloc_fp, + &(block_ptrs[i]) + ); + } + + // NOTE: The semantics of top_index approximate a stack, where a "full" + // stack (no blocks checked out) is one where top_index == 0 and an empty + // stack (all blocks checked out) one where top_index == num_blocks. + // (Here, num_blocks tracks the number of blocks currently allocated as + // part of the pool.) This "orientation" of the stack was chosen + // intentionally, in contrast to one where top_index == -1 means the + // stack is empty and top_index = num_blocks - 1 means the stack is + // full. The chosen scheme allows one to conceptualize the stack as a + // number line in which blocks are checked out from lowest to highest, + // and additional blocks are added at the higher end. + + // Initialize the pool_t structure. + bli_pool_set_block_ptrs( block_ptrs, pool ); + bli_pool_set_block_ptrs_len( block_ptrs_len, pool ); + bli_pool_set_top_index( 0, pool ); + bli_pool_set_num_blocks( num_blocks, pool ); + bli_pool_set_block_size( block_size, pool ); + bli_pool_set_align_size( align_size, pool ); + bli_pool_set_malloc_fp( malloc_fp, pool ); + bli_pool_set_free_fp( free_fp, pool ); +} + +void bli_apool_alloc_block + ( + siz_t num_elem, + malloc_ft malloc_fp, + array_t** restrict array_p + ) +{ + // Since the apool_t is defined as a pool of array_t, we can hard-code + // the block_size and align_size parameters. For the align_size, we + // use the size of a cache line. 
+ const siz_t block_size = sizeof( array_t ); + //const siz_t align_size = 64; + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_alloc_block(): allocating array_t: " ); + #endif + + // Allocate the array_t via bli_malloc_intl(). NOTE: The aligned + // bli_fmalloc_align() call is commented out below, so malloc_fp and a + // cache-line align_size are currently not used for this allocation. + array_t* restrict array + = + //bli_fmalloc_align( malloc_fp, block_size, align_size ); + bli_malloc_intl( block_size ); + + // Initialize an array_t struct within the newly allocated memory region. + bli_array_init( num_elem, sizeof( pool_t* ), array ); + + // Save the pointer in the caller's array_t*. + *array_p = array; +} + +void bli_apool_free_block + ( + free_ft free_fp, + array_t* restrict array + ) +{ + const siz_t num_elem = bli_array_num_elem( array ); + pool_t** restrict buf = bli_array_buf( array ); + + // Step through the array and finalize each pool_t. + for ( dim_t i = 0; i < num_elem; ++i ) + { + pool_t* restrict pool = buf[ i ]; + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_free_block(): freeing pool_t %d within array_t.\n", + ( int )i ); + fflush( stdout ); + #endif + + // Finalize and free the current pool_t, if it was created/allocated. + if ( pool != NULL ) + { + // Finalize the pool. + bli_pool_finalize( pool ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_free_block(): pool_t %d: ", ( int )i ); + #endif + + // Free the pool_t struct. + bli_free_intl( pool ); + } + } + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_free_block(): " ); + #endif + + // Free the array buffer. + bli_array_finalize( array ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_free_block(): freeing array_t: " ); + #endif + + // Free the array. + //bli_ffree_align( free_fp, array ); + bli_free_intl( array ); +} + +void bli_apool_finalize + ( + apool_t* restrict apool + ) +{ + // Query the mutex from the apool_t. 
+ bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + + // Destroy the mutex. + bli_pthread_mutex_destroy( mutex ); + + // Query the underlying pool_t and mutex from the apool_t. + pool_t* restrict pool = bli_apool_pool( apool ); + + // ---------------------------------------------------------------- + + // Query the block_ptrs array. + array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + + // Query the total number of blocks currently allocated. + siz_t num_blocks = bli_pool_num_blocks( pool ); + + // Query the top_index of the pool. + siz_t top_index = bli_pool_top_index( pool ); + + // Sanity check: The top_index should be zero. + if ( top_index != 0 ) bli_abort(); + + // Query the free() function pointer for the pool. + free_ft free_fp = bli_pool_free_fp( pool ); + + // Free the individual blocks (each an array_t) currently in the pool. + for ( dim_t i = 0; i < num_blocks; ++i ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_finalize(): freeing array_t %d within apool_t.\n", + ( int )i ); + fflush( stdout ); + #endif + + bli_apool_free_block( free_fp, block_ptrs[i] ); + } + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_finalize(): freeing block_ptrs (length %d): ", + ( int )( bli_pool_block_ptrs_len( pool ) ) ); + #endif + + // Free the block_ptrs array. + bli_free_intl( block_ptrs ); +} + +array_t* bli_apool_checkout_array + ( + siz_t n_threads, + apool_t* restrict apool + ) +{ + // Acquire the apool_t's mutex. + bli_apool_lock( apool ); + + // ---------------------------------------------------------------------------- + + // NOTE: Unlike with the bli_pool API, we do not need to handle potential + // reinitialization since the apool_t's block_size (corresponding to the + // size of an array_t struct) will never grow. + + // If the apool_t is exhausted, add a block (e.g. an array_t). 
+ if ( bli_apool_is_exhausted( apool ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_checkout_block(): apool_t is exhausted; " + "growing by 1 array_t.\n" ); + fflush( stdout ); + #endif + + bli_apool_grow( 1, apool ); + } + + // At this point, at least one array_t is guaranteed to be available. + + // Query the underlying pool_t from the apool_t. + pool_t* restrict pool = bli_apool_pool( apool ); + + // Query the block_ptrs array. + array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + + // Query the top_index of the pool. + const siz_t top_index = bli_pool_top_index( pool ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_checkout_array(): checking out array_t %d.\n", + ( int )top_index ); + fflush( stdout ); + #endif + + // Select the array_t* at top_index to return to the caller. + array_t* restrict array = block_ptrs[ top_index ]; + + // Increment the pool's top_index. + bli_pool_set_top_index( top_index + 1, pool ); + + // ---------------------------------------------------------------------------- + + // Release the apool_t's mutex. + bli_apool_unlock( apool ); + + // Resize the array_t according to the number of threads specified by the + // caller. (We need one element in the array_t per thread.) + bli_array_resize( n_threads, array ); + + // Return the selected array_t*. + return array; +} + +void bli_apool_checkin_array + ( + array_t* restrict array, + apool_t* restrict apool + ) +{ + // Acquire the apool_t's mutex. + bli_apool_lock( apool ); + + // Query the underlying pool_t from the apool_t. + pool_t* restrict pool = bli_apool_pool( apool ); + + // ---------------------------------------------------------------------------- + + // NOTE: Unlike with the bli_pool API, we do not need to handle potential + // freeing of the blocks upon checkin due to the block_size having since + // changed due to reinitialization since the apool's block_size will never + // change. + + // Query the block_ptrs array. 
+ array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + + // Query the top_index of the pool. + const siz_t top_index = bli_pool_top_index( pool ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_checkin_block(): checking in array_t %d.\n", + ( int )top_index - 1 ); + fflush( stdout ); + #endif + + // Copy the caller's array_t address to the element at top_index - 1. + block_ptrs[ top_index - 1 ] = array; + + // Decrement the pool's top_index. + bli_pool_set_top_index( top_index - 1, pool ); + + // ---------------------------------------------------------------------------- + + // Release the apool_t's mutex. + bli_apool_unlock( apool ); +} + +pool_t* bli_apool_array_elem + ( + siz_t index, + array_t* restrict array + ) +{ + // Query the array element corresponding to index. + // NOTE: If we knew that the array_t contained elements of size + // sizeof( void* ) or sizeof( whatever ), we could return the *value* + // stored in the array. But since array_t is general-purpose, it can't + // return the element itself. So instead, bli_array_elem() returns the + // address of the element in the array. Since the elements that apool_t + // stores in the array_t are pool_t*, that means that the function is + // actually returning the address of a pool_t*, or pool_t**, hence the + // dereferencing below. + pool_t** restrict pool_p = bli_array_elem( index, array ); + pool_t* pool = *pool_p; + + // If the element is NULL, then it means a pool_t has not yet been created + // and allocated for the given index (thread id). + if ( pool == NULL ) + { + // Settle on the parameters to use when initializing the pool_t for + // the current index within the array_t. 
+ const siz_t num_blocks = 1; + const siz_t block_ptrs_len = 10; + const siz_t align_size = 16; + malloc_ft malloc_fp = BLIS_MALLOC_INTL; + free_ft free_fp = BLIS_FREE_INTL; + + // Each small block pool should contain blocks large enough to + // accommodate any of the data structures for which they will be + // used. + const siz_t n_sizes = 4; + siz_t sizes[4] = { sizeof( cntl_t ), + sizeof( packm_params_t ), + sizeof( thrcomm_t ), + sizeof( thrinfo_t ) }; + siz_t block_size = 0; + + // Find the largest of the sizes above and use that as the block_size + // for the pool. + for ( dim_t i = 0; i < n_sizes; ++i ) + { + if ( block_size < sizes[i] ) block_size = sizes[i]; + } + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_array_elem(): pool_t for tid %d is NULL; allocating pool_t.\n", + ( int )index ); + printf( "bli_apool_array_elem(): allocating pool_t: " ); + #endif + + // Allocate the pool_t. + pool = bli_malloc_intl( sizeof( pool_t ) ); + + // Initialize the pool_t. + bli_pool_init + ( + num_blocks, + block_ptrs_len, + block_size, + align_size, + malloc_fp, + free_fp, + pool + ); + + // Update the array element with the address to the new pool_t. + // NOTE: We pass in the address of the pool_t* since the bli_array + // API is generalized for arbitrarily-sized elements, and therefore + // it must always take the address of the data, rather than the + // value (which it can only do if the elem size were fixed). + bli_array_set_elem( &pool, index, array ); + } + + // The array element is now guaranteed to refer to an allocated and + // initialized pool_t. + + // Return the array element. + return pool; +} + +void bli_apool_grow + ( + siz_t num_blocks_add, + apool_t* restrict apool + ) +{ + // If the requested increase is zero, return early. + if ( num_blocks_add == 0 ) return; + + // Query the underlying pool_t from the apool_t. + pool_t* restrict pool = bli_apool_pool( apool ); + + // Query the default initial array length from the apool_t. 
+ const siz_t num_elem = bli_apool_def_array_len( apool ); + + // ---------------------------------------------------------------------------- + + // Query the allocated length of the block_ptrs array and also the + // total number of blocks currently allocated. + const siz_t block_ptrs_len_cur = bli_pool_block_ptrs_len( pool ); + const siz_t num_blocks_cur = bli_pool_num_blocks( pool ); + + // Compute the total number of allocated blocks that will exist + // after we grow the pool. + const siz_t num_blocks_new = num_blocks_cur + num_blocks_add; + + // If adding num_blocks_add new blocks will exceed the current capacity + // of the block_ptrs array, we need to first put in place a new (larger) + // array. + if ( block_ptrs_len_cur < num_blocks_new ) + { + // To prevent this from happening often, we double the current + // length of the block_ptrs array. + const siz_t block_ptrs_len_new = 2 * block_ptrs_len_cur; + + // Query the current block_ptrs array. + array_t** restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_grow(): growing block_ptrs_len (%d -> %d): ", + ( int )block_ptrs_len_cur, ( int )block_ptrs_len_new ); + #endif + + // Allocate a new block_ptrs array. + array_t** restrict block_ptrs_new + = + bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ) ); + + // Query the top_index of the pool. + const siz_t top_index = bli_pool_top_index( pool ); + + // Copy the contents of the old block_ptrs array to the new/resized + // array. Notice that we can begin with top_index since all entries + // from 0 to top_index-1 have been (and are currently) checked out + // to threads. + for ( dim_t i = top_index; i < num_blocks_cur; ++i ) + { + block_ptrs_new[i] = block_ptrs_cur[i]; + } + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_grow(): freeing prev block_ptrs: " ); + #endif + + // Free the old block_ptrs array. 
+ bli_free_intl( block_ptrs_cur ); + + // Update the pool_t struct with the new block_ptrs array and + // record its allocated length. + bli_pool_set_block_ptrs( block_ptrs_new, pool ); + bli_pool_set_block_ptrs_len( block_ptrs_len_new, pool ); + } + + // At this point, we are guaranteed to have enough unused elements + // in the block_ptrs array to accommodate an additional num_blocks_add + // blocks. + + // Query the current block_ptrs array (which was maybe just resized). + array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + + // Query the malloc() function pointer for the pool. + malloc_ft malloc_fp = bli_pool_malloc_fp( pool ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_apool_grow(): growing apool_t (%d -> %d).\n", + ( int )num_blocks_cur, ( int )num_blocks_new ); + fflush( stdout ); + #endif + + // Allocate the requested additional blocks in the resized array. + for ( dim_t i = num_blocks_cur; i < num_blocks_new; ++i ) + { + bli_apool_alloc_block + ( + num_elem, + malloc_fp, + &(block_ptrs[i]) + ); + } + + // Update the pool_t struct with the new number of allocated blocks. + // Notice that top_index remains unchanged, as do the block_size and + // align_size fields. + bli_pool_set_num_blocks( num_blocks_new, pool ); +} + diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h new file mode 100644 index 000000000..ddbceb9a2 --- /dev/null +++ b/frame/base/bli_apool.h @@ -0,0 +1,145 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_APOOL_H +#define BLIS_APOOL_H + +// -- Locked pool-of-arrays type -- + +/* +typedef struct +{ + bli_pthread_mutex_t mutex; + pool_t pool; + + siz_t def_array_len; + +} apool_t; +*/ + + +// apool entry query + +static pool_t* bli_apool_pool( apool_t* apool ) +{ + return &(apool->pool); +} + +static bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) +{ + return &(apool->mutex); +} + +static siz_t bli_apool_def_array_len( apool_t* pool ) +{ + return pool->def_array_len; +} + +static bool_t bli_apool_is_exhausted( apool_t* apool ) +{ + pool_t* restrict pool = bli_apool_pool( apool ); + + return bli_pool_is_exhausted( pool ); +} + +// apool action + +static void bli_apool_lock( apool_t* apool ) +{ + bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); +} + +static void bli_apool_unlock( apool_t* apool ) +{ + bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); +} + +// apool entry modification + +static void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ +{ + pool->def_array_len = def_array_len; +} + +// ----------------------------------------------------------------------------- + +void bli_apool_init + ( + malloc_ft malloc_fp, + free_ft free_fp, + apool_t* restrict apool + ); +void bli_apool_finalize + ( + apool_t* restrict apool + ); + +array_t* bli_apool_checkout_array + ( + siz_t n_threads, + apool_t* restrict apool + ); +void bli_apool_checkin_array + ( + array_t* restrict array, + apool_t* restrict apool + ); + +pool_t* bli_apool_array_elem + ( + siz_t index, + array_t* restrict array + ); + +void bli_apool_grow + ( + siz_t num_blocks_add, + apool_t* restrict apool + ); + +void bli_apool_alloc_block + ( + siz_t num_elem, + malloc_ft malloc_fp, + array_t** restrict array_p + ); +void bli_apool_free_block + ( + free_ft free_fp, + array_t* restrict array + ); + + +#endif + diff --git a/frame/base/bli_array.c b/frame/base/bli_array.c new file mode 100644 index 000000000..3f167056e --- /dev/null +++ 
b/frame/base/bli_array.c @@ -0,0 +1,204 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +//#define BLIS_ENABLE_MEM_TRACING + +void bli_array_init + ( + const siz_t num_elem, + const siz_t elem_size, + array_t* restrict array + ) +{ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_array_init(): allocating array [%d * %d]: ", + ( int )num_elem, ( int )elem_size ); + #endif + + // Compute the total size (in bytes) of the array. + const size_t array_size = num_elem * elem_size; + + // Allocate the array buffer. + void* restrict buf = bli_malloc_intl( array_size ); + + // Initialize the array elements to zero. THIS IS IMPORTANT because + // consumer threads will use the NULL-ness of the array elements to + // determine if the corresponding block (data structure) needs to be + // created/allocated and initialized. + memset( buf, 0, array_size ); + + // Initialize the array_t structure. + bli_array_set_buf( buf, array ); + bli_array_set_num_elem( num_elem, array ); + bli_array_set_elem_size( elem_size, array ); +} + +void bli_array_resize + ( + const siz_t num_elem_new, + array_t* restrict array + ) +{ + // Query the number of elements in the array. + const siz_t num_elem_prev = bli_array_num_elem( array ); + + // If the new requested size (number of elements) is less than or equal to + // the current size, no action is needed; return early. + if ( num_elem_new <= num_elem_prev ) return; + + // At this point, we know that num_elem_prev < num_elem_new, which means + // we need to proceed with the resizing. + + // Query the size of each element in the array. + const siz_t elem_size = bli_array_elem_size( array ); + + // Compute the total size (in bytes) of the array before and after resizing. + const size_t array_size_prev = num_elem_prev * elem_size; + const size_t array_size_new = num_elem_new * elem_size; + + // Query the previous array buffer. 
+ void* restrict buf_prev = bli_array_buf( array ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_array_resize(): allocating array [%d * %d]: ", + ( int )num_elem_new, ( int )elem_size ); + #endif + + // Allocate a new array buffer. + char* restrict buf_new = bli_malloc_intl( array_size_new ); + + // Copy the previous array contents to the new array. + memcpy( buf_new, buf_prev, array_size_prev ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_array_resize(): freeing array [%d * %d]: ", + ( int )num_elem_prev, ( int )elem_size ); + #endif + + // Now that the elements have been copied over to the new buffer, we can + // free the previous array buffer. + bli_free_intl( buf_prev ); + + // Initialize the new elements' contents to zero. (Note that we advance + // the new buffer address by the size of the previous array so that we + // arrive at the first byte of the new segment.) + memset( &buf_new[ array_size_prev ], 0, array_size_new - array_size_prev ); + + // Update the array_t structure. + // NOTE: The array elem_size field does not need updating. + bli_array_set_buf( buf_new, array ); + bli_array_set_num_elem( num_elem_new, array ); +} + +void bli_array_finalize + ( + array_t* restrict array + ) +{ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_array_finalize(): freeing buf (length %d): ", + ( int )bli_array_num_elem( array ) ); + #endif + + // Query the buffer from the array. + void* restrict buf = bli_array_buf( array ); + + // Free the buffer. + bli_free_intl( buf ); +} + +void* bli_array_elem + ( + const siz_t index, + array_t* restrict array + ) +{ + // Query the number of elements in the array. + const siz_t num_elem = bli_array_num_elem( array ); + + // Sanity check: disallow access beyond the bounds of the array. + if ( num_elem <= index ) bli_abort(); + + // Query the size of each element in the array. 
+ const siz_t elem_size = bli_array_elem_size( array ); + + // Query the buffer from the array, but store it as a char* so we can use + // it to easily perform byte pointer arithmetic. + char* restrict buf = bli_array_buf( array ); + + // Advance the pointer by (index * elem_size) bytes. + buf += index * elem_size; + + // Return the address of the element computed above. + return ( void* )buf; +} + +void bli_array_set_elem + ( + void* restrict elem, + const siz_t index, + array_t* restrict array + ) +{ + // Query the size of each element in the array. + const siz_t elem_size = bli_array_elem_size( array ); + + // Query the buffer from the array as a char*. + char* restrict buf = bli_array_buf( array ); + + if ( elem_size == sizeof( void* ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_array_set_elem(): elem_size is %d; setting index %d.\n", + ( int )elem_size, ( int )index ); + fflush( stdout ); + #endif + + // Special case: Handle elem_size = sizeof( void* ) without calling + // memcpy(). + void** restrict buf_vvp = ( void** )buf; + void** restrict elem_vvp = ( void** )elem; + + buf_vvp[ index ] = *elem_vvp; + } + else + { + // General case: Copy the elem_size bytes from elem to buf at the + // element index specified by index. + memcpy( &buf[ index * elem_size ], elem, ( size_t )elem_size ); + } +} + diff --git a/frame/base/bli_array.h b/frame/base/bli_array.h new file mode 100644 index 000000000..e3070ae67 --- /dev/null +++ b/frame/base/bli_array.h @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_ARRAY_H +#define BLIS_ARRAY_H + +// -- Array type -- + +/* +typedef struct +{ + void* buf; + + siz_t num_elem; + siz_t elem_size; + +} array_t; +*/ + + +// Array entry query + +static void* bli_array_buf( array_t* array ) +{ + return array->buf; +} + +static siz_t bli_array_num_elem( array_t* array ) +{ + return array->num_elem; +} + +static siz_t bli_array_elem_size( array_t* array ) +{ + return array->elem_size; +} + +// Array entry modification + +static void bli_array_set_buf( void* buf, array_t* array ) \ +{ + array->buf = buf; +} + +static void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ +{ + array->num_elem = num_elem; +} + +static void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ +{ + array->elem_size = elem_size; +} + +// ----------------------------------------------------------------------------- + +void bli_array_init + ( + const siz_t num_elem, + const siz_t elem_size, + array_t* restrict array + ); +void bli_array_resize + ( + const siz_t num_elem_new, + array_t* restrict array + ); +void bli_array_finalize + ( + array_t* restrict array + ); + +void* bli_array_elem + ( + const siz_t index, + array_t* restrict array + ); +void bli_array_set_elem + ( + void* restrict elem, + const siz_t index, + array_t* restrict array + ); + +#endif + diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c index 2ca606ec7..ac434a0c5 100644 --- a/frame/base/bli_check.c +++ b/frame/base/bli_check.c @@ -776,7 +776,20 @@ err_t bli_check_object_buffer( obj_t* a ) return e_val; } -// -- Memory allocator checks -------------------------------------------------- +// -- Memory checks ------------------------------------------------------------ + +err_t bli_check_valid_malloc_buf( void* ptr ) +{ + err_t e_val = BLIS_SUCCESS; + + if ( ptr == NULL ) + e_val = BLIS_MALLOC_RETURNED_NULL; + + return e_val; +} + + +// -- Internal memory pool checks ---------------------------------------------- err_t bli_check_valid_packbuf( 
packbuf_t buf_type ) { diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h index df14d81d4..4bb3807d3 100644 --- a/frame/base/bli_check.h +++ b/frame/base/bli_check.h @@ -98,6 +98,8 @@ err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); +err_t bli_check_valid_malloc_buf( void* ptr ); + err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( num_t dt, cntx_t* cntx ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index f7565fc70..9020ae8b4 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -37,6 +37,7 @@ cntl_t* bli_cntl_create_node ( + rntm_t* rntm, opid_t family, bszid_t bszid, void* var_func, @@ -47,12 +48,12 @@ cntl_t* bli_cntl_create_node cntl_t* cntl; mem_t* pack_mem; - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_create_node(): " ); #endif // Allocate the cntl_t struct. - cntl = bli_malloc_intl( sizeof( cntl_t ) ); + cntl = bli_sba_acquire( rntm, sizeof( cntl_t ) ); bli_cntl_set_family( family, cntl ); bli_cntl_set_bszid( bszid, cntl ); @@ -72,14 +73,15 @@ cntl_t* bli_cntl_create_node void bli_cntl_free_node ( + rntm_t* rntm, cntl_t* cntl ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_free_node(): " ); #endif - bli_free_intl( cntl ); + bli_sba_release( rntm, cntl ); } void bli_cntl_clear_node @@ -105,17 +107,19 @@ void bli_cntl_clear_node void bli_cntl_free ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ) { - if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread ); - else bli_cntl_free_wo_thrinfo( cntl ); + if ( thread != NULL ) bli_cntl_free_w_thrinfo( rntm, cntl, thread ); + else bli_cntl_free_wo_thrinfo( rntm, cntl ); } void bli_cntl_free_w_thrinfo ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ) { @@ -133,17 +137,17 @@ void bli_cntl_free_w_thrinfo { // Recursively free all 
memory associated with the sub-node and its // children. - bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node ); + bli_cntl_free_w_thrinfo( rntm, cntl_sub_node, thread_sub_node ); } // Free the current node's params field, if it is non-NULL. if ( cntl_params != NULL ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_free_w_thrinfo(): " ); #endif - bli_free_intl( cntl_params ); + bli_sba_release( rntm, cntl_params ); } // Release the current node's pack mem_t entry back to the memory @@ -152,19 +156,20 @@ void bli_cntl_free_w_thrinfo if ( bli_thread_am_ochief( thread ) ) if ( bli_mem_is_alloc( cntl_pack_mem ) ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" ); #endif - bli_membrk_release( cntl_pack_mem ); + bli_membrk_release( rntm, cntl_pack_mem ); } // Free the current node. - bli_cntl_free_node( cntl ); + bli_cntl_free_node( rntm, cntl ); } void bli_cntl_free_wo_thrinfo ( + rntm_t* rntm, cntl_t* cntl ) { @@ -178,13 +183,13 @@ void bli_cntl_free_wo_thrinfo { // Recursively free all memory associated with the sub-node and its // children. - bli_cntl_free_wo_thrinfo( cntl_sub_node ); + bli_cntl_free_wo_thrinfo( rntm, cntl_sub_node ); } // Free the current node's params field, if it is non-NULL. if ( cntl_params != NULL ) { - bli_free_intl( cntl_params ); + bli_sba_release( rntm, cntl_params ); } // Release the current node's pack mem_t entry back to the memory @@ -192,17 +197,18 @@ void bli_cntl_free_wo_thrinfo // allocated. if ( bli_mem_is_alloc( cntl_pack_mem ) ) { - bli_membrk_release( cntl_pack_mem ); + bli_membrk_release( rntm, cntl_pack_mem ); } // Free the current node. - bli_cntl_free_node( cntl ); + bli_cntl_free_node( rntm, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy ( + rntm_t* rntm, cntl_t* cntl ) { @@ -212,6 +218,7 @@ cntl_t* bli_cntl_copy // field. 
cntl_t* cntl_copy = bli_cntl_create_node ( + rntm, bli_cntl_family( cntl ), bli_cntl_bszid( cntl ), bli_cntl_var_func( cntl ), @@ -227,7 +234,7 @@ cntl_t* bli_cntl_copy // struct. uint64_t params_size = bli_cntl_params_size( cntl ); void* params_orig = bli_cntl_params( cntl ); - void* params_copy = bli_malloc_intl( ( size_t )params_size ); + void* params_copy = bli_sba_acquire( rntm, ( size_t )params_size ); // Copy the original params struct to the new memory region. memcpy( params_copy, params_orig, params_size ); @@ -242,6 +249,7 @@ cntl_t* bli_cntl_copy { cntl_t* sub_node_copy = bli_cntl_copy ( + rntm, bli_cntl_sub_node( cntl ) ); diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 14b97b525..5db5b62bb 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -60,6 +60,7 @@ typedef struct cntl_s cntl_t; cntl_t* bli_cntl_create_node ( + rntm_t* rntm, opid_t family, bszid_t bszid, void* var_func, @@ -69,6 +70,7 @@ cntl_t* bli_cntl_create_node void bli_cntl_free_node ( + rntm_t* rntm, cntl_t* cntl ); @@ -81,23 +83,27 @@ void bli_cntl_clear_node void bli_cntl_free ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ); void bli_cntl_free_w_thrinfo ( - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, thrinfo_t* thread ); void bli_cntl_free_wo_thrinfo ( - cntl_t* cntl + rntm_t* rntm, + cntl_t* cntl ); cntl_t* bli_cntl_copy ( + rntm_t* rntm, cntl_t* cntl ); diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 2a036b183..ccc415639 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -91,10 +91,32 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) // Allocate some temporary local arrays. 
+ + + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -283,10 +305,30 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) } // Free the temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif bli_free_intl( blkszs ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif bli_free_intl( bszids ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif bli_free_intl( bmults ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif bli_free_intl( dsclrs ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_blkszs(): " ); + #endif bli_free_intl( msclrs ); } @@ -323,8 +365,20 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) if ( method == BLIS_NAT ) return; // Allocate some temporary local arrays. 
+ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_ind_blkszs(): " ); + #endif bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_ind_blkszs(): " ); + #endif dsclrs = bli_malloc_intl( n_bs * sizeof( double ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_ind_blkszs(): " ); + #endif msclrs = bli_malloc_intl( n_bs * sizeof( double ) ); // -- Begin variable argument section -- @@ -444,8 +498,20 @@ void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... ) } // Free the temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_ind_blkszs(): " ); + #endif bli_free_intl( bszids ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_ind_blkszs(): " ); + #endif bli_free_intl( dsclrs ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_ind_blkszs(): " ); + #endif bli_free_intl( msclrs ); } @@ -476,9 +542,25 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) dim_t i; // Allocate some temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif bool_t* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool_t ) ); // -- Begin variable argument section -- @@ -566,9 +648,24 @@ void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) } // Free the temporary local arrays. 
+ #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif bli_free_intl( ukr_ids ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif bli_free_intl( ukr_dts ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif bli_free_intl( ukr_fps ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l3_nat_ukrs(): " ); + #endif bli_free_intl( ukr_prefs ); } @@ -599,8 +696,20 @@ void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) dim_t i; // Allocate some temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1f_kers(): " ); + #endif l1fkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1f_kers(): " ); + #endif num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1f_kers(): " ); + #endif void** ker_fps = bli_malloc_intl( n_kers * sizeof( void* ) ); // -- Begin variable argument section -- @@ -661,8 +770,20 @@ void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) } // Free the temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1f_kers(): " ); + #endif bli_free_intl( ker_ids ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1f_kers(): " ); + #endif bli_free_intl( ker_dts ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1f_kers(): " ); + #endif bli_free_intl( ker_fps ); } @@ -693,8 +814,20 @@ void bli_cntx_set_l1v_kers( dim_t n_kers, ... ) dim_t i; // Allocate some temporary local arrays. 
+ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1v_kers(): " ); + #endif l1vkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1v_kers(): " ); + #endif num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1v_kers(): " ); + #endif void** ker_fps = bli_malloc_intl( n_kers * sizeof( void* ) ); // -- Begin variable argument section -- @@ -755,8 +888,20 @@ void bli_cntx_set_l1v_kers( dim_t n_kers, ... ) } // Free the temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1v_kers(): " ); + #endif bli_free_intl( ker_ids ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1v_kers(): " ); + #endif bli_free_intl( ker_dts ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_l1v_kers(): " ); + #endif bli_free_intl( ker_fps ); } @@ -787,8 +932,20 @@ void bli_cntx_set_packm_kers( dim_t n_kers, ... ) dim_t i; // Allocate some temporary local arrays. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_packm_kers(): " ); + #endif l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_packm_kers(): " ); + #endif num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ) ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_packm_kers(): " ); + #endif void** ker_fps = bli_malloc_intl( n_kers * sizeof( void* ) ); // -- Begin variable argument section -- @@ -849,8 +1006,20 @@ void bli_cntx_set_packm_kers( dim_t n_kers, ... ) } // Free the temporary local arrays. 
+ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_packm_kers(): " ); + #endif bli_free_intl( ker_ids ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_packm_kers(): " ); + #endif bli_free_intl( ker_dts ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_cntx_set_packm_kers(): " ); + #endif bli_free_intl( ker_fps ); } diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index df7a33a61..450c753b6 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -60,7 +60,6 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; - membrk_t* membrk; } cntx_t; */ @@ -122,10 +121,6 @@ static pack_t bli_cntx_schema_c_panel( cntx_t* cntx ) { return cntx->schema_c_panel; } -static membrk_t* bli_cntx_get_membrk( cntx_t* cntx ) -{ - return cntx->membrk; -} // ----------------------------------------------------------------------------- @@ -154,10 +149,6 @@ static void bli_cntx_set_schema_ab_blockpanel( pack_t sa, pack_t sb, cntx_t* cnt bli_cntx_set_schema_a_block( sa, cntx ); bli_cntx_set_schema_b_panel( sb, cntx ); } -static void bli_cntx_set_membrk( membrk_t* membrk, cntx_t* cntx ) -{ - cntx->membrk = membrk; -} // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index c5f45a9b8..24a84b4a1 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -150,6 +150,9 @@ void bli_error_init_msgs( void ) sprintf( bli_error_string_for_code(BLIS_EXPECTED_NONNULL_OBJECT_BUFFER), "Encountered object with non-zero dimensions containing null buffer." ); + sprintf( bli_error_string_for_code(BLIS_MALLOC_RETURNED_NULL), + "malloc() returned NULL; heap memory is likely exhausted." ); + sprintf( bli_error_string_for_code(BLIS_INVALID_PACKBUF), "Invalid packbuf_t value." 
); sprintf( bli_error_string_for_code(BLIS_EXHAUSTED_CONTIG_MEMORY_POOL), diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index db2246289..4b8af449e 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -196,10 +196,18 @@ void bli_gks_finalize( void ) // If the current context was allocated, free it. if ( gks_id_ind != NULL ) { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_gks_finalize(): cntx for ind_t %d: ", ( int )ind ); + #endif + bli_free_intl( gks_id_ind ); } } + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_gks_finalize(): gks for arch_t %d: ", ( int )id ); + #endif + // Free the array of BLIS_NUM_IND_METHODS cntx* elements. bli_free_intl( gks_id ); } @@ -320,6 +328,10 @@ void bli_gks_register_cntx // to register with an architecture id that has already been registered. if ( gks[ id ] != NULL ) return; + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_gks_register_cntx(): " ); + #endif + // At this point, we know the pointer to the array of cntx_t* is NULL and // needs to be allocated. Allocate the memory and initialize it to // zeros/NULL, storing the address of the alloacted memory at the element @@ -329,6 +341,10 @@ void bli_gks_register_cntx // Alias the allocated array for readability. cntx_t** restrict gks_id = gks[ id ]; + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_gks_register_cntx(): " ); + #endif + // Allocate memory for a single context and store the address at // the element in the gks[ id ] array that is reserved for native // execution. 
diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index bf4b24f24..76844ec23 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -87,9 +87,17 @@ gint_t bli_info_get_enable_cblas( void ) #endif } gint_t bli_info_get_blas_int_type_size( void ) { return BLIS_BLAS_INT_TYPE_SIZE; } -gint_t bli_info_get_enable_packbuf_pools( void ) +gint_t bli_info_get_enable_pba_pools( void ) { -#ifdef BLIS_ENABLE_PACKBUF_POOLS +#ifdef BLIS_ENABLE_PBA_POOLS + return 1; +#else + return 0; +#endif +} +gint_t bli_info_get_enable_sba_pools( void ) +{ +#ifdef BLIS_ENABLE_SBA_POOLS return 1; #else return 0; diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 3bbc91ec4..51b589aa5 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -58,7 +58,8 @@ gint_t bli_info_get_enable_stay_auto_init( void ); gint_t bli_info_get_enable_blas( void ); gint_t bli_info_get_enable_cblas( void ); gint_t bli_info_get_blas_int_type_size( void ); -gint_t bli_info_get_enable_packbuf_pools( void ); +gint_t bli_info_get_enable_pba_pools( void ); +gint_t bli_info_get_enable_sba_pools( void ); gint_t bli_info_get_enable_threading( void ); gint_t bli_info_get_enable_openmp( void ); gint_t bli_info_get_enable_pthreads( void ); diff --git a/frame/base/bli_malloc.c b/frame/base/bli_malloc.c index 0cf500821..4ce5926a2 100644 --- a/frame/base/bli_malloc.c +++ b/frame/base/bli_malloc.c @@ -35,7 +35,7 @@ #include "blis.h" -//#define ENABLE_MEM_DEBUG +//#define BLIS_ENABLE_MEM_TRACING // ----------------------------------------------------------------------------- @@ -44,19 +44,22 @@ void* bli_malloc_pool( size_t size ) const malloc_ft malloc_fp = BLIS_MALLOC_POOL; const size_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; -#ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_malloc_pool(): size %ld, align size %ld\n", ( long )size, ( long )align_size ); -#endif + fflush( stdout ); + #endif return bli_fmalloc_align( malloc_fp, size, align_size ); } void bli_free_pool( 
void* p ) { -#ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_free_pool(): freeing block\n" ); -#endif + fflush( stdout ); + #endif + bli_ffree_align( BLIS_FREE_POOL, p ); } @@ -67,19 +70,22 @@ void* bli_malloc_user( size_t size ) const malloc_ft malloc_fp = BLIS_MALLOC_USER; const size_t align_size = BLIS_HEAP_ADDR_ALIGN_SIZE; -#ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_malloc_user(): size %ld, align size %ld\n", ( long )size, ( long )align_size ); -#endif + fflush( stdout ); + #endif return bli_fmalloc_align( malloc_fp, size, align_size ); } void bli_free_user( void* p ) { -#ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_free_user(): freeing block\n" ); -#endif + fflush( stdout ); + #endif + bli_ffree_align( BLIS_FREE_USER, p ); } @@ -89,21 +95,19 @@ void* bli_malloc_intl( size_t size ) { const malloc_ft malloc_fp = BLIS_MALLOC_INTL; -#ifdef ENABLE_MEM_DEBUG - printf( "bli_malloc_intl(): size %ld\n", - ( long )size ); -#endif + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_malloc_intl(): size %ld\n", ( long )size ); + fflush( stdout ); + #endif return bli_fmalloc_noalign( malloc_fp, size ); } void* bli_calloc_intl( size_t size ) { -#ifdef ENABLE_MEM_DEBUG -// printf( "bli_calloc_intl(): allocating block (size %ld)\n", -// ( long )size ); - printf( "calloc: " ); -#endif + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_calloc_intl(): " ); + #endif void* p = bli_malloc_intl( size ); @@ -114,9 +118,11 @@ void* bli_calloc_intl( size_t size ) void bli_free_intl( void* p ) { -#ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_free_intl(): freeing block\n" ); -#endif + fflush( stdout ); + #endif + bli_ffree_noalign( BLIS_FREE_INTL, p ); } diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index bd6733835..5f56f98c0 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -60,11 +60,6 @@ static pool_t* bli_mem_pool( mem_t* mem ) return mem->pool; } -static membrk_t* 
bli_mem_membrk( mem_t* mem ) -{ - return mem->membrk; -} - static siz_t bli_mem_size( mem_t* mem ) { return mem->size; @@ -105,11 +100,6 @@ static void bli_mem_set_pool( pool_t* pool, mem_t* mem ) mem->pool = pool; } -static void bli_mem_set_membrk( membrk_t* membrk, mem_t* mem ) -{ - mem->membrk = membrk; -} - static void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; @@ -120,7 +110,6 @@ static void bli_mem_clear( mem_t* mem ) bli_mem_set_buffer( NULL, mem ); bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); - bli_mem_set_membrk( NULL, mem ); } diff --git a/frame/base/bli_membrk.c b/frame/base/bli_membrk.c index 6293d5ca3..b1290c0eb 100644 --- a/frame/base/bli_membrk.c +++ b/frame/base/bli_membrk.c @@ -36,12 +36,22 @@ #include "blis.h" +static membrk_t global_membrk; + +// ----------------------------------------------------------------------------- + +membrk_t* bli_membrk_query( void ) +{ + return &global_membrk; +} + void bli_membrk_init ( - cntx_t* cntx, - membrk_t* membrk + cntx_t* restrict cntx ) { + membrk_t* restrict membrk = bli_membrk_query(); + const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; malloc_ft malloc_fp = BLIS_MALLOC_POOL; free_ft free_fp = BLIS_FREE_POOL; @@ -52,20 +62,22 @@ void bli_membrk_init bli_membrk_set_free_fp( free_fp, membrk ); bli_membrk_init_mutex( membrk ); -#ifdef BLIS_ENABLE_PACKBUF_POOLS +#ifdef BLIS_ENABLE_PBA_POOLS bli_membrk_init_pools( cntx, membrk ); #endif } void bli_membrk_finalize ( - membrk_t* membrk + void ) { + membrk_t* restrict membrk = bli_membrk_query(); + bli_membrk_set_malloc_fp( NULL, membrk ); bli_membrk_set_free_fp( NULL, membrk ); -#ifdef BLIS_ENABLE_PACKBUF_POOLS +#ifdef BLIS_ENABLE_PBA_POOLS bli_membrk_finalize_pools( membrk ); #endif bli_membrk_finalize_mutex( membrk ); @@ -73,7 +85,7 @@ void bli_membrk_finalize void bli_membrk_acquire_m ( - membrk_t* membrk, + rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem @@ -82,17 +94,22 @@ void bli_membrk_acquire_m pool_t* 
pool; pblk_t* pblk; dim_t pi; - siz_t block_size; - // If the internal memory pools for pack buffers are disabled, we - // spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the + // If the internal memory pools for packing block allocator are disabled, + // we spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the // immediate usage of bli_membrk_malloc(). -#ifndef BLIS_ENABLE_PACKBUF_POOLS +#ifndef BLIS_ENABLE_PBA_POOLS buf_type = BLIS_BUFFER_FOR_GEN_USE; + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_membrk_acquire_m(): bli_malloc_pool(): size %ld\n", + ( long )req_size ); + #endif #endif - // Make sure the API is initialized. - //assert( membrk ); //?? + // Query the memory broker from the runtime. + membrk_t* membrk = bli_rntm_membrk( rntm ); + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { @@ -114,7 +131,6 @@ void bli_membrk_acquire_m bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_pool( NULL, mem ); bli_mem_set_size( req_size, mem ); - bli_mem_set_membrk( membrk, mem ); } else { @@ -146,53 +162,54 @@ void bli_membrk_acquire_m // the struct's pblk_t field. bli_pool_checkout_block( req_size, pblk, pool ); - // Query the size of the blocks in the pool so we can store it in - // the mem_t object. At this point, it is guaranteed to be at - // least as large as req_size. (NOTE: We must perform the query - // within the critical section to ensure that the pool hasn't - // changed.) - block_size = bli_pool_block_size( pool ); - } // END CRITICAL SECTION // Release the mutex associated with the membrk object. bli_membrk_unlock( membrk ); + // Query the block_size from the pblk_t. This will be at least + // req_size, perhaps larger. 
+ siz_t block_size = bli_pblk_block_size( pblk ); + // Initialize the mem_t object with: // - the buffer type (a packbuf_t value), // - the address of the memory pool to which it belongs, // - the size of the contiguous memory block (NOT the size of the // requested region), // - the membrk_t from which the mem_t entry was acquired. - // The actual addresses (system and aligned) are already stored in - // the mem_t struct's pblk_t field + // The actual (aligned) address is already stored in the mem_t + // struct's pblk_t field. bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_pool( pool, mem ); bli_mem_set_size( block_size, mem ); - bli_mem_set_membrk( membrk, mem ); } } void bli_membrk_release ( - mem_t* mem + rntm_t* rntm, + mem_t* mem ) { packbuf_t buf_type; pool_t* pool; pblk_t* pblk; - siz_t block_size_cur; - siz_t block_size_prev; - membrk_t* membrk; - // Extract the membrk_t address from the mem_t object. - membrk = bli_mem_membrk( mem ); + // Query the memory broker from the runtime. + membrk_t* membrk = bli_rntm_membrk( rntm ); // Extract the buffer type so we know what kind of memory was allocated. buf_type = bli_mem_buf_type( mem ); +#ifndef BLIS_ENABLE_PBA_POOLS + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_membrk_release(): bli_free_pool(): size %ld\n", + ( long )bli_mem_size( mem ) ); + #endif +#endif + if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { free_ft free_fp = bli_membrk_free_fp( membrk ); @@ -211,37 +228,14 @@ void bli_membrk_release // Extract the address of the pblk_t struct within the mem_t struct. pblk = bli_mem_pblk( mem ); - // Query the size of the blocks that were in the pool at the time - // the pblk_t was checked out. (This is used below, in the critical - // section.) - block_size_prev = bli_mem_size( mem ); - // Acquire the mutex associated with the membrk object. bli_membrk_lock( membrk ); // BEGIN CRITICAL SECTION { - // Query the size of the blocks currently in the pool. 
- block_size_cur = bli_pool_block_size( pool ); - - // If the block size of the pool has changed since the pblk_t - // was checked out, then we need to free the pblk_t rather - // than check it back in. Why? Because the pool's block size - // has (most likely) increased to meet changing needs (example: - // larger cache blocksizes). Thus, the current pblk_t's smaller - // allocated size is of no use anymore. - if ( block_size_cur != block_size_prev ) - { - // Free the pblk_t using the appropriate function in the - // pool API. - bli_pool_free_block( pblk, pool ); - } - else - { - // Check the block back into the pool. - bli_pool_checkin_block( pblk, pool ); - } + // Check the block back into the pool. + bli_pool_checkin_block( pblk, pool ); } // END CRITICAL SECTION @@ -261,6 +255,7 @@ void bli_membrk_release } +#if 0 void bli_membrk_acquire_v ( membrk_t* membrk, @@ -273,6 +268,18 @@ void bli_membrk_acquire_v BLIS_BUFFER_FOR_GEN_USE, mem ); } +#endif + + +void bli_membrk_rntm_set_membrk + ( + rntm_t* rntm + ) +{ + membrk_t* membrk = bli_membrk_query(); + + bli_rntm_set_membrk( membrk, rntm ); +} siz_t bli_membrk_pool_size diff --git a/frame/base/bli_membrk.h b/frame/base/bli_membrk.h index 2b24a93a3..4d00eae63 100644 --- a/frame/base/bli_membrk.h +++ b/frame/base/bli_membrk.h @@ -102,34 +102,34 @@ static void bli_membrk_unlock( membrk_t* membrk ) // ----------------------------------------------------------------------------- +membrk_t* bli_membrk_query( void ); + void bli_membrk_init ( - cntx_t* cntx, - membrk_t* membrk + cntx_t* cntx ); void bli_membrk_finalize ( - membrk_t* membrk + void ); void bli_membrk_acquire_m ( - membrk_t* membrk, + rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); -void bli_membrk_acquire_v - ( - membrk_t* membrk, - siz_t req_size, - mem_t* mem - ); - void bli_membrk_release ( - mem_t* mem + rntm_t* rntm, + mem_t* mem + ); + +void bli_membrk_rntm_set_membrk + ( + rntm_t* rntm ); siz_t bli_membrk_pool_size diff --git 
a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index 0c9bad4e8..773ea7bbd 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -35,35 +35,29 @@ #include "blis.h" -static membrk_t global_membrk; - -// ----------------------------------------------------------------------------- - -membrk_t* bli_memsys_global_membrk( void ) -{ - return &global_membrk; -} - -// ----------------------------------------------------------------------------- - void bli_memsys_init( void ) { // Query a native context so we have something to pass into // bli_membrk_init_pools(). We use BLIS_DOUBLE for the datatype, // but the dt argument is actually only used when initializing // contexts for induced methods. - // NOTE: Instead of calling bli_gks_query_cntx(), we call // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). cntx_t* cntx_p = bli_gks_query_cntx_noinit(); - // Initialize the global membrk_t object and its memory pools. - bli_membrk_init( cntx_p, &global_membrk ); + // Initialize the packing block allocator and its data structures. + bli_membrk_init( cntx_p ); + + // Initialize the small block allocator and its data structures. + bli_sba_init(); } void bli_memsys_finalize( void ) { - // Finalize the global membrk_t object and its memory pools. - bli_membrk_finalize( &global_membrk ); + // Finalize the small block allocator and its data structures. + bli_sba_finalize(); + + // Finalize the global membrk_t object and its data structures. 
+ bli_membrk_finalize(); } diff --git a/frame/base/bli_memsys.h b/frame/base/bli_memsys.h index f82a7bf91..4b19523ef 100644 --- a/frame/base/bli_memsys.h +++ b/frame/base/bli_memsys.h @@ -38,10 +38,6 @@ // ----------------------------------------------------------------------------- -membrk_t* bli_memsys_global_membrk( void ); - -// ----------------------------------------------------------------------------- - void bli_memsys_init( void ); void bli_memsys_finalize( void ); diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 049bc7344..c0d4a8132 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -48,6 +48,10 @@ void bli_obj_create bli_obj_create_without_buffer( dt, m, n, obj ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_obj_create(): " ); + #endif + bli_obj_alloc_buffer( rs, cs, 1, obj ); } @@ -232,6 +236,10 @@ void bli_obj_create_1x1 { bli_obj_create_without_buffer( dt, 1, 1, obj ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_obj_create_1x1(): " ); + #endif + bli_obj_alloc_buffer( 1, 1, 1, obj ); } @@ -277,7 +285,13 @@ void bli_obj_free // is a detached scalar (ie: if the buffer pointer refers to the // address of the internal scalar buffer). 
if ( bli_obj_buffer( obj ) != bli_obj_internal_scalar_buffer( obj ) ) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_obj_free(): " ); + #endif + bli_free_user( bli_obj_buffer( obj ) ); + } } } diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 4ed37a422..276169bfe 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -35,50 +35,68 @@ #include "blis.h" -//#define ENABLE_MEM_DEBUG +//#define BLIS_ENABLE_MEM_TRACING void bli_pool_init ( - dim_t num_blocks, - dim_t block_ptrs_len, - siz_t block_size, - siz_t align_size, - malloc_ft malloc_fp, - free_ft free_fp, - pool_t* pool + siz_t num_blocks, + siz_t block_ptrs_len, + siz_t block_size, + siz_t align_size, + malloc_ft malloc_fp, + free_ft free_fp, + pool_t* restrict pool ) { - pblk_t* block_ptrs; - dim_t i; + // Make sure that block_ptrs_len is at least num_blocks. + block_ptrs_len = bli_max( block_ptrs_len, num_blocks ); - // Make sure that num_block_ptrs is at least num_blocks. - if ( block_ptrs_len < num_blocks ) block_ptrs_len = num_blocks; - - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_init(): allocating block_ptrs array of size %ld\n", - ( long )( block_ptrs_len * sizeof( pblk_t ) ) ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_init(): allocating block_ptrs (length %d): ", + ( int )block_ptrs_len ); #endif // Allocate the block_ptrs array. - block_ptrs = bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ) ); + // FGVZ: Do we want to call malloc_fp() for internal data structures as + // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. + pblk_t* restrict block_ptrs + = + bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ) ); // Allocate and initialize each entry in the block_ptrs array. 
- for ( i = 0; i < num_blocks; ++i ) + for ( dim_t i = 0; i < num_blocks; ++i ) { - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_init(): allocating block %d of size %ld (align %ld)\n", - ( int )i, ( long )block_size, ( long )align_size ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_init(): allocating block %d of size %d (align %d).\n", + ( int )i, ( int )block_size, ( int )align_size ); + fflush( stdout ); #endif - bli_pool_alloc_block( block_size, align_size, - &(block_ptrs[i]), pool ); + bli_pool_alloc_block + ( + block_size, + align_size, + malloc_fp, + &(block_ptrs[i]) + ); } + // NOTE: The semantics of top_index approximate a stack, where a "full" + // stack (no blocks checked out) is one where top_index == 0 and an empty + // stack (all blocks checked out) one where top_index == num_blocks. + // (Here, num_blocks tracks the number of blocks currently allocated as + // part of the pool.) This "orientation" of the stack was chosen + // intentionally, in contrast to one where top_index == -1 means the + // stack is empty and top_index = num_blocks - 1 means the stack is + // full. The chosen scheme allows one to conceptualize the stack as a + // number line in which blocks are checked out from lowest to highest, + // and additional blocks are added at the higher end. + // Initialize the pool_t structure. 
bli_pool_set_block_ptrs( block_ptrs, pool ); bli_pool_set_block_ptrs_len( block_ptrs_len, pool ); - bli_pool_set_num_blocks( num_blocks, pool ); bli_pool_set_top_index( 0, pool ); + bli_pool_set_num_blocks( num_blocks, pool ); bli_pool_set_block_size( block_size, pool ); bli_pool_set_align_size( align_size, pool ); bli_pool_set_malloc_fp( malloc_fp, pool ); @@ -87,43 +105,49 @@ void bli_pool_init void bli_pool_finalize ( - pool_t* pool + pool_t* restrict pool ) { - pblk_t* block_ptrs; - dim_t num_blocks; - dim_t top_index; - dim_t i; - // NOTE: This implementation assumes that either: // - all blocks have been checked in by all threads, or // - some subset of blocks have been checked in and the caller // is bli_pool_reinit(). - // Query the current block_ptrs array. - block_ptrs = bli_pool_block_ptrs( pool ); + // Query the block_ptrs array. + pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); - // Query the total number of blocks presently allocated. - num_blocks = bli_pool_num_blocks( pool ); + // Query the total number of blocks currently allocated. + const siz_t num_blocks = bli_pool_num_blocks( pool ); // Query the top_index of the pool. - top_index = bli_pool_top_index( pool ); + const siz_t top_index = bli_pool_top_index( pool ); + + // Sanity check: The top_index should be zero. + if ( top_index != 0 ) bli_abort(); + + // Query the free() function pointer for the pool. + free_ft free_fp = bli_pool_free_fp( pool ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_finalize(): freeing %d blocks of size %d (align %d).\n", + ( int )num_blocks, ( int )bli_pool_block_size( pool ), + ( int )bli_pool_align_size( pool ) ); + fflush( stdout ); + #endif // Free the individual blocks currently in the pool. 
- for ( i = top_index; i < num_blocks; ++i ) + for ( dim_t i = 0; i < num_blocks; ++i ) { - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_finalize(): freeing block %d of size %ld (align %ld)\n", - ( int )i, ( long )bli_pool_block_size( pool ), - ( long )bli_pool_align_size( pool ) ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_finalize(): block %d: ", ( int )i ); #endif - bli_pool_free_block( &(block_ptrs[i]), pool ); + bli_pool_free_block( free_fp, &(block_ptrs[i]) ); } - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_finalize(): freeing block_ptrs array of size %ld\n", - ( long )( bli_pool_block_ptrs_len( pool ) * sizeof( pblk_t ) ) ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_finalize(): freeing block_ptrs (length %d): ", + ( int )( bli_pool_block_ptrs_len( pool ) ) ); #endif // Free the block_ptrs array. @@ -144,11 +168,11 @@ void bli_pool_finalize void bli_pool_reinit ( - dim_t num_blocks_new, - dim_t block_ptrs_len_new, - siz_t block_size_new, - siz_t align_size_new, - pool_t* pool + siz_t num_blocks_new, + siz_t block_ptrs_len_new, + siz_t block_size_new, + siz_t align_size_new, + pool_t* restrict pool ) { // Preserve the pointers to malloc() and free() provided when the pool @@ -158,60 +182,66 @@ void bli_pool_reinit // Finalize the pool as it is currently configured. If some blocks // are still checked out to threads, those blocks are not freed - // here, and instead will be freed when the threads are ready to - // release the blocks. (This will happen because the threads will - // notice that the block size of the pool has changed.) + // here, and instead will be freed when the threads attempt to check + // those blocks back into the pool. (This condition can be detected + // since the block size is encoded into each pblk, which is copied + // upon checkout.) bli_pool_finalize( pool ); // Reinitialize the pool with the new parameters, in particular, // the new block size. 
- bli_pool_init( num_blocks_new, - block_ptrs_len_new, - block_size_new, - align_size_new, - malloc_fp, - free_fp, - pool ); + bli_pool_init + ( + num_blocks_new, + block_ptrs_len_new, + block_size_new, + align_size_new, + malloc_fp, + free_fp, + pool + ); } void bli_pool_checkout_block ( - siz_t req_size, - pblk_t* block, - pool_t* pool + siz_t req_size, + pblk_t* restrict block, + pool_t* restrict pool ) { - pblk_t* block_ptrs; - dim_t top_index; - + // If the requested block size is larger than what the pool was + // initialized with, reinitialize the pool to contain blocks of the + // requested size. if ( bli_pool_block_size( pool ) < req_size ) { - const dim_t num_blocks_new = bli_pool_num_blocks( pool ); - const dim_t block_ptrs_len_new = bli_pool_block_ptrs_len( pool ); + const siz_t num_blocks_new = bli_pool_num_blocks( pool ); + const siz_t block_ptrs_len_new = bli_pool_block_ptrs_len( pool ); const siz_t align_size_new = bli_pool_align_size( pool ); - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_checkout_block(): old block size %ld < req size %ld; " - "reiniting", - ( long )bli_pool_block_size( pool ), ( long )req_size ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_checkout_block(): old block size %d < req size %d; " + "reiniting.\n", + ( int )bli_pool_block_size( pool ), ( int )req_size ); + fflush( stdout ); #endif - // If the requested block size is smaller than what the pool - // was initialized with, reinitialize the pool to contain blocks - // of the requested size. - bli_pool_reinit( num_blocks_new, - block_ptrs_len_new, - req_size, - align_size_new, - pool ); + bli_pool_reinit + ( + num_blocks_new, + block_ptrs_len_new, + req_size, + align_size_new, + pool + ); } // If the pool is exhausted, add a block.
if ( bli_pool_is_exhausted( pool ) ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkout_block(): pool is exhausted (block size %d); " "growing by 1.\n", ( int )bli_pool_block_size( pool ) ); + fflush( stdout ); #endif bli_pool_grow( 1, pool ); @@ -219,21 +249,22 @@ void bli_pool_checkout_block // At this point, at least one block is guaranteed to be available. - // Query the current block_ptrs array. - block_ptrs = bli_pool_block_ptrs( pool ); + // Query the block_ptrs array. + pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. - top_index = bli_pool_top_index( pool ); + const siz_t top_index = bli_pool_top_index( pool ); - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_checkout_block(): checking out block %d of size %ld (align %ld)\n", - ( int )top_index, ( long )bli_pool_block_size( pool ), - ( long )bli_pool_align_size( pool ) ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_checkout_block(): checking out block %d of size %d " + "(align %d).\n", + ( int )top_index, ( int )bli_pool_block_size( pool ), + ( int )bli_pool_align_size( pool ) ); + fflush( stdout ); #endif - // Copy the block at top_index to the caller's pblk_t struct. - //bli_pblk_copy( *(block_ptrs[top_index]), *block ); - *block = block_ptrs[top_index]; + // Copy the pblk_t at top_index to the caller's pblk_t struct. + *block = block_ptrs[ top_index ]; // Notice that we don't actually need to clear the contents of // block_ptrs[top_index]. It will get overwritten eventually when @@ -246,28 +277,39 @@ void bli_pool_checkout_block void bli_pool_checkin_block ( - pblk_t* block, - pool_t* pool + pblk_t* restrict block, + pool_t* restrict pool ) { - pblk_t* block_ptrs; - dim_t top_index; + // If the pblk_t being checked in was allocated with a different block + // size than is currently in use in the pool, we simply free it and + // return. 
These "orphaned" blocks are no longer of use because the pool + // has since been reinitialized to a different (larger) block size. + if ( bli_pblk_block_size( block ) != bli_pool_block_size( pool ) ) + { + // Query the free() function pointer for the pool. + free_ft free_fp = bli_pool_free_fp( pool ); - // Query the current block_ptrs array. - block_ptrs = bli_pool_block_ptrs( pool ); + bli_pool_free_block( free_fp, block ); + return; + } + + // Query the block_ptrs array. + pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. - top_index = bli_pool_top_index( pool ); + const siz_t top_index = bli_pool_top_index( pool ); - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_checkin_block(): checking in block %d of size %ld (align %ld)\n", - ( int )top_index - 1, ( long )bli_pool_block_size( pool ), - ( long )bli_pool_align_size( pool ) ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_checkin_block(): checking in block %d of size %d " + "(align %d).\n", + ( int )top_index - 1, ( int )bli_pool_block_size( pool ), + ( int )bli_pool_align_size( pool ) ); + fflush( stdout ); #endif // Copy the caller's pblk_t struct to the block at top_index - 1. - //bli_pblk_copy( *(block_ptrs[top_index-1]), *block ); - block_ptrs[top_index-1] = *block; + block_ptrs[ top_index - 1 ] = *block; // Decrement the pool's top_index. bli_pool_set_top_index( top_index - 1, pool ); @@ -275,64 +317,60 @@ void bli_pool_checkin_block void bli_pool_grow ( - dim_t num_blocks_add, - pool_t* pool + siz_t num_blocks_add, + pool_t* restrict pool ) { - pblk_t* block_ptrs_cur; - dim_t block_ptrs_len_cur; - dim_t num_blocks_cur; - - pblk_t* block_ptrs_new; - dim_t num_blocks_new; - - siz_t block_size; - siz_t align_size; - dim_t top_index; - - dim_t i; - - // If the requested increase is zero (or negative), return early. - if ( num_blocks_add < 1 ) return; + // If the requested increase is zero, return early. 
+ if ( num_blocks_add == 0 ) return; // Query the allocated length of the block_ptrs array and also the - // total number of blocks allocated. - block_ptrs_len_cur = bli_pool_block_ptrs_len( pool ); - num_blocks_cur = bli_pool_num_blocks( pool ); + // total number of blocks currently allocated. + const siz_t block_ptrs_len_cur = bli_pool_block_ptrs_len( pool ); + const siz_t num_blocks_cur = bli_pool_num_blocks( pool ); // Compute the total number of allocated blocks that will exist // after we grow the pool. - num_blocks_new = num_blocks_cur + num_blocks_add; + const siz_t num_blocks_new = num_blocks_cur + num_blocks_add; - // If the new total number of allocated blocks is larger than the - // allocated length of the block_ptrs array, we need to allocate - // a new (larger) block_ptrs array. + // If adding num_blocks_add new blocks will exceed the current capacity + // of the block_ptrs array, we need to first put in place a new (larger) + // array. if ( block_ptrs_len_cur < num_blocks_new ) { - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_grow(): growing block_ptrs_len from %d to %d.\n", - ( int )block_ptrs_len_cur, ( int )num_blocks_new ); + // To prevent this from happening often, we double the current + // length of the block_ptrs array. + const siz_t block_ptrs_len_new = 2 * block_ptrs_len_cur; + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_grow(): growing block_ptrs_len (%d -> %d): ", + ( int )block_ptrs_len_cur, ( int )block_ptrs_len_new ); #endif // Query the current block_ptrs array. - block_ptrs_cur = bli_pool_block_ptrs( pool ); + pblk_t* restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); - // Allocate a new block_ptrs array of length num_blocks_new. - block_ptrs_new = bli_malloc_intl( num_blocks_new * sizeof( pblk_t ) ); + // Allocate a new block_ptrs array. + // FGVZ: Do we want to call malloc_fp() for internal data structures as + // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. 
+ pblk_t* restrict block_ptrs_new + = + bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ) ); // Query the top_index of the pool. - top_index = bli_pool_top_index( pool ); + const siz_t top_index = bli_pool_top_index( pool ); // Copy the contents of the old block_ptrs array to the new/resized // array. Notice that we can begin with top_index since all entries - // from 0 to top_index-1 have been checked out to threads. - for ( i = top_index; i < num_blocks_cur; ++i ) + // from 0 to top_index-1 have been (and are currently) checked out + // to threads. + for ( dim_t i = top_index; i < num_blocks_cur; ++i ) { block_ptrs_new[i] = block_ptrs_cur[i]; } - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_grow(): freeing previous block_ptrs array.\n" ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_grow(): freeing prev block_ptrs: " ); #endif // Free the old block_ptrs array. @@ -341,30 +379,39 @@ void bli_pool_grow // Update the pool_t struct with the new block_ptrs array and // record its allocated length. bli_pool_set_block_ptrs( block_ptrs_new, pool ); - bli_pool_set_block_ptrs_len( num_blocks_new, pool ); + bli_pool_set_block_ptrs_len( block_ptrs_len_new, pool ); } // At this point, we are guaranteed to have enough unused elements // in the block_ptrs array to accommodate an additional num_blocks_add // blocks. - // Query the current block_ptrs array (which was possibly just resized). - block_ptrs_cur = bli_pool_block_ptrs( pool ); + // Query the current block_ptrs array (which was maybe just resized). + pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the block size and alignment size of the pool. - block_size = bli_pool_block_size( pool ); - align_size = bli_pool_align_size( pool ); + const siz_t block_size = bli_pool_block_size( pool ); + const siz_t align_size = bli_pool_align_size( pool ); - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_grow(): growing pool from from %d to %d.\n", + // Query the malloc() function pointer for the pool.
+ malloc_ft malloc_fp = bli_pool_malloc_fp( pool ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_grow(): growing pool from (%d -> %d).\n", ( int )num_blocks_cur, ( int )num_blocks_new ); + fflush( stdout ); #endif // Allocate the requested additional blocks in the resized array. - for ( i = num_blocks_cur; i < num_blocks_new; ++i ) + for ( dim_t i = num_blocks_cur; i < num_blocks_new; ++i ) { - bli_pool_alloc_block( block_size, align_size, - &(block_ptrs_cur[i]), pool ); + bli_pool_alloc_block + ( + block_size, + align_size, + malloc_fp, + &(block_ptrs[i]) + ); } // Update the pool_t struct with the new number of allocated blocks. @@ -375,48 +422,41 @@ void bli_pool_grow void bli_pool_shrink ( - dim_t num_blocks_sub, - pool_t* pool + siz_t num_blocks_sub, + pool_t* restrict pool ) { - pblk_t* block_ptrs; - dim_t num_blocks; - dim_t num_blocks_avail; - dim_t num_blocks_new; + // If the requested decrease is zero, return early. + if ( num_blocks_sub == 0 ) return; - dim_t top_index; - - dim_t i; - - // Query the total number of blocks presently allocated. - num_blocks = bli_pool_num_blocks( pool ); + // Query the total number of blocks currently allocated. + const siz_t num_blocks = bli_pool_num_blocks( pool ); // Query the top_index of the pool. - top_index = bli_pool_top_index( pool ); + const siz_t top_index = bli_pool_top_index( pool ); // Compute the number of blocks available to be checked out // (and thus available for removal). - num_blocks_avail = num_blocks - top_index; + const siz_t num_blocks_avail = num_blocks - top_index; // If the requested decrease is more than the number of available - // blocks in the pool, only remove the number of blocks available. - if ( num_blocks_avail < num_blocks_sub ) - num_blocks_sub = num_blocks_avail; + // blocks in the pool, only remove the number of blocks actually + // available. 
+ num_blocks_sub = bli_min( num_blocks_sub, num_blocks_avail ); - // If the effective requested decrease is zero (or the requested - // decrease was negative), return early. - if ( num_blocks_sub < 1 ) return; - - // Query the current block_ptrs array. - block_ptrs = bli_pool_block_ptrs( pool ); + // Query the block_ptrs array. + pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Compute the new total number of blocks. - num_blocks_new = num_blocks - num_blocks_sub; + const siz_t num_blocks_new = num_blocks - num_blocks_sub; + + // Query the free() function pointer for the pool. + free_ft free_fp = bli_pool_free_fp( pool ); // Free the individual blocks. - for ( i = num_blocks_new; i < num_blocks; ++i ) + for ( dim_t i = num_blocks_new; i < num_blocks; ++i ) { - bli_pool_free_block( &(block_ptrs[i]), pool ); + bli_pool_free_block( free_fp, &(block_ptrs[i]) ); } // Update the pool_t struct. @@ -429,24 +469,24 @@ void bli_pool_shrink void bli_pool_alloc_block ( - siz_t block_size, - siz_t align_size, - pblk_t* block, - pool_t* pool + siz_t block_size, + siz_t align_size, + malloc_ft malloc_fp, + pblk_t* restrict block ) { - #ifdef ENABLE_MEM_DEBUG - //printf( "bli_pool_alloc_block(): allocating block of size %ld (align %ld)\n", - // ( long )block_size, ( long )align_size ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d)\n", + ( int )block_size, ( int )align_size ); + fflush( stdout ); #endif - // Query the malloc() function pointer from the pool. - malloc_ft malloc_fp = bli_pool_malloc_fp( pool ); - // Allocate the block via the bli_fmalloc_align() wrapper, which performs // alignment logic and opaquely saves the original pointer so that it can // be recovered when it's time to free the block. 
- void* buf = bli_fmalloc_align( malloc_fp, block_size, align_size ); + void* restrict buf + = + bli_fmalloc_align( malloc_fp, block_size, align_size ); #if 0 // NOTE: This code is disabled because it is not needed, since @@ -471,31 +511,26 @@ void bli_pool_alloc_block } #endif - //printf( "bli_pool_alloc_block(): bsize = %d; asize = %d\n", (int)block_size, (int)align_size ); - //printf( " sys = %p; align = %p\n", buf_sys, buf_align ); - // Save the results in the pblk_t structure. bli_pblk_set_buf( buf, block ); + bli_pblk_set_block_size( block_size, block ); } void bli_pool_free_block ( - pblk_t* block, - pool_t* pool + free_ft free_fp, + pblk_t* restrict block ) { - void* buf; - - #ifdef ENABLE_MEM_DEBUG - printf( "bli_pool_free_block(): freeing block.\n" ); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_pool_free_block(): calling ffree_align(): size %d.\n", + ( int )bli_pblk_block_size( block ) ); + fflush( stdout ); #endif - // Query the free() function pointer from the pool. - free_ft free_fp = bli_pool_free_fp( pool ); - // Extract the pblk_t buffer, which is the aligned address returned from // bli_fmalloc_align() when the block was allocated. 
- buf = bli_pblk_buf( block ); + void* restrict buf = bli_pblk_buf( block ); // Free the block via the bli_ffree_align() wrapper, which recovers the // original pointer that was returned by the pool's malloc() function when @@ -505,34 +540,34 @@ void bli_pool_free_block void bli_pool_print ( - pool_t* pool + pool_t* restrict pool ) { pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); - dim_t block_ptrs_len = bli_pool_block_ptrs_len( pool ); - dim_t top_index = bli_pool_top_index( pool ); - dim_t num_blocks = bli_pool_num_blocks( pool ); - dim_t block_size = bli_pool_block_size( pool ); - dim_t align_size = bli_pool_align_size( pool ); + siz_t block_ptrs_len = bli_pool_block_ptrs_len( pool ); + siz_t top_index = bli_pool_top_index( pool ); + siz_t num_blocks = bli_pool_num_blocks( pool ); + siz_t block_size = bli_pool_block_size( pool ); + siz_t align_size = bli_pool_align_size( pool ); dim_t i; printf( "pool struct ---------------\n" ); printf( " block_ptrs: %p\n", block_ptrs ); - printf( " block_ptrs_len: %ld\n", ( long )block_ptrs_len ); - printf( " top_index: %ld\n", ( long )top_index ); - printf( " num_blocks: %ld\n", ( long )num_blocks ); - printf( " block_size: %ld\n", ( long )block_size ); - printf( " align_size: %ld\n", ( long )align_size ); + printf( " block_ptrs_len: %d\n", ( int )block_ptrs_len ); + printf( " top_index: %d\n", ( int )top_index ); + printf( " num_blocks: %d\n", ( int )num_blocks ); + printf( " block_size: %d\n", ( int )block_size ); + printf( " align_size: %d\n", ( int )align_size ); printf( " pblks sys align\n" ); for ( i = 0; i < num_blocks; ++i ) { - printf( " %ld: %p\n", ( long )i, bli_pblk_buf( &block_ptrs[i] ) ); + printf( " %d: %p\n", ( int )i, bli_pblk_buf( &block_ptrs[i] ) ); } } void bli_pblk_print ( - pblk_t* pblk + pblk_t* restrict pblk ) { void* buf = bli_pblk_buf( pblk ); diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h index 2b882ab67..0d39fd7d3 100644 --- a/frame/base/bli_pool.h +++ b/frame/base/bli_pool.h @@ 
-41,7 +41,9 @@ /* typedef struct { - void* buf; + void* buf; + siz_t block_size; + } pblk_t; */ @@ -50,11 +52,11 @@ typedef struct /* typedef struct { - pblk_t* block_ptrs; - dim_t block_ptrs_len; + void* block_ptrs; + siz_t block_ptrs_len; - dim_t top_index; - dim_t num_blocks; + siz_t top_index; + siz_t num_blocks; siz_t block_size; siz_t align_size; @@ -73,6 +75,11 @@ static void* bli_pblk_buf( pblk_t* pblk ) return pblk->buf; } +static siz_t bli_pblk_block_size( pblk_t* pblk ) +{ + return pblk->block_size; +} + // Pool block modification static void bli_pblk_set_buf( void* buf, pblk_t* pblk ) @@ -80,25 +87,31 @@ static void bli_pblk_set_buf( void* buf, pblk_t* pblk ) pblk->buf = buf; } +static void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) +{ + pblk->block_size = block_size; +} + static void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); + bli_pblk_set_block_size( 0, pblk ); } // Pool entry query -static pblk_t* bli_pool_block_ptrs( pool_t* pool ) +static void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } -static dim_t bli_pool_block_ptrs_len( pool_t* pool ) +static siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } -static dim_t bli_pool_num_blocks( pool_t* pool ) +static siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } @@ -123,7 +136,7 @@ static free_ft bli_pool_free_fp( pool_t* pool ) return pool->free_fp; } -static dim_t bli_pool_top_index( pool_t* pool ) +static siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } @@ -136,17 +149,17 @@ static bool_t bli_pool_is_exhausted( pool_t* pool ) // Pool entry modification -static void bli_pool_set_block_ptrs( pblk_t* block_ptrs, pool_t* pool ) \ +static void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } -static void bli_pool_set_block_ptrs_len( dim_t block_ptrs_len, pool_t* pool ) \ +static void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, 
pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } -static void bli_pool_set_num_blocks( dim_t num_blocks, pool_t* pool ) \ +static void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } @@ -171,7 +184,7 @@ static void bli_pool_set_free_fp( free_ft free_fp, pool_t* pool ) \ pool->free_fp = free_fp; } -static void bli_pool_set_top_index( dim_t top_index, pool_t* pool ) \ +static void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } @@ -180,70 +193,70 @@ static void bli_pool_set_top_index( dim_t top_index, pool_t* pool ) \ void bli_pool_init ( - dim_t num_blocks, - dim_t block_ptrs_len, - siz_t block_size, - siz_t align_size, - malloc_ft malloc_fp, - free_ft free_fp, - pool_t* pool + siz_t num_blocks, + siz_t block_ptrs_len, + siz_t block_size, + siz_t align_size, + malloc_ft malloc_fp, + free_ft free_fp, + pool_t* restrict pool ); void bli_pool_finalize ( - pool_t* pool + pool_t* restrict pool ); void bli_pool_reinit ( - dim_t num_blocks_new, - dim_t block_ptrs_len_new, - siz_t block_size_new, - siz_t align_size_new, - pool_t* pool + siz_t num_blocks_new, + siz_t block_ptrs_len_new, + siz_t block_size_new, + siz_t align_size_new, + pool_t* restrict pool ); void bli_pool_checkout_block ( - siz_t req_size, - pblk_t* block, - pool_t* pool + siz_t req_size, + pblk_t* restrict block, + pool_t* restrict pool ); void bli_pool_checkin_block ( - pblk_t* block, - pool_t* pool + pblk_t* restrict block, + pool_t* restrict pool ); void bli_pool_grow ( - dim_t num_blocks_add, - pool_t* pool + siz_t num_blocks_add, + pool_t* restrict pool ); void bli_pool_shrink ( - dim_t num_blocks_sub, - pool_t* pool + siz_t num_blocks_sub, + pool_t* restrict pool ); void bli_pool_alloc_block ( - siz_t block_size, - siz_t align_size, - pblk_t* block, - pool_t* pool + siz_t block_size, + siz_t align_size, + malloc_ft malloc_fp, + pblk_t* restrict block ); void bli_pool_free_block ( - pblk_t* block, - 
pool_t* pool + free_ft free_fp, + pblk_t* restrict block ); void bli_pool_print ( - pool_t* pool + pool_t* restrict pool ); void bli_pblk_print ( - pblk_t* pblk + pblk_t* restrict pblk ); #endif diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index cde8aa72e..f33c25e36 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -49,7 +49,7 @@ typedef struct rntm_s */ // -// -- rntm_t query ------------------------------------------------------------- +// -- rntm_t query (public API) ------------------------------------------------ // static dim_t bli_rntm_num_threads( rntm_t* rntm ) @@ -87,6 +87,20 @@ static dim_t bli_rntm_pr_ways( rntm_t* rntm ) return bli_rntm_ways_for( BLIS_KR, rntm ); } +// +// -- rntm_t query (internal use only) ----------------------------------------- +// + +static pool_t* bli_rntm_sba_pool( rntm_t* rntm ) +{ + return rntm->sba_pool; +} + +static membrk_t* bli_rntm_membrk( rntm_t* rntm ) +{ + return rntm->membrk; +} + static dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool_t nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); @@ -151,6 +165,16 @@ static void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_ bli_rntm_set_pr_ways_only( 1, rntm ); } +static void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) +{ + rntm->sba_pool = sba_pool; +} + +static void bli_rntm_set_membrk( membrk_t* membrk, rntm_t* rntm ) +{ + rntm->membrk = membrk; +} + static void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); @@ -159,6 +183,10 @@ static void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } +static void bli_rntm_clear_sba_pool( rntm_t* rntm ) +{ + bli_rntm_set_sba_pool( NULL, rntm ); +} // // -- rntm_t modification (public API) ----------------------------------------- @@ -196,12 +224,15 @@ static void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, // will be in 
a good state upon return. #define BLIS_RNTM_INITIALIZER { .num_threads = -1, \ - .thrloop = { -1, -1, -1, -1, -1, -1 } } \ + .thrloop = { -1, -1, -1, -1, -1, -1 }, \ + .sba_pool = NULL } \ static void bli_rntm_init( rntm_t* rntm ) { bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); + + bli_rntm_clear_sba_pool( rntm ); } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c new file mode 100644 index 000000000..515482f75 --- /dev/null +++ b/frame/base/bli_sba.c @@ -0,0 +1,190 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// The small block allocator: an apool_t of array_t of pool_t. +static apool_t sba; + +apool_t* bli_sba_query( void ) +{ + return &sba; +} + +// ----------------------------------------------------------------------------- + +void bli_sba_init( void ) +{ + bli_apool_init( BLIS_MALLOC_INTL, BLIS_FREE_INTL, &sba ); +} + +void bli_sba_finalize( void ) +{ + bli_apool_finalize( &sba ); +} + +void* bli_sba_acquire + ( + rntm_t* restrict rntm, + siz_t req_size + ) +{ + void* block; + +#ifdef BLIS_ENABLE_SBA_POOLS + if ( rntm == NULL ) + { + block = bli_malloc_intl( req_size ); + } + else + { + pblk_t pblk; + + // Query the small block pool from the rntm. + pool_t* restrict pool = bli_rntm_sba_pool( rntm ); + + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. + if ( block_size < req_size ) + { + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); + } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. 
+ block = bli_pblk_buf( &pblk ); + } +#else + + block = bli_malloc_intl( req_size ); + +#endif + + // Return the address obtained from the pblk_t. + return block; +} + +void bli_sba_release + ( + rntm_t* restrict rntm, + void* restrict block + ) +{ +#ifdef BLIS_ENABLE_SBA_POOLS + if ( rntm == NULL ) + { + bli_free_intl( block ); + } + else + { + pblk_t pblk; + + // Query the small block pool from the rntm. + pool_t* restrict pool = bli_rntm_sba_pool( rntm ); + + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case it's ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. + bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) + bli_pool_checkin_block( &pblk, pool ); + } +#else + + bli_free_intl( block ); + +#endif +} + +array_t* bli_sba_checkout_array + ( + const siz_t n_threads + ) +{ + #ifndef BLIS_ENABLE_SBA_POOLS + return NULL; + #endif + + return bli_apool_checkout_array( n_threads, &sba ); +} + +void bli_sba_checkin_array + ( + array_t* restrict array + ) +{ + #ifndef BLIS_ENABLE_SBA_POOLS + return; + #endif + + bli_apool_checkin_array( array, &sba ); +} + +void bli_sba_rntm_set_pool + ( + siz_t index, + array_t* restrict array, + rntm_t* restrict rntm + ) +{ + #ifndef BLIS_ENABLE_SBA_POOLS + bli_rntm_set_sba_pool( NULL, rntm ); + return; + #endif + + // Query the pool_t* in the array_t corresponding to index.
+ pool_t* restrict pool = bli_apool_array_elem( index, array ); + + // Embed the pool_t* into the rntm_t. + bli_rntm_set_sba_pool( pool, rntm ); +} + + diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h new file mode 100644 index 000000000..cf10834e3 --- /dev/null +++ b/frame/base/bli_sba.h @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_SBA_H +#define BLIS_SBA_H + +apool_t* bli_sba_query( void ); + +// ----------------------------------------------------------------------------- + +void bli_sba_init( void ); +void bli_sba_finalize( void ); + +array_t* bli_sba_checkout_array + ( + const siz_t n_threads + ); + +void bli_sba_checkin_array + ( + array_t* restrict array + ); + +void bli_sba_rntm_set_pool + ( + siz_t index, + array_t* restrict array, + rntm_t* restrict rntm + ); + +void* bli_sba_acquire + ( + rntm_t* restrict rntm, + siz_t req_size + ); +void bli_sba_release + ( + rntm_t* restrict rntm, + void* restrict block + ); + + +#endif + diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c5c2acc32..6971dd7a1 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -922,15 +922,32 @@ typedef enum // // These headers must be included here (or earlier) because definitions they -// provide are needed in the pool_t and membrk_t structs. +// provide are needed in the pool_t and related structs. 
#include "bli_pthread.h" #include "bli_malloc.h" +// -- Array type -- + +typedef struct +{ + void* buf; + + siz_t num_elem; + siz_t elem_size; + + //malloc_ft malloc_fp; + //free_ft free_fp; + +} array_t; + + // -- Pool block type -- typedef struct { - void* buf; + void* buf; + siz_t block_size; + } pblk_t; @@ -938,7 +955,7 @@ typedef struct typedef struct { - pblk_t* block_ptrs; + void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; @@ -953,7 +970,19 @@ typedef struct } pool_t; -// -- Memory broker object type -- +// -- small block allocator: Locked pool-of-arrays-of-pools type -- + +typedef struct +{ + bli_pthread_mutex_t mutex; + pool_t pool; + + siz_t def_array_len; + +} apool_t; + + +// -- packing block allocator: Locked set of pools type -- typedef struct membrk_s { @@ -975,7 +1004,6 @@ typedef struct mem_s pblk_t pblk; packbuf_t buf_type; pool_t* pool; - membrk_t* membrk; siz_t size; } mem_t; @@ -1199,7 +1227,6 @@ typedef struct cntx_s pack_t schema_b_panel; pack_t schema_c_panel; - membrk_t* membrk; } cntx_t; @@ -1207,9 +1234,18 @@ typedef struct cntx_s typedef struct rntm_s { + // "External" fields: these may be queried by the end-user. dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; + // "Internal" fields: these should not be exposed to the end-user. + + // The small block pool, which is attached in the l3 thread decorator. + pool_t* sba_pool; + + // The packing block allocator, which is attached in the l3 thread decorator. 
+ membrk_t* membrk; + } rntm_t; @@ -1296,28 +1332,31 @@ typedef enum // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), - // Memory allocator errors - BLIS_INVALID_PACKBUF = (-120), - BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-122), - BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-123), - BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-124), - BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-125), + // Memory errors + BLIS_MALLOC_RETURNED_NULL = (-120), + + // Internal memory pool errors + BLIS_INVALID_PACKBUF = (-130), + BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), + BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), + BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), + BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors - BLIS_EXPECTED_OBJECT_ALIAS = (-130), + BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors - BLIS_INVALID_ARCH_ID = (-140), + BLIS_INVALID_ARCH_ID = (-150), // Blocksize-related errors - BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-150), - BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-151), - BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-152), - BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-153), - BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-154), - BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-155), + BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), + BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), + BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), + BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), + BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), + BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), - BLIS_ERROR_CODE_MAX = (-160) + BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif diff --git a/frame/include/blis.h b/frame/include/blis.h index c1c6d7c6c..3177e6d7b 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -6,6 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2018, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -104,6 +105,9 @@ extern "C" { #include "bli_ind.h" #include "bli_membrk.h" #include "bli_pool.h" +#include "bli_array.h" +#include "bli_apool.h" +#include "bli_sba.h" #include "bli_memsys.h" #include "bli_mem.h" #include "bli_part.h" diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 02f3948ce..c83ab2d83 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -94,9 +94,11 @@ void PASTEMAC(opname,imeth) \ cntx_t cntx_l; \ if ( ind == BLIS_3MH || ind == BLIS_4MH ) { cntx_l = *cntx; cntx = &cntx_l; } \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -185,9 +187,11 @@ void PASTEMAC(opname,imeth) \ cntx_t cntx_l; \ if ( ind == BLIS_3MH || ind == BLIS_4MH ) { cntx_l = *cntx; cntx = &cntx_l; } \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Some induced methods execute in multiple "stages". 
*/ \ for ( i = 0; i < nstage; ++i ) \ @@ -274,9 +278,11 @@ void PASTEMAC(opname,imeth) \ cntx_t cntx_l; \ if ( ind == BLIS_3MH || ind == BLIS_4MH ) { cntx_l = *cntx; cntx = &cntx_l; } \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -348,9 +354,11 @@ void PASTEMAC(opname,imeth) \ _cntx_init() function. */ \ cntx = bli_gks_query_ind_cntx( ind, dt ); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Some induced methods execute in multiple "stages". */ \ for ( i = 0; i < nstage; ++i ) \ @@ -408,9 +416,11 @@ void PASTEMAC(opname,imeth) \ _cntx_init() function. */ \ cntx = bli_gks_query_ind_cntx( ind, dt ); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. 
*/ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ { \ /* NOTE: trsm cannot be implemented via any induced method that diff --git a/frame/ind/oapi/bli_l3_ind_oapi.c b/frame/ind/oapi/bli_l3_ind_oapi.c index 78fb0762e..17a5ef38e 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.c +++ b/frame/ind/oapi/bli_l3_ind_oapi.c @@ -56,9 +56,11 @@ void PASTEMAC(opname,imeth) \ num_t dt = bli_obj_dt( c ); \ PASTECH(opname,_oft) func = PASTEMAC(opname,ind_get_avail)( dt ); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ func( alpha, a, b, beta, c, cntx, rntm ); \ } @@ -90,9 +92,11 @@ void PASTEMAC(opname,imeth) \ num_t dt = bli_obj_dt( c ); \ PASTECH(opname,_oft) func = PASTEMAC(opname,ind_get_avail)( dt ); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ func( side, alpha, a, b, beta, c, cntx, rntm ); \ } @@ -122,9 +126,11 @@ void PASTEMAC(opname,imeth) \ num_t dt = bli_obj_dt( c ); \ PASTECH(opname,_oft) func = PASTEMAC(opname,ind_get_avail)( dt ); \ \ - /* Initialize a local runtime with global settings if necessary. 
*/ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ func( alpha, a, beta, c, cntx, rntm ); \ } @@ -153,9 +159,11 @@ void PASTEMAC(opname,imeth) \ num_t dt = bli_obj_dt( b ); \ PASTECH(opname,_oft) func = PASTEMAC(opname,ind_get_avail)( dt ); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ func( side, alpha, a, b, cntx, rntm ); \ } diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 16590789d..826912949 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -61,9 +61,11 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Invoke the operation's front end. 
*/ \ PASTEMAC(opname,_front) \ @@ -103,9 +105,11 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ @@ -139,9 +143,11 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ @@ -174,9 +180,11 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. 
*/ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ @@ -208,9 +216,11 @@ void PASTEMAC(opname,imeth) \ /* Obtain a valid (native) context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - /* Initialize a local runtime with global settings if necessary. */ \ + /* Initialize a local runtime with global settings if necessary. Note + that in the case that a runtime is passed in, we make a local copy. */ \ rntm_t rntm_l; \ - if ( rntm == NULL ) { rntm = &rntm_l; bli_thread_init_rntm( rntm ); } \ + if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \ + else { rntm_l = *rntm; rntm = &rntm_l; } \ \ /* Invoke the operation's front end. */ \ PASTEMAC(opname,_front) \ diff --git a/frame/thread/bli_pthread.c b/frame/thread/bli_pthread.c index d07a39361..03b44a585 100644 --- a/frame/thread/bli_pthread.c +++ b/frame/thread/bli_pthread.c @@ -6,6 +6,7 @@ Copyright (C) 2018, Southern Methodist University Copyright (C) 2018, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -42,39 +43,57 @@ // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
-int bli_pthread_mutex_init( bli_pthread_mutex_t* mutex, - const bli_pthread_mutexattr_t* attr ) +int bli_pthread_mutex_init + ( + bli_pthread_mutex_t* mutex, + const bli_pthread_mutexattr_t* attr + ) { if ( attr ) return EINVAL; InitializeSRWLock( mutex ); return 0; } -int bli_pthread_mutex_destroy( bli_pthread_mutex_t* mutex ) +int bli_pthread_mutex_destroy + ( + bli_pthread_mutex_t* mutex + ) { return 0; } -int bli_pthread_mutex_lock( bli_pthread_mutex_t* mutex ) +int bli_pthread_mutex_lock + ( + bli_pthread_mutex_t* mutex + ) { AcquireSRWLockExclusive( mutex ); return 0; } -int bli_pthread_mutex_trylock( bli_pthread_mutex_t* mutex ) +int bli_pthread_mutex_trylock + ( + bli_pthread_mutex_t* mutex + ) { return TryAcquireSRWLockExclusive( mutex ) ? 0 : EBUSY; } -int bli_pthread_mutex_unlock( bli_pthread_mutex_t* mutex ) +int bli_pthread_mutex_unlock + ( + bli_pthread_mutex_t* mutex + ) { ReleaseSRWLockExclusive( mutex ); return 0; } -static BOOL bli_init_once_wrapper( bli_pthread_once_t* once, - void* param, - void** context) +static BOOL bli_init_once_wrapper + ( + bli_pthread_once_t* once, + void* param, + void** context + ) { ( void )once; ( void )context; @@ -83,33 +102,49 @@ static BOOL bli_init_once_wrapper( bli_pthread_once_t* once, return TRUE; } -void bli_pthread_once( bli_pthread_once_t* once, void (*init)(void) ) +void bli_pthread_once + ( + bli_pthread_once_t* once, + void (*init)(void) + ) { InitOnceExecuteOnce( once, bli_init_once_wrapper, init, NULL ); } -int bli_pthread_cond_init( bli_pthread_cond_t* cond, - const bli_pthread_condattr_t* attr ) +int bli_pthread_cond_init + ( + bli_pthread_cond_t* cond, + const bli_pthread_condattr_t* attr + ) { if ( attr ) return EINVAL; InitializeConditionVariable( cond ); return 0; } -int bli_pthread_cond_destroy( bli_pthread_cond_t* cond ) +int bli_pthread_cond_destroy + ( + bli_pthread_cond_t* cond + ) { ( void )cond; return 0; } -int bli_pthread_cond_wait( bli_pthread_cond_t* cond, - bli_pthread_mutex_t* mutex 
) +int bli_pthread_cond_wait + ( + bli_pthread_cond_t* cond, + bli_pthread_mutex_t* mutex + ) { if ( !SleepConditionVariableSRW( cond, mutex, INFINITE, 0 ) ) return EAGAIN; return 0; } -int bli_pthread_cond_broadcast( bli_pthread_cond_t* cond ) +int bli_pthread_cond_broadcast + ( + bli_pthread_cond_t* cond + ) { WakeAllConditionVariable( cond ); return 0; @@ -120,19 +155,26 @@ typedef struct void* (*start_routine)( void* ); void* param; void** retval; + } bli_thread_param; -static DWORD bli_thread_func( void* param_ ) +static DWORD bli_thread_func + ( + void* param_ + ) { bli_thread_param* param = param_; *param->retval = param->start_routine( param->param ); return 0; } -int bli_pthread_create( bli_pthread_t* thread, - const bli_pthread_attr_t* attr, - void* (*start_routine)(void*), - void* arg ) +int bli_pthread_create + ( + bli_pthread_t* thread, + const bli_pthread_attr_t* attr, + void* (*start_routine)(void*), + void* arg + ) { if ( attr ) return EINVAL; bli_thread_param param = { start_routine, arg, &thread->retval }; @@ -141,8 +183,11 @@ int bli_pthread_create( bli_pthread_t* thread, return 0; } -int bli_pthread_join( bli_pthread_t thread, - void** retval ) +int bli_pthread_join + ( + bli_pthread_t thread, + void** retval + ) { if ( !WaitForSingleObject( thread.handle, INFINITE ) ) return EAGAIN; if ( retval ) *retval = thread.retval; diff --git a/frame/thread/bli_thrcomm.c b/frame/thread/bli_thrcomm.c index 449561b4c..b1419bbb2 100644 --- a/frame/thread/bli_thrcomm.c +++ b/frame/thread/bli_thrcomm.c @@ -36,18 +36,18 @@ void* bli_thrcomm_bcast ( - thrcomm_t* comm, dim_t id, - void* to_send + void* to_send, + thrcomm_t* comm ) { if ( comm == NULL || comm->n_threads == 1 ) return to_send; if ( id == 0 ) comm->sent_object = to_send; - bli_thrcomm_barrier( comm, id ); + bli_thrcomm_barrier( id, comm ); void* object = comm->sent_object; - bli_thrcomm_barrier( comm, id ); + bli_thrcomm_barrier( id, comm ); return object; } @@ -71,7 +71,7 @@ void* bli_thrcomm_bcast 
#endif -void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ) +void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm ) { // Return early if the comm is NULL or if there is only one // thread participating. diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index bbd62f00e..ef1e5a12a 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -55,14 +55,14 @@ static dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) // Thread communicator prototypes. -thrcomm_t* bli_thrcomm_create( dim_t n_threads ); -void bli_thrcomm_free( thrcomm_t* comm ); -void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ); +thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); +void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); +void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); -void bli_thrcomm_barrier( thrcomm_t* comm, dim_t thread_id ); -void* bli_thrcomm_bcast( thrcomm_t* comm, dim_t inside_id, void* to_send ); +void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); +void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); -void bli_thrcomm_barrier_atomic( thrcomm_t* comm, dim_t t_id ); +void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index f11a92b56..4423f83ff 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -37,28 +37,35 @@ #ifdef BLIS_ENABLE_OPENMP -thrcomm_t* bli_thrcomm_create( dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_malloc_intl( sizeof(thrcomm_t) ); - bli_thrcomm_init( comm, n_threads ); + thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); + + bli_thrcomm_init( n_threads, comm ); return comm; } -void 
bli_thrcomm_free( thrcomm_t* comm ) +void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) { if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); - bli_free_intl( comm ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_thrcomm_free(): " ); + #endif + + bli_sba_release( rntm, comm ); } #ifndef BLIS_TREE_BARRIER -void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) +void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; comm->sent_object = NULL; @@ -75,7 +82,7 @@ void bli_thrcomm_cleanup( thrcomm_t* comm ) //'Normal' barrier for openmp //barrier routine taken from art of multicore programming -void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) +void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) { #if 0 if ( comm == NULL || comm->n_threads == 1 ) @@ -97,12 +104,12 @@ void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) while ( *listener == my_sense ) {} } #endif - bli_thrcomm_barrier_atomic( comm, t_id ); + bli_thrcomm_barrier_atomic( t_id, comm ); } #else -void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) +void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; comm->sent_object = NULL; @@ -176,7 +183,7 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier ) return; } -void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) +void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) { bli_thrcomm_tree_barrier( comm->barriers[t_id] ); } @@ -207,6 +214,7 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) #endif + // Define a dummy function bli_l3_thread_entry(), which is needed in the // pthreads version, so that when building Windows DLLs (with OpenMP enabled // or no multithreading) we don't risk having an unresolved symbol. @@ -240,22 +248,54 @@ void bli_l3_thread_decorator bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); // Query the total number of threads from the rntm_t object. 
- dim_t n_threads = bli_rntm_num_threads( rntm ); - - // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + const dim_t n_threads = bli_rntm_num_threads( rntm ); #ifdef PRINT_THRINFO thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) ); #endif + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. + bli_membrk_rntm_set_membrk( rntm ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + _Pragma( "omp parallel num_threads(n_threads)" ) { - dim_t tid = omp_get_thread_num(); + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Query the thread's id from OpenMP. + const dim_t tid = omp_get_thread_num(); // Check for a somewhat obscure OpenMP thread-mismatch issue. 
- bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm ); + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + bli_sba_rntm_set_pool( tid, array, rntm_p ); + obj_t a_t, b_t, c_t; cntl_t* cntl_use; @@ -272,10 +312,10 @@ void bli_l3_thread_decorator // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, cntl, &cntl_use ); + &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm, cntl_use, &thread ); + bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); func ( @@ -285,21 +325,20 @@ void bli_l3_thread_decorator beta, &c_t, cntx, - rntm, + rntm_p, cntl_use, thread ); // Free the thread's local control tree. - bli_l3_cntl_free( cntl_use, thread ); + bli_l3_cntl_free( rntm_p, cntl_use, thread ); #ifdef PRINT_THRINFO threads[tid] = thread; #else // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( thread ); + bli_l3_thrinfo_free( rntm_p, thread ); #endif - } // We shouldn't free the global communicator since it was already freed @@ -308,9 +347,14 @@ void bli_l3_thread_decorator #ifdef PRINT_THRINFO bli_l3_thrinfo_print_paths( threads ); - bli_l3_thrinfo_free_paths( threads ); exit(1); + //bli_l3_thrinfo_free_paths( rntm_p, threads ); #endif + + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. 
+ bli_sba_checkin_array( array ); } // ----------------------------------------------------------------------------- @@ -348,8 +392,8 @@ void bli_l3_thread_decorator_thread_check // if the number of threads in the current region is 1. If, for // example, BLIS requested 4 threads but only got 3, then we // abort(). - if ( tid == 0 ) - { + //if ( tid == 0 ) + //{ if ( n_threads_real != 1 ) { bli_print_msg( "A different number of threads was " @@ -359,10 +403,10 @@ void bli_l3_thread_decorator_thread_check } //n_threads = 1; // not needed since it has no effect? - bli_thrcomm_init( gl_comm, 1 ); + bli_thrcomm_init( 1, gl_comm ); bli_rntm_set_num_threads_only( 1, rntm ); bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); - } + //} // Synchronize all threads and continue. _Pragma( "omp barrier" ) diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 65955b826..975c5eb88 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -37,32 +37,35 @@ #ifdef BLIS_ENABLE_PTHREADS -thrcomm_t* bli_thrcomm_create( dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_malloc_intl( sizeof(thrcomm_t) ); - bli_thrcomm_init( comm, n_threads ); + thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); + + bli_thrcomm_init( n_threads, comm ); + return comm; } -void bli_thrcomm_free( thrcomm_t* comm ) +void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) { if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_free(): " ); #endif - bli_free_intl( comm ); + bli_sba_release( rntm, comm ); } #ifdef BLIS_USE_PTHREAD_BARRIER -void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) +void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; 
comm->sent_object = NULL; @@ -76,14 +79,14 @@ void bli_thrcomm_cleanup( thrcomm_t* comm ) bli_pthread_barrier_destroy( &comm->barrier ); } -void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) +void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) { bli_pthread_barrier_wait( &comm->barrier ); } #else -void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads) +void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; comm->sent_object = NULL; @@ -104,7 +107,7 @@ void bli_thrcomm_cleanup( thrcomm_t* comm ) //#endif } -void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) +void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) { #if 0 if ( comm == NULL || comm->n_threads == 1 ) return; @@ -130,7 +133,7 @@ void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) while( *listener == my_sense ) {} } #endif - bli_thrcomm_barrier_atomic( comm, t_id ); + bli_thrcomm_barrier_atomic( t_id, comm ); } #endif @@ -151,8 +154,9 @@ typedef struct thread_data cntx_t* cntx; rntm_t* rntm; cntl_t* cntl; - dim_t id; + dim_t tid; thrcomm_t* gl_comm; + array_t* array; } thread_data_t; // Entry point for additional threads @@ -172,9 +176,22 @@ void* bli_l3_thread_entry( void* data_void ) cntx_t* cntx = data->cntx; rntm_t* rntm = data->rntm; cntl_t* cntl = data->cntl; - dim_t id = data->id; + dim_t tid = data->tid; + array_t* array = data->array; thrcomm_t* gl_comm = data->gl_comm; + // Create a thread-local copy of the master thread's rntm_t. This is + // necessary since we want each thread to be able to track its own + // small block pool_t as it executes down the function stack. + rntm_t rntm_l = *rntm; + rntm_t* restrict rntm_p = &rntm_l; + + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. + // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. 
+ bli_sba_rntm_set_pool( tid, array, rntm_p ); + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; @@ -190,10 +207,10 @@ void* bli_l3_thread_entry( void* data_void ) // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, cntl, &cntl_use ); + &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( id, gl_comm, rntm, cntl_use, &thread ); + bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); func ( @@ -203,16 +220,16 @@ void* bli_l3_thread_entry( void* data_void ) beta, &c_t, cntx, - rntm, + rntm_p, cntl_use, thread ); // Free the thread's local control tree. - bli_l3_cntl_free( cntl_use, thread ); + bli_l3_cntl_free( rntm_p, cntl_use, thread ); // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( thread ); + bli_l3_thrinfo_free( rntm_p, thread ); return NULL; } @@ -243,39 +260,66 @@ void bli_l3_thread_decorator bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); // Query the total number of threads from the rntm_t object. - dim_t n_threads = bli_rntm_num_threads( rntm ); + const dim_t n_threads = bli_rntm_num_threads( rntm ); + + // NOTE: The sba was initialized in bli_init(). + + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we have the rntm_t.sba_pool field + // initialized and ready for the global communicator creation below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. 
This will be + // inherited by all of the child threads when they make local copies of + // the rntm below. + bli_membrk_rntm_set_membrk( rntm ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. - bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads ); - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); // NOTE: We must iterate backwards so that the chief thread (thread id 0) // can spawn all other threads before proceeding with its own computation. - for ( dim_t id = n_threads - 1; 0 <= id; id-- ) + for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) { // Set up thread data for additional threads (beyond thread 0). 
- datas[id].func = func; - datas[id].family = family; - datas[id].schema_a = schema_a; - datas[id].schema_b = schema_b; - datas[id].alpha = alpha; - datas[id].a = a; - datas[id].b = b; - datas[id].beta = beta; - datas[id].c = c; - datas[id].cntx = cntx; - datas[id].rntm = rntm; - datas[id].cntl = cntl; - datas[id].id = id; - datas[id].gl_comm = gl_comm; + datas[tid].func = func; + datas[tid].family = family; + datas[tid].schema_a = schema_a; + datas[tid].schema_b = schema_b; + datas[tid].alpha = alpha; + datas[tid].a = a; + datas[tid].b = b; + datas[tid].beta = beta; + datas[tid].c = c; + datas[tid].cntx = cntx; + datas[tid].rntm = rntm; + datas[tid].cntl = cntl; + datas[tid].tid = tid; + datas[tid].gl_comm = gl_comm; + datas[tid].array = array; // Spawn additional threads for ids greater than 1. - if ( id != 0 ) - bli_pthread_create( &pthreads[id], NULL, &bli_l3_thread_entry, &datas[id] ); + if ( tid != 0 ) + bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] ); else bli_l3_thread_entry( ( void* )(&datas[0]) ); } @@ -285,15 +329,26 @@ void bli_l3_thread_decorator // (called from the thread entry function). // Thread 0 waits for additional threads to finish. - for ( dim_t id = 1; id < n_threads; id++ ) + for ( dim_t tid = 1; tid < n_threads; tid++ ) { - bli_pthread_join( pthreads[id], NULL ); + bli_pthread_join( pthreads[tid], NULL ); } + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. 
+ bli_sba_checkin_array( array ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif bli_free_intl( pthreads ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_l3_thread_decorator().pth: " ); + #endif bli_free_intl( datas ); } - #endif diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index a12bd3966..969221e7c 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -38,30 +38,33 @@ #ifndef BLIS_ENABLE_MULTITHREADING //Constructors and destructors for constructors -thrcomm_t* bli_thrcomm_create( dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_malloc_intl( sizeof( thrcomm_t ) ); - bli_thrcomm_init( comm, n_threads ); + thrcomm_t* comm = bli_sba_acquire( rntm, sizeof( thrcomm_t ) ); + + bli_thrcomm_init( n_threads, comm ); + return comm; } -void bli_thrcomm_free( thrcomm_t* comm ) +void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) { if ( comm == NULL ) return; + bli_thrcomm_cleanup( comm ); - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_free(): " ); #endif - bli_free_intl( comm ); + bli_sba_release( rntm, comm ); } -void bli_thrcomm_init( thrcomm_t* comm, dim_t n_threads ) +void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; @@ -76,7 +79,7 @@ void bli_thrcomm_cleanup( thrcomm_t* comm ) if ( comm == NULL ) return; } -void bli_thrcomm_barrier( thrcomm_t* comm, dim_t t_id ) +void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) { return; } @@ -112,53 +115,88 @@ void bli_l3_thread_decorator bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); // For sequential execution, we use only one thread. - dim_t n_threads = 1; - dim_t id = 0; + const dim_t n_threads = 1; + + // NOTE: The sba was initialized in bli_init(). 
+ + // Check out an array_t from the small block allocator. This is done + // with an internal lock to ensure only one application thread accesses + // the sba at a time. bli_sba_checkout_array() will also automatically + // resize the array_t, if necessary. + array_t* restrict array = bli_sba_checkout_array( n_threads ); + + // Access the pool_t* for thread 0 and embed it into the rntm. We do + // this up-front only so that we can create the global comm below. + bli_sba_rntm_set_pool( 0, array, rntm ); + + // Set the packing block allocator field of the rntm. + bli_membrk_rntm_set_membrk( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); - cntl_t* cntl_use; - thrinfo_t* thread; - // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't - // need to alias objects for A, B, and C since they were already aliased - // in bli_*_front(). However, we may add aliasing here in the future so - // that, with all three (_single.c, _openmp.c, _pthreads.c) implementations - // consistently providing local aliases, we can then eliminate aliasing - // elsewhere. + { + // NOTE: We don't need to create another copy of the rntm_t since + // it was already copied in one of the high-level oapi functions. + rntm_t* restrict rntm_p = rntm; - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, schema_a, schema_b, - a, b, c, cntl, &cntl_use ); + cntl_t* cntl_use; + thrinfo_t* thread; - // Create the root node of the thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( id, gl_comm, rntm, cntl_use, &thread ); + const dim_t tid = 0; - func - ( - alpha, - a, - b, - beta, - c, - cntx, - rntm, - cntl_use, - thread - ); + // Use the thread id to access the appropriate pool_t* within the + // array_t, and use it to set the sba_pool field within the rntm_t. 
+ // If the pool_t* element within the array_t is NULL, it will first + // be allocated/initialized. + // NOTE: This is commented out because, in the single-threaded case, + // this is redundant since it's already been done above. + //bli_sba_rntm_set_pool( tid, array, rntm_p ); - // Free the thread's local control tree. - bli_l3_cntl_free( cntl_use, thread ); + // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't + // need to alias objects for A, B, and C since they were already aliased + // in bli_*_front(). However, we may add aliasing here in the future so + // that, with all three (_single.c, _openmp.c, _pthreads.c) implementations + // consistently providing local aliases, we can then eliminate aliasing + // elsewhere. - // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( thread ); + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( family, schema_a, schema_b, + a, b, c, rntm_p, cntl, &cntl_use ); + + // Create the root node of the thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + rntm_p, + cntl_use, + thread + ); + + // Free the thread's local control tree. + bli_l3_cntl_free( rntm_p, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure. + bli_l3_thrinfo_free( rntm_p, thread ); + } // We shouldn't free the global communicator since it was already freed // by the global communicator's chief thread in bli_l3_thrinfo_free() // (called above). -} + // Check the array_t back into the small block allocator. Similar to the + // check-out, this is done using a lock embedded within the sba to ensure + // mutual exclusion. 
+ bli_sba_checkin_array( array ); +} #endif diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index ed68897e2..58ba57e81 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -46,7 +46,7 @@ static rntm_t global_rntm; void bli_thread_init( void ) { - bli_thrcomm_init( &BLIS_SINGLE_COMM, 1 ); + bli_thrcomm_init( 1, &BLIS_SINGLE_COMM ); bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 0af09e0e6..0dcaae2d8 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -37,6 +37,7 @@ thrinfo_t* bli_thrinfo_create ( + rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, @@ -45,11 +46,11 @@ thrinfo_t* bli_thrinfo_create thrinfo_t* sub_node ) { - #ifdef ENABLE_MEM_DEBUG + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrinfo_create(): " ); #endif - thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); + thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) ); bli_thrinfo_init ( @@ -99,11 +100,47 @@ void bli_thrinfo_init_single ); } +void bli_thrinfo_free + ( + rntm_t* rntm, + thrinfo_t* thread + ) +{ + if ( thread == NULL || + thread == &BLIS_PACKM_SINGLE_THREADED || + thread == &BLIS_GEMM_SINGLE_THREADED + ) return; + + thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); + + // Free the communicators, but only if the current thrinfo_t struct + // is marked as needing them to be freed. The most common example of + // thrinfo_t nodes NOT marked as needing their comms freed are those + // associated with packm thrinfo_t nodes. + if ( bli_thrinfo_needs_free_comm( thread ) ) + { + // The ochief always frees his communicator, and the ichief free its + // communicator if we are at the leaf node. + if ( bli_thread_am_ochief( thread ) ) + bli_thrcomm_free( rntm, bli_thrinfo_ocomm( thread ) ); + } + + // Recursively free all children of the current thrinfo_t. 
+ bli_thrinfo_free( rntm, thrinfo_sub_node ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_thrinfo_free(): " ); + #endif + + // Free the thrinfo_t struct. + bli_sba_release( rntm, thread ); +} + // ----------------------------------------------------------------------------- #include "assert.h" -#define BLIS_NUM_STATIC_COMMS 18 +#define BLIS_NUM_STATIC_COMMS 80 thrinfo_t* bli_thrinfo_create_for_cntl ( @@ -118,12 +155,12 @@ thrinfo_t* bli_thrinfo_create_for_cntl thrinfo_t* thread_chl; - bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); + const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); - dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - dim_t parent_n_way = bli_thread_n_way( thread_par ); - dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - dim_t parent_work_id = bli_thread_work_id( thread_par ); + const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); + const dim_t parent_n_way = bli_thread_n_way( thread_par ); + const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); + const dim_t parent_work_id = bli_thread_work_id( thread_par ); dim_t child_nt_in; dim_t child_comm_id; @@ -162,7 +199,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl // object and store it in the array element corresponding to the // parent's work id. if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( child_nt_in ); + new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); bli_thread_obarrier( thread_par ); @@ -170,6 +207,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl // that was created by their chief, as identified by parent_work_id. thread_chl = bli_thrinfo_create ( + rntm, new_comms[ parent_work_id ], child_comm_id, child_n_way, @@ -259,6 +297,7 @@ thrinfo_t* bli_thrinfo_rgrow // freed when thread_seg, or one of its descendents, is freed. 
thread_cur = bli_thrinfo_create ( + rntm, bli_thrinfo_ocomm( thread_seg ), bli_thread_ocomm_id( thread_seg ), bli_cntl_calc_num_threads_in( rntm, cntl_cur ), diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 934b4ff96..44e8d73cc 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -122,12 +122,12 @@ static void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) static void* bli_thread_obroadcast( thrinfo_t* t, void* p ) { - return bli_thrcomm_bcast( t->ocomm, t->ocomm_id, p ); + return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } static void bli_thread_obarrier( thrinfo_t* t ) { - bli_thrcomm_barrier( t->ocomm, t->ocomm_id ); + bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } @@ -137,6 +137,7 @@ static void bli_thread_obarrier( thrinfo_t* t ) thrinfo_t* bli_thrinfo_create ( + rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, @@ -161,6 +162,12 @@ void bli_thrinfo_init_single thrinfo_t* thread ); +void bli_thrinfo_free + ( + rntm_t* rntm, + thrinfo_t* thread + ); + // ----------------------------------------------------------------------------- thrinfo_t* bli_thrinfo_create_for_cntl diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index d87be6091..3be61f882 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -458,7 +458,7 @@ void GENBARNAME(cntx_init) //bli_cntx_set_anti_pref( FALSE, cntx ); - bli_cntx_set_membrk( bli_memsys_global_membrk(), cntx ); + //bli_cntx_set_membrk( bli_membrk_query(), cntx ); } // ----------------------------------------------------------------------------- diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 46f157bf4..6d2f028d2 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -320,7 +320,7 @@ void libblis_test_gemmtrsm_ukr_experiment // allocated. 
void* buf_ap = bli_obj_buffer( &ap ); void* buf_bp = bli_obj_buffer( &bp ); - bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, + bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, BLIS_MR, BLIS_KR, &a, &ap, cntx ); bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, @@ -351,8 +351,10 @@ void libblis_test_gemmtrsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, &a11p ); -//bli_printm( "a", &a, "%4.1f", "" ); -//bli_printm( "ap", &ap, "%4.1f", "" ); +#if 0 +bli_printm( "a", &a, "%5.2f", "" ); +bli_printm( "ap", &ap, "%5.2f", "" ); +#endif // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -493,6 +495,10 @@ void libblis_test_gemmtrsm_ukr_check bli_gemv( &BLIS_ONE, b11, &t, &BLIS_ZERO, &v ); +#if 0 +bli_printm( "a11", a11, "%5.2f", "" ); +#endif + // Restore the diagonal of a11 to its original, un-inverted state // (needed for trsv). bli_invertd( a11 ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index caf1c2e17..96c705c9a 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -84,7 +84,7 @@ int main( int argc, char** argv ) libblis_test_thread_decorator( &params, &ops ); // Finalize libblis. - //bli_finalize(); + bli_finalize(); // Return peacefully. return 0; @@ -126,13 +126,25 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops ) // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions.
+ + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "libblis_test_thread_decorator(): " ); + #endif bli_pthread_t* pthread = bli_malloc_intl( sizeof( bli_pthread_t ) * nt ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "libblis_test_thread_decorator(): " ); + #endif thread_data_t* tdata = bli_malloc_intl( sizeof( thread_data_t ) * nt ); // Allocate a mutex for the threads to share. //bli_pthread_mutex_t* mutex = bli_malloc_intl( sizeof( bli_pthread_mutex_t ) ); // Allocate a barrier for the threads to share. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "libblis_test_thread_decorator(): " ); + #endif bli_pthread_barrier_t* barrier = bli_malloc_intl( sizeof( bli_pthread_barrier_t ) ); // Initialize the mutex. @@ -175,8 +187,20 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops ) bli_pthread_barrier_destroy( barrier ); // Free the pthread-related memory. + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "libblis_test_thread_decorator(): " ); + #endif bli_free_intl( pthread ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "libblis_test_thread_decorator(): " ); + #endif bli_free_intl( tdata ); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "libblis_test_thread_decorator(): " ); + #endif //bli_free_intl( mutex ); bli_free_intl( barrier ); } @@ -837,8 +861,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "Max stack buffer size (bytes) %d\n", ( int )bli_info_get_stack_buf_max_size() ); libblis_test_fprintf_c( os, "Page size (bytes) %d\n", ( int )bli_info_get_page_size() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "memory pools for pack buffers\n" ); - libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_packbuf_pools() ); + libblis_test_fprintf_c( os, "memory pools\n" ); + libblis_test_fprintf_c( os, " enabled for packing blocks? %d\n", ( int )bli_info_get_enable_pba_pools() ); + libblis_test_fprintf_c( os, " enabled for small blocks? 
%d\n", ( int )bli_info_get_enable_sba_pools() ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "memory alignment (bytes) \n" ); libblis_test_fprintf_c( os, " stack address %d\n", ( int )bli_info_get_stack_buf_align_size() ); @@ -2589,6 +2614,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c +#if 0 cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) { bool_t does_inv_diag; @@ -2600,6 +2626,7 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia // Create a control tree node for the packing operation. cntl_t* cntl = bli_packm_cntl_create_node ( + NULL, // we don't need the small block allocator from the runtime. NULL, // func ptr is not referenced b/c we don't call via l3 _int(). bli_packm_blk_var1, bmult_id_m, @@ -2625,7 +2652,7 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia // mem_t entry later on. return cntl; } - +#endif void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ) diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 5a48fa5fe..5476e1daf 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -171,6 +171,7 @@ void libblis_test_trsm_ukr_experiment num_t datatype; dim_t m, n; + inc_t ldap, ldbp; char sc_a = 'c'; char sc_b = 'r'; @@ -195,6 +196,11 @@ void libblis_test_trsm_ukr_experiment m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); + // Also query PACKMR and PACKNR as the leading dimensions to ap and bp, + // respectively. + ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx ); + ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx ); + // Store the register blocksizes so that the driver can retrieve the // values later when printing results. 
op->dim_aux[0] = m; @@ -232,6 +238,7 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); +#if 0 // Create pack objects for a and b, and pack them to ap and bp, // respectively. cntl_t* cntl_a = libblis_test_pobj_create @@ -254,17 +261,52 @@ void libblis_test_trsm_ukr_experiment &b, &bp, cntx ); +#endif + + // Create the packed objects. Use packmr and packnr as the leading + // dimensions of ap and bp, respectively. + bli_obj_create( datatype, m, m, 1, ldap, &ap ); + bli_obj_create( datatype, m, n, ldbp, 1, &bp ); + + // Set up the objects for packing. Calling packm_init_pack() does everything + // except checkout a memory pool block and save its address to the obj_t's. + // However, it does overwrite the buffer field of packed object with that of + // the source object. So, we have to save the buffer address that was + // allocated. + void* buf_ap = bli_obj_buffer( &ap ); + void* buf_bp = bli_obj_buffer( &bp ); + bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS, + BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, + BLIS_MR, BLIS_KR, &a, &ap, cntx ); + bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS, + BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER, + BLIS_KR, BLIS_NR, &b, &bp, cntx ); + bli_obj_set_buffer( buf_ap, &ap ); + bli_obj_set_buffer( buf_bp, &bp ); + + // Set the diagonal offset of ap. + bli_obj_set_diag_offset( 0, &ap ); // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _ukernel() wrapper needs this information to // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, &ap ); + // Pack the data from the source objects. 
+ bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); + +#if 0 +bli_printm( "a", &a, "%5.2f", "" ); +bli_printm( "ap", &ap, "%5.2f", "" ); +#endif + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + //bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED ); bli_copym( &c_save, &c ); @@ -282,15 +324,17 @@ void libblis_test_trsm_ukr_experiment if ( bli_obj_is_complex( &b ) ) *perf *= 4.0; // Perform checks. - libblis_test_trsm_ukr_check( params, side, &a, &c, &b, resid ); + libblis_test_trsm_ukr_check( params, side, &ap, &c, &b, resid ); // Zero out performance and residual if output matrix is empty. - libblis_test_check_empty_problem( &c, perf, resid ); + //libblis_test_check_empty_problem( &c, perf, resid ); +#if 0 // Free the control tree nodes and release their cached mem_t entries // back to the memory broker. - bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); - bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( NULL, cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( NULL, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); +#endif // Free the test objects. bli_obj_free( &a ); @@ -392,6 +436,14 @@ void libblis_test_trsm_ukr_check bli_gemv( &BLIS_ONE, b, &t, &BLIS_ZERO, &v ); +#if 0 +bli_printm( "a11", a, "%5.2f", "" ); +#endif + + // Restore the diagonal of a11 to its original, un-inverted state + // (needed for trsv). + bli_invertd( a ); + if ( bli_is_left( side ) ) { bli_gemv( &BLIS_ONE, b_orig, &t, &BLIS_ZERO, &w );