diff --git a/CHANGELOG b/CHANGELOG index 539067456..a361ceac3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,1054 @@ -commit 898614a555ea0aa7de4ca07bb3cb8f5708b6a002 (HEAD -> master, tag: 0.2.0) +commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (HEAD -> master, tag: 0.2.1) +Author: Field G. Van Zee +Date: Wed Oct 5 14:41:34 2016 -0500 + + Version file update (0.2.1) + +commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 (origin/master) +Merge: 8696987 6f71cd3 +Author: Field G. Van Zee +Date: Wed Oct 5 13:35:01 2016 -0500 + + Merge branch 'compose' + +commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose) +Merge: c0630c4 8d55033 +Author: Field G. Van Zee +Date: Tue Oct 4 15:53:46 2016 -0500 + + Merge pull request #94 from flame/distcomm + + Implemented distributed thrinfo_t management. + +commit 86969873b5b861966d717d8f9f370af39e3d9de6 +Author: Field G. Van Zee +Date: Tue Oct 4 14:24:59 2016 -0500 + + Reclassified amaxv operation as a level-1v kernel. + + Details: + - Moved amaxv from being a utility operation to being a level-1v operation. + This includes the establishment of a new amaxv kernel to live beside all + of the other level-1v kernels. + - Added two new functions to bli_part.c: + bli_acquire_mij() + bli_acquire_vi() + The first acquires a scalar object for the (i,j) element of a matrix, + and the second acquires a scalar object for the ith element of a vector. + - Added integer support to bli_getsc level-0 operation. This involved + adding integer support to the bli_*gets level-0 scalar macros. + - Added a new test module to test amaxv as a level-1v operation. The test + module works by comparing the value identified by bli_amaxv() to the + the value found from a reference-like code local to the test module + source file. In other words, it (intentionally) does not guarantee the + same index is found; only the same value. 
This allows for different + implementations in the case where a vector contains two or more elements + containing exactly the same floating point value (or values, in the case + of the complex domain). + - Removed the directory frame/include/old/. + +commit 8d55033c966feed99fcca2a58017c3ab5b1646dc (origin/distcomm) +Author: Field G. Van Zee +Date: Tue Sep 27 15:20:58 2016 -0500 + + Implemented distributed thrinfo_t management. + + Details: + - Implemented Ricardo Magana's distributed thread info/communicator + management. Rather than fully construct the thrinfo_t structures, from + root to leaf, prior to spawning threads, the threads individually + construct their thrinfo_t trees (or, chains), and do so incrementally, + as needed, reusing the same structure nodes during subsequent blocked + variant iterations. This required moving the initial creation of the + thrinfo_t structure (now, the root nodes) from the _front() functions + to the bli_l3_thread_decorator(). The incremental "growing" of the tree + is performed in the internal back-end (ie: _int()) function, and so is + mostly invisible. Also, the incremental growth of the thrinfo_t tree is + done as a function of the current and parent control tree nodes (as well + as the parent thrinfo_t node), further reinforcing the parallel + relationship between the two data structures. + - Removed the "inner" communicator from thrinfo_t structure definition, + as well as its id. Changed all APIs accordingly. Renamed + bli_thrinfo_needs_free_comms() to bli_thrinfo_needs_free_comm(). + - Defined bli_l3_thrinfo_print_paths(), which prints the information + in an array of thrinfo_t* structure pointers. (Used only as a + debugging/verification tool.) + - Deprecated the following thrinfo_t creation functions: + bli_packm_thrinfo_create() + bli_l3_thrinfo_create() + because they are no longer used. bli_thrinfo_create() is now called + directly when creating thrinfo_t nodes.
+ +commit fd04869ae4d4a3b0ebb9052557c296456bce7c0d +Author: Field G. Van Zee +Date: Tue Sep 27 14:14:11 2016 -0500 + + Changed configure's 'omp' threading to 'openmp'. + + Details: + - Changed the configure script so that the expected string argument to the + -t (or --enable-threading=) option that enables OpenMP multithreading is + 'openmp'. The previous expected string, 'omp', is still supported but + should be considered deprecated. + +commit 9424af87209e4e435e2e742430945152690170b0 +Merge: efa7341 c0630c4 +Author: Field G. Van Zee +Date: Tue Sep 27 12:51:08 2016 -0500 + + Merge branch 'compose' + +commit efa7341df0b0115926aa8a6e8a4ebfb24fdbf11e +Merge: 121c39d e1453f6 +Author: Field G. Van Zee +Date: Fri Sep 16 11:01:57 2016 -0500 + + Merge pull request #92 from ShadenSmith/readme_fix + + Fixes broken URL in README.md + +commit e1453f68f6afd90ae9a29b7a5faa46aa79bbf741 +Author: Shaden Smith +Date: Fri Sep 16 09:29:28 2016 -0500 + + Fixes broken URL in README.md + +commit c0630c4024b08750043a2942a3e8a037aa6b6259 (compose) +Author: Field G. Van Zee +Date: Mon Sep 12 13:59:02 2016 -0500 + + Added debugging printf()'s to bli_l3_thrinfo.c. + + Details: + - Added optional printf() statements to print out thread communicator + info as the thrinfo_t structure is built in bli_l3_thrinfo.c. + - Minor changes to frame/thread/bli_thrinfo.h. + +commit 7b3bf1ffcd7160ccbf6c2518af6d88f6742e4977 +Merge: 3550981 121c39d +Author: Field G. Van Zee +Date: Tue Sep 6 15:47:13 2016 -0500 + + Merge branch 'master' into compose + +commit 121c39d455f2db6f7ce6802ba7f73ad5e088c68c +Author: Field G. Van Zee +Date: Mon Sep 5 13:11:42 2016 -0500 + + Added complex gemm micro-kernels for haswell. + + Details: + - Defined cgemm (3x8) and zgemm (3x4) micro-kernels for haswell-based + architectures. 
As with their real domain brethren, these kernels prefer + row storage, (though this doesn't affect most users due to high-level + optimizations in most level-3 operations that induce a transpose to + whatever storage preference the kernel may have). + +commit 35509818cbea1598b123421f81c42120889a03c3 +Author: Field G. Van Zee +Date: Wed Aug 31 17:34:15 2016 -0500 + + Added, moved some thread barriers. + + Details: + - Removed thread barriers from the end of the loop bodies of + bli_gemm_blk_var1(), bli_gemm_blk_var2(), bli_trsm_blk_var1(), + and bli_trsm_blk_var2(). + - Moved the thread barrier at the end of bli_packm_int() to the + end of bli_l3_packm(), and added missing barriers to that function. + - Removed the no longer necessary (and now incorrect) ochief guard + in bli_gemm3m3_packa() on the bli_obj_scalar_reset() on C. + - Thanks to Tyler Smith for help with these changes. + +commit abd61f9fa75d77a96d1491b3e035451ee73238fe +Author: Field G. Van Zee +Date: Tue Aug 30 12:34:19 2016 -0500 + + Updated BLIS4 TOMS citation in README.md. + +commit 701b9aa3ff028decbf90efac0dca5bd64fe26269 +Author: Field G. Van Zee +Date: Fri Aug 26 19:04:45 2016 -0500 + + Redesigned control tree infrastructure. + + Details: + - Altered control tree node struct definitions so that all nodes have the + same struct definition, whose primary fields consist of a blocksize id, + a variant function pointer, a pointer to an optional parameter struct, + and a pointer to a (single) sub-node. This unified control tree type is + now named cntl_t. + - Changed the way control tree nodes are connected, and what computation + they represent, such that, for example, packing operations are now + associated with nodes that are "inline" in the tree, rather than off- + shoot branches.
The original tree for the classic Goto gemm algorithm was + expressed (roughly) as: + + blk_var2 -> blk_var3 -> blk_var1 -> ker_var2 + | | + -> packb -> packa + + and now, the same tree would look like: + + blk_var2 -> blk_var3 -> packb -> blk_var1 -> packa -> ker_var2 + + Specifically, the packb and packa nodes perform their respective packing + operations and then recurse (without any loop) to a subproblem. This means + there are now two kinds of level-3 control tree nodes: partitioning and + non-partitioning. The blocked variants are members of the former, because + they iteratively partition off submatrices and perform suboperations on + those partitions, while the packing variants belong to the latter group. + (This change has the effect of allowing greatly simplified initialization + of the nodes, which previously involved setting many unused node fields to + NULL.) + - Changed the way thrinfo_t tree nodes are arranged to mirror the new + connective structure of control trees. That is, packm nodes are no longer + off-shoot branches of the main algorithmic nodes, but rather connected + "inline". + - Simplified control tree creation functions. Partitioning nodes are created + concisely with just a few fields needing initialization. By contrast, the + packing nodes require additional parameters, which are stored in a + packm-specific struct that is tracked via the optional parameters pointer + within the control tree struct. (This parameter struct must always begin + with a uint64_t that contains the byte size of the struct. This allows + us to use a generic function to recursively copy control trees.) gemm, + herk, and trmm control tree creation continues to be consolidated into + a single function, with the operation family being used to select + among the parameter-agnostic macro-kernel wrappers. 
A single routine, + bli_cntl_free(), is provided to free control trees recursively, whereby + the chief thread within a group releases the blocks associated with + mem_t entries back to the memory broker from which they were acquired. + - Updated internal back-ends, e.g. bli_gemm_int(), to query and call the + function pointer stored in the current control tree node (rather than + index into a local function pointer array). Before being invoked, these + function pointers are first cast to a gemm_voft (for gemm, herk, or trmm + families) or trsm_voft (for trsm family) type, which is defined in + frame/3/bli_l3_var_oft.h. + - Retired herk and trmm internal back-ends, since all execution now flows + through gemm or trsm blocked variants. + - Merged forwards- and backwards-moving variants by querying the direction + from routines as a function of the variant's matrix operands. gemm and + herk always move forward, while trmm and trsm move in a direction that + is dependent on which operand (a or b) is triangular. + - Added functions bli_thread_get_range_mdim(), bli_thread_get_range_ndim(), + each of which takes additional arguments and hides complexity in managing + the difference between the way ranges are computed for the four families + of operations. + - Simplified level-3 blocked variants according to the above changes, so that + the only steps taken are: + 1. Query partitioning direction (forwards or backwards). + 2. Prune unreferenced regions, if they exist. + 3. Determine the thread partitioning sub-ranges. + + 4. Determine the partitioning blocksize (passing in the partitioning + direction) + 5. Acquire the current iteration's partitions for the matrices affected + by the current variant's partitioning dimension (m, k, n). + 6. Call the subproblem. + + - Instantiate control trees once per thread, per operation invocation.
+ (This is a change from the previous regime in which control trees were + treated as stateless objects, initialized with the library, and shared + as read-only objects between threads.) This once-per-thread allocation + is done primarily to allow threads to use the control tree as a place + to cache certain data for use in subsequent loop iterations. Presently, + the only application of this caching is a mem_t entry for the packing + blocks checked out from the memory broker (allocator). If a non-NULL + control tree is passed in by the (expert) user, then the tree is copied + by each thread. This is done in bli_l3_thread_decorator(), in + bli_thrcomm_*.c. + - Added a new field to the context, an opid_t, which tracks the "family" + of the operation being executed. For example, gemm, hemm, and symm are + all part of the gemm family, while herk, syrk, her2k, and syr2k are + all part of the herk family. Knowing the operation's family is necessary + when conditionally executing the internal (beta) scalar reset on + C in blocked variant 3, which is needed for gemm and herk families, + but must not be performed for the trmm family (because beta has only + been applied to the current row-panel of C after the first rank-kc + iteration). + - Reexpressed 3m3 induced method blocked variant in frame/3/gemm/ind + to conform with the new control tree design, and renamed the macro- + kernel codes corresponding to 3m2 and 4m1b. + - Renamed bli_mem.c (and its APIs) to bli_memsys.c, and renamed/relocated + bli_mem_macro_defs.h from frame/include to frame/base/bli_mem.h. + - Renamed/relocated bli_auxinfo_macro_defs.h from frame/include to + frame/base/bli_auxinfo.h. + - Fixed a minor bug whereby the storage-to-ukr-preference matching + optimization in the various level-3 front-ends was not being applied + properly when the context indicated that execution would be via an + induced method.
(Before, we always checked the native micro-kernel + corresponding to the datatype being executed, whereas now we check + the native micro-kernel corresponding to the datatype's real projection, + since that is the micro-kernel that is actually used by induced methods.) + - Added an option to the testsuite to skip the testing of native level-3 + complex implementations. Previously, it was always tested, provided that + the c/z datatypes were enabled. However, some configurations use + reference micro-kernels for complex datatypes, and testing these + implementations can slow down the testsuite considerably. + +commit 73517f522b69de429dd7f3df60a70c068149ab28 +Merge: c6f5c21 50293da +Author: Field G. Van Zee +Date: Tue Aug 23 13:46:59 2016 -0500 + + Merge branch 'master' into compose + +commit 50293da38d5f2b7be9bbc94b9e85aacb6a10f672 +Author: Field G. Van Zee +Date: Tue Aug 23 13:38:36 2016 -0500 + + Avoid compiling BLAS/CBLAS files when disabled. + + Details: + - Updated the top-level Makefile, build/config.mk.in template, and + configure script so that object files corresponding to source files + belonging to the BLAS compatibility layer are not compiled (or archived) + when the compatibility layer is disabled. (Same for CBLAS.) Thanks + to Devin Matthews for suggesting this optimization. + - Slight change to the way configure handles internal variables. Instead + of converting (overwriting) some, such as enable_blas2blis and + enable_cblas, from a "yes" or "no" to a "1" or "0" value, the latter are + now stored in new variables that live alongside the originals (with the + suffix "_01"). This is convenient since some values need to be + sed-substituted into the config.mk.in template, which requires "yes" or + "no", while some need to be written to the bli_config.h.in template, + which requires "0" or "1". + +commit c6f5c215ee793d03ea834469fc2adc53feaffc42 +Merge: d52cb76 16a4c7a +Author: Field G.
Van Zee +Date: Mon Aug 22 17:33:02 2016 -0500 + + Merge branch 'master' into compose + +commit 16a4c7a823d60707ed9272f5d36e5c5d54c0ba4b +Author: Field G. Van Zee +Date: Fri Aug 19 11:38:36 2016 -0500 + + Fixed bugs in bli_mutex_init() and friends. + + Details: + - Fixed a couple of bugs that affected OpenMP and POSIX threads + configurations that resulted in compiler errors and warnings due + to type mismatch, and in the case of pthreads, a missing function + argument. The bugs are fairly recent, introduced in a017062. + +commit d52cb7671509592a8078729477b40b60380518a2 +Merge: 95abea4 c31b1e7 +Author: Field G. Van Zee +Date: Wed Jul 27 16:04:55 2016 -0500 + + Merge branch 'master' into compose + +commit c31b1e7b9d659b96433a87e5aecb90e457a104cc +Author: Field G. Van Zee +Date: Wed Jul 27 15:58:07 2016 -0500 + + Relax alignment restrictions for sandybridge ukrs. + + Details: + - Relaxed the base pointer and leading dimension alignment restrictions + in the sandybridge gemm microkernels, allowing the use of vmovups/vmovupd + instead of vmovaps/vmovapd. These changes mimic those made to the haswell + microkernels in e0d2fa0 and ee2c139. + - Updated testsuite modules as well as standalone test drivers in 'test' + directory to use DBL_MAX as the initial time candidate. Thanks to Devin + Matthews for suggesting this change. + - Inserted #include "float.h" into bli_system.h (to gain access to DBL_MAX). + - Minor update (vis-a-vis contexts) to driver code in test/3m4m. + +commit 95abea46f86816fddfc9ff0abfa52880801461be +Merge: d0dfe5b a017062 +Author: Field G. Van Zee +Date: Sat Jul 23 15:38:33 2016 -0500 + + Merge branch 'master' into compose + +commit a017062fdf763037da9d971a028bb07d47aa1c8a +Author: Field G. Van Zee +Date: Fri Jul 22 17:02:59 2016 -0500 + + Integrated "memory broker" (membrk_t) abstraction. + + Details: + - Integrated a patch originally authored and submitted by Ricardo Magana + of HP Enterprise.
The changeset inserts use of a new object type, membrk_t, + (memory broker) that allows multiple sets of memory pools on, for example, + separate NUMA nodes, each of which has a separate memory space. + - Added membrk field to cntx_t and defined corresponding accessor macros. + - Added membrk field to mem_t object and defined corresponding accessor macros. + - Created new bli_membrk.c file, which contains the new memory broker API, + including: + bli_membrk_init(), bli_membrk_finalize() + bli_membrk_acquire_[mv](), bli_membrk_release(), + bli_membrk_init_pools(), bli_membrk_reinit_pools(), + bli_membrk_finalize_pools(), + bli_membrk_pool_size() + - In bli_mem.c, changed function calls to + bli_mem_init_pools() -> bli_membrk_init() + bli_mem_reinit_pools() -> bli_membrk_reinit() + bli_mem_finalize_pools() -> bli_membrk_finalize() + - In bli_packv_init.c, bli_packm_init.c, changed function calls to: + bli_mem_acquire_[mv]() -> bli_membrk_acquire_[mv]() + bli_mem_release() -> bli_membrk_release() + - Added bli_mutex.c and related files to frame/thread. These files define + abstract mutexes (locks) and corresponding APIs for pthreads, openmp, or + single-threaded execution. This new API is employed within functions + such as bli_membrk_acquire_[mv]() and bli_membrk_release(). + +commit ce59f81108ec9aea918a7e77030da8acfdd397ce +Merge: ff41153 707a2b7 +Author: Field G. Van Zee +Date: Fri Jul 22 14:48:14 2016 -0500 + + Merge pull request #88 from devinamatthews/32bit-dim_t + + Handle 32-bit dim_t in 64-bit microkernels. + +commit 707a2b7faca137cca7cab7b11a12c44ddaf7ad53 +Author: Devin Matthews +Date: Fri Jul 22 13:49:44 2016 -0500 + + Somehow forgot the most important microkernel. 
+ +commit 47ec045056351ac4f0791c071fa0daaa81699c8c +Merge: 08f1d6b ff41153 +Author: Devin Matthews +Date: Fri Jul 22 13:45:23 2016 -0500 + + Merge remote-tracking branch 'upstream/master' into 32bit-dim_t + +commit 08f1d6b6fa344275de0f675f69737145ccf6646a +Author: Devin Matthews +Date: Fri Jul 22 13:44:37 2016 -0500 + + Use 64-bit intermediate variable for k for architectures that do 64-bit loads in case dim_t is 32-bit. + +commit ff41153f4eb7f38ed94bdd9a3fd81fb979f3f401 +Merge: f9214ce e0d2fa0 +Author: Field G. Van Zee +Date: Fri Jul 22 13:21:03 2016 -0500 + + Merge pull request #86 from devinamatthews/haswell-vmovups + + Remove alignment restrictions on C in haswell kernel. + +commit e0d2fa0d835ab49366aeb790363bb2b571d36ed8 +Author: Devin Matthews +Date: Fri Jul 22 12:56:51 2016 -0500 + + Relax alignment restrictions for haswell sgemm. + +commit f9214ced97392861f5a0ea72abfcf6f41faf674c +Merge: 413d62a 08666ea +Author: Field G. Van Zee +Date: Fri Jul 22 12:16:39 2016 -0500 + + Merge pull request #85 from devinamatthews/qopenmp + + Change -openmp to -fopenmp for icc. + +commit ee2c139df6ad53c6aec8a67ab23b3b1912e8d259 +Author: Devin Matthews +Date: Fri Jul 22 12:06:03 2016 -0500 + + Remove alignment restrictions on C in haswell kernel. + +commit 08666eaa20d8a31f2f92f944e5bfa7c1558c53e4 +Author: Devin Matthews +Date: Fri Jul 22 11:07:34 2016 -0500 + + Change -openmp to -fopenmp for icc. + +commit d0dfe5b5372cc7558ee9c4104b29f82eecc7ed61 +Merge: 31def12 413d62a +Author: Field G. Van Zee +Date: Thu Jul 14 11:01:06 2016 -0500 + + Merge branch 'master' into compose + +commit 413d62aca28edabba56605a9f87d5b715831e1db +Author: Field G. Van Zee +Date: Tue Jul 12 15:02:52 2016 -0500 + + README update (use official ACM TOMS links). + +commit dfa431f696db2df4065ea454df268a2e0bc02eac +Author: Field G. Van Zee +Date: Tue Jul 12 14:21:19 2016 -0500 + + README update (BLIS2 TOMS article now in-print). + +commit 31def12e2629f187e40f93f6bae9e26a6c2660e2 +Author: Field G. 
Van Zee +Date: Thu Jun 30 15:19:20 2016 -0500 + + First phase of control tree redesign. + + Details: + - These changes constitute the first set of changes in preparation for + revamping the structure and use of control trees in BLIS. Modifications + in this commit don't affect the control tree code yet, but rather lay + the groundwork. + - Defined wrappers for the following functions, where the wrappers + each take a direction parameter of a new enumerated type (BLIS_BWD or + BLIS_FWD), dir_t, and execute the correct underlying function. + - bli_acquire_mpart_*() and _vpart_*() + - bli_*_determine_kc_[fb]() + - bli_thread_get_range_*() and bli_thread_get_range_weighted_*() + - Consolidated all 'f' (forwards-moving) and 'b' (backwards-moving) + blocked variants for trmm and trsm, and renamed gemm and herk variants + accordingly. The direction is now queried via routines such as + bli_trmm_direct(), which determines the direction from the implied side + and uplo parameters. For gemm and herk, it is unconditionally BLIS_FWD. + - Defined wrappers to parameter-specific macrokernels for herk, trmm, and + trsm, e.g. bli_trmm_xx_ker_var2(), that execute the correct underlying + macrokernel based on the implied parameters. The same logic used to + choose the dir_t in _direct() functions is used here. + - Simplified the function pointer arrays in _int() functions given the + consolidation and dir_t querying mentioned above. + - Function signature (whitespace) reformatting for various functions. + - Removed old code in various 'old' directories. + +commit 232754feecf29452987666b9f5ebba2619bfd0b0 +Author: Field G. Van Zee +Date: Tue Jun 21 14:25:39 2016 -0500 + + Fixed compiler warning in rand[vm], randn[vm]. + + Details: + - Fixed compiler warnings about unused variables related to the disabling + of normalization in the structured cases of the rand[vm] and randn[vm] + operations. + +commit a89555d1605574f3685813dcc972b636dd61264d +Author: Field G.
Van Zee +Date: Fri Jun 17 14:08:35 2016 -0500 + + Added randn[vm] operations, support in testsuite. + + Details: + - Defined a new randomization operation, randn, on vectors and matrices. + The randnv and randnm operations randomize each element of the target + object with values from a narrow range of values. Presently, those + values are all integer powers of two, but they do not need to be powers + of two in order to achieve the primary goal, which is to initialize + objects that can be operated on with plenty of precision "slack" + available to allow computations that avoid roundoff. Using this method + of randomization makes it much more likely that testsuite residuals of + properly-functioning operations are close to zero, if not exactly zero. + - Updated existing randomization operations randv and randm to skip + special diagonal handling and normalization for matrices with structure. + This is now handled by the testsuite modules by explicitly calling a + testsuite function that loads the diagonal (and scales off-diagonal + elements). + - Added support for randnv and randnm in the testsuite with a new switch + in input.general that universally toggles between use of the classic + randv/randm, which use real values on the interval [-1,1], and + randnv/randnm, which use only values from a narrow range. Currently, + the narrow range is: +/-{2^0, 2^-1, 2^-2, 2^-3, 2^-4, 2^-5, 2^-6}, as + well as 0.0. + - Updated testsuite modules so that a testsuite wrapper function is called + instead of directly calling the randomization operations (such as + bli_randv() and bli_randm()). This wrapper also takes a bool_t that + indicates whether the object's elements should be normalized. (NOTE: As + alluded to above, in the test modules of triangular solve operations such + as trsv and trsm, we perform the extra step of loading the diagonal.) + - Defined a new level-0 operation, invertsc, which inverts a scalar.
+ - Updated the abval2ris and sqrt2ris level-0 macros to avoid an unlikely + but possible divide-by-zero. + - Updated function signature and prototype formatting in testsuite. + +commit 096895c5d538a7f8817603d7cf28c52e99340def +Author: Field G. Van Zee +Date: Mon Jun 6 13:32:04 2016 -0500 + + Reorganized code, APIs related to multithreading. + + Details: + - Reorganized code and renamed files defining APIs related to multithreading. + All code that is not specific to a particular operation is now located in a + new directory: frame/thread. Code is now organized, roughly, by the + namespace to which it belongs (see below). + - Consolidated all operation-specific *_thrinfo_t object types into a single + thrinfo_t object type. Operation-specific level-3 *_thrinfo_t APIs were + also consolidated, leaving bli_l3_thrinfo_*() and bli_packm_thrinfo_*() + functions (aside from a few general purpose bli_thrinfo_*() functions). + - Renamed thread_comm_t object type to thrcomm_t. + - Renamed many of the routines and functions (and macros) for multithreading. + We now have the following API namespaces: + - bli_thrinfo_*(): functions related to thrinfo_t objects + - bli_thrcomm_*(): functions related to thrcomm_t objects. + - bli_thread_*(): general-purpose functions, such as initialization, + finalization, and computing ranges. (For now, some macros, such as + bli_thread_[io]broadcast() and bli_thread_[io]barrier() use the + bli_thread_ namespace prefix, even though bli_thrinfo_ may be more + appropriate.) + - Renamed thread-related macros so that they use a bli_ prefix. + - Renamed control tree-related macros so that they use a bli_ prefix (to be + consistent with the thread-related macros that were also renamed). + - Removed #undef BLIS_SIMD_ALIGN_SIZE from dunnington's bli_kernel.h. This + #undef was a temporary fix to some macro defaults which were being applied + in the wrong order, which was recently fixed. 
+ +commit 232530e88ff99f37abcae5b6fb5319a9a375a45f +Merge: 4bcabd1 eef37f8 +Author: Tyler Michael Smith +Date: Wed Jun 1 15:14:10 2016 -0500 + + Merge commit 'refs/pull/81/head' of https://github.com/flame/blis + + Conflicts: + frame/base/bli_threading_pthreads.c + frame/base/bli_threading_pthreads.h + +commit 4bcabd1bf60688c38cf562459fc5e8be8b831756 +Author: Tyler Michael Smith +Date: Wed Jun 1 13:27:28 2016 -0500 + + Use spin locks instead of pthread barriers + +commit eef37f8b4d81845a6ba4bf25586d32b50c3e8a68 +Author: Jeff Hammond +Date: Sun May 29 22:28:13 2016 -0700 + + use GCC intrinsic instead of pthread_mutex for atomic increment and fetch + +commit 9dcd6f05c4c3ff2ce7cd87a9951a96ebef22681e +Author: Field G. Van Zee +Date: Tue May 24 13:15:32 2016 -0500 + + Implemented developer-configurable malloc()/free(). + + Details: + - Replaced all instances of bli_malloc() and bli_free() with one of: + - bli_malloc_pool()/bli_free_pool() + - bli_malloc_user()/bli_free_user() + - bli_malloc_intl()/bli_free_intl() + each of which can be configured to call malloc()/free() substitutes, + so long as the substitute functions have the same function type + signatures as malloc() and free() defined by C's stdlib.h. The _pool() + function is called when allocating blocks for the memory pools (used + for packing buffers, primarily), the _user() function is called when + obj_t's are created (via bli_obj_create() and friends), and the _intl() + function is called for internal use by BLIS, such as when creating + control tree nodes or temporary buffers for manipulating internal data + structures. Substitutes for any of the three types of bli_malloc() may + be specified by #defining the following pairs of cpp macros in + bli_kernel.h: + - BLIS_MALLOC_POOL/BLIS_FREE_POOL + - BLIS_MALLOC_USER/BLIS_FREE_USER + - BLIS_MALLOC_INTL/BLIS_FREE_INTL + to be the name of the substitute functions. (Obviously, the object + code that contains these functions must be provided at link-time.) 
+ These macros default to malloc() and free(). Substitute functions are + also automatically prototyped by BLIS (in bli_malloc_prototypes.h). + - Removed definitions for bli_malloc() and bli_free(). + - Note that bli_malloc_pool() and bli_malloc_user() are now defined in + terms of a new function, bli_malloc_align(), which aligns memory to an + arbitrary (power of two) alignment boundary, but does so manually, + whereas before alignment was performed behind the scenes by + posix_memalign(). Currently, bli_malloc_intl() is defined in terms + of bli_malloc_noalign(), which serves as a simple wrapper to the + designated function that is passed in (e.g. BLIS_MALLOC_INTL). + Similarly, there are bli_free_align() and bli_free_noalign(), which + are used in concert with their bli_malloc_*() counterparts. + +commit 9dd440109a9d964f5cd286e9f83c487ad703e1e4 +Author: Jeff Hammond +Date: Sat May 21 15:21:58 2016 -0700 + + fix 404 link to BuildSystem + + Google Code is dead. Long live GitHub! + +commit d309f20b7376a68efa3b864ad790c2021c071655 +Author: Field G. Van Zee +Date: Wed May 18 15:13:53 2016 -0500 + + Added alignment switch to testsuite. + + Details: + - Added a new input parameter to input.general that globally toggles + whether testsuite tests are performed on objects whose buffers and + leading dimensions have been aligned, and changed the implementation + of libblis_test_mobj_create() to employ alignment (or not) regardless + of whether row, column, or general storage is being tested. + - Updated configure script's "--help" text to indicate default behavior + for internal integer type size and BLAS/CBLAS integer type size + options. + +commit 32db0adc218ea4ae370164dbe8d23b41cd3526d3 +Author: Field G. Van Zee +Date: Tue May 17 15:20:16 2016 -0500 + + Generate prototypes for user-defined packm kernels.
+ + Details: + - Created template prototypes for packm kernels (in bli_l1m_ker.h), and + then redefined reference packm kernels' prototyping headers in terms of + this template, as is already done for level-1v, -1f, and -3 kernels. + - Automatically generate prototypes for user-defined packm kernels in + bli_kernel_prototypes.h (using the new template prototypes in + bli_l1m_ker.h). + - Defined packm kernel function types in bli_l1m_ft.h, including for + packm kernels specific to induced methods, which are now used in + bli_packm_cxk.c and friends rather than using a locally-defined + function type. + - In bli_packm_cxk.c, extended the function pointer array for packm kernels + out to index 31 (from previous maximum of 17). This allows us to + store the unrolled 30xk kernel in the array for use (on knc, for + example). Note: This should have been done a long time ago. + +commit 4bcf1b35abea3f3dfc8f2fe462dcf155cf199e55 +Author: Field G. Van Zee +Date: Wed May 11 16:09:49 2016 -0500 + + Fixed bli_get_range_*() bugs in trsm variants. + + Details: + - Fixed incorrect calls to bli_get_range_*() from within trsm blocked + variants 1f, 2b, and 2f. The bug somehow went undetected since the + big commit (537a1f4), and, strangely, did not manifest via the BLIS + testsuite. The bug finally came to our attention when running the + libflame test suite while linking to BLIS. Thanks to Kiran Varaganti + for submitting the initial report that led to this bug fix. + +commit 9cfa33023f123a6c17e987f72fba174ce073f0b6 +Author: Field G. Van Zee +Date: Wed May 11 16:02:30 2016 -0500 + + Minor updates to bli_f2c.h. + + Details: + - Added #undef guards to certain #define statements in bli_f2c.h, + and renamed the file guard to BLIS_F2C_H. This helps when + #including "blis.h" from an application or library that already + #includes an "f2c.h" header.
+ +commit a09a2e23eacf5328858c8318bb637c5ff3b71d08 +Merge: 4dcd37e 7c604e1 +Author: Tyler Michael Smith +Date: Wed May 11 10:47:11 2016 -0500 + + Merge pull request #76 from devinamatthews/move_simd_defs + + Move default SIMD-related definitions to bli_kernel_macro_defs.h + +commit 4dcd37eb1b12a6e08cc13df7b61391ef8363f5d8 +Author: Tyler Smith +Date: Tue May 10 16:28:59 2016 -0500 + + fixing knc simd align size + +commit 7c604e1cbc1609b6e12d3ee973c08b7af5035be4 +Author: Devin Matthews +Date: Tue May 10 12:11:55 2016 -0500 + + Move default SIMD-related definitions to bli_kernel_macro_defs.h. Otherwise, configurations which customize these fail as these are now defined in bli_kernel.h. + +commit a7be2d28e8930b154d0da1d6929b54a96e210af6 +Merge: 97b512e 4b1e55e +Author: Field G. Van Zee +Date: Tue May 10 11:48:51 2016 -0500 + + Merge pull request #74 from devinamatthews/fix_common_symbols + + Default-initialize all extern global variables to avoid generating common symbols. + +commit 4b1e55edbfe0e1cb2e7b9428424903497cb7a841 +Author: Devin Matthews +Date: Tue May 10 10:08:47 2016 -0500 + + Default-initialize all extern global variables to avoid generating common symbols. Fixes #73. + +commit 97b512ef62c7e25c97ed5e9eca81cd7015b2ac91 +Author: Field G. Van Zee +Date: Fri May 6 10:24:30 2016 -0500 + + Include headers from cblas.h to pull in f77_int. + + Details: + - Added #include statements for certain key BLIS headers so that the + definition of f77_int is pulled in when a user compiles application + code with only #include "cblas.h" (and no other BLIS header). This + is necessary since f77_int is now used within the cblas API. + +commit c3a4d39d03665135f1616588b5ef7c3e9ef5688d +Author: Field G. Van Zee +Date: Wed May 4 17:22:56 2016 -0500 + + Updates to haswell gemm micro-kernels. + + Details: + - Added two new sets of [sd]gemm micro-kernels for haswell architectures, + one that is 4x24/4x12 (s and d) and one that is 6x16/6x8. 
+ - Changed the haswell configuration to use the 6x16/6x8 micro-kernels + by default. + - Updated various Makefiles, in test, test/3m4m, and testsuite. + +commit 0b01d355ae861754ae2da6c9a545474af010f02e +Author: Field G. Van Zee +Date: Wed Apr 27 15:21:10 2016 -0500 + + Miscellaneous cleanups, fixes to recent commits. + + Details: + - Fixed a typo in bli_l1f_ref.h, introduced into bbb8569, that only + manifested when non-reference level-1f kernels were used. + - Added an #undef BLIS_SIMD_ALIGN_SIZE to bli_kernel.h of dunnington + configuration to prevent a compile-time warning until I can figure out + the proper permanent fix. + - Moved frame/1f/kernels/bli_dotxaxpyf_ref_var1.c out of the compilation + path (into 'other' directory). _ref_var2 is used by default, which is + the variant that is built on axpyf and dotxf instead of dotaxpyv. + - Removed section of frame/include/bli_config_macro_defs.h pertaining to + mixed datatype support. + +commit ed7326c836f427e2f8420b015220ce293207b10c +Author: Field G. Van Zee +Date: Wed Apr 27 14:57:40 2016 -0500 + + Added 'restrict' to l1v/l1f code in 'kernels' dir. + + Details: + - Added 'restrict' keyword to existing kernel definitions in 'kernels' + directory. These changes were meant for inclusion in bbb8569. + +commit bbb8569b2a08c3bcd631d5a05eb389d01d94ac07 +Author: Field G. Van Zee +Date: Wed Apr 27 14:13:46 2016 -0500 + + Use 'restrict' in all kernel APIs; wspace changes. + + Details: + - Updated level-1v, level-1f kernel function types (bli_l1?_ft.h) and + generic kernel prototypes (bli_l1?_ker.h) to use 'restrict' for all + numerical operand pointers (ie: all pointers except the cntx_t). + - Updated level-1f reference kernel definitions to use 'restrict' for + all numerical operand pointers. (Level-1v reference kernel definitions + were already updated in bdbda6e.) 
+ - Rewrote the level-1v and level-1f reference kernel prototypes in + bli_l1v_ref.h and bli_l1f_ref.h, respectively, to simply #include + bli_l1v_ker.h and bli_l1f_ker.h with redefined function base names + (as was already being done for the level-3 micro-kernel prototypes + in bli_l3_ref.h), rather than duplicate the signatures from the + _ker.h files. + - Added definitions to frame/include/bli_kernel_prototypes.h for axpbyv + and xpbyv, which were probably meant for inclusion in bdbda6e. + - Converted a number of instances of four spaces, as introduced in + bdbda6e, to tabs. + +commit 4ea419c72c789825e1f93a1eee88219bbf873930 +Merge: f1e9be2 bdbda6e +Author: Field G. Van Zee +Date: Tue Apr 26 12:50:45 2016 -0500 + + Merge pull request #70 from devinamatthews/daxpby + + Give the level1v operations some love + +commit bdbda6e6acc682ab1b6ca680edebd09ae12a832c +Author: Devin Matthews +Date: Mon Apr 25 11:05:57 2016 -0500 + + Give the level1v operations some love: + + - Add missing axpby and xpby operations (plus test cases). + - Add special case for scal2v with alpha=1. + - Add restrict qualifiers. + - Add special-case algorithms for incx=incy=1. + +commit f1e9be2aba1a057eedb947bbae96848597777408 +Author: Field G. Van Zee +Date: Fri Apr 22 15:34:02 2016 -0500 + + Minor tweak to test/Makefile. + + Details: + - Just committing a minor change to test/Makefile that has been lingering + in my local working copy for longer than I can remember. + +commit aa0bceec277938328dabeb744680623f24fb0b61 +Merge: 4136553 e2784b4 +Author: Field G. Van Zee +Date: Fri Apr 22 12:01:31 2016 -0500 + + Merge branch 'master' of github.com:flame/blis + +commit 4136553f0d0661a668dfdb9edcd7ce1c5773dde7 +Author: Field G. Van Zee +Date: Fri Apr 22 11:53:53 2016 -0500 + + Clear level-3 cntx_t's via memset() before use. 
+ + Details: + - In all level-3 operations' _cntx_init() functions, replaced calls to + bli_cntx_obj_init() with calls to bli_cntx_obj_clear(), and in all + level-3 operations' _cntx_finalize() functions, removed calls to + bli_cntx_obj_finalize(), leaving those function definitions empty. + - Changed the definition of bli_cntx_obj_clear() so that the clearing + occurs via a single call to memset(). + +commit e2784b4c921f706e756df3e146e20a4cb63f53e3 +Merge: dd0ab1d a9b6c3a +Author: Field G. Van Zee +Date: Wed Apr 20 18:34:09 2016 -0500 + + Merge pull request #67 from devinamatthews/cblas-f77-int + + Change CBLAS integer type to f77_int + +commit a9b6c3abda6222a8b240361643932e83cf726c4f +Merge: e4c54c8 dd0ab1d +Author: Devin Matthews +Date: Wed Apr 20 16:00:10 2016 -0500 + + Merge remote-tracking branch 'origin/master' into cblas-f77-int + + # Conflicts: + # config/haswell/bli_config.h + +commit e4c54c81463c2a19c9bb6b1f0f1be3fa9d018a45 +Author: Devin Matthews +Date: Wed Apr 20 15:56:46 2016 -0500 + + Change integer type in CBLAS function signatures to f77_int, and add proper const-correctness to BLAS layer. + +commit dd0ab1d93f33abca6af9edd7b8e52da62dcfa5b1 +Author: Field G. Van Zee +Date: Wed Apr 20 14:38:23 2016 -0500 + + Converted some bli_cntx query functions to macros. + + Details: + - Commented out several datatype-aware query functions (those ending in + _dt) from bli_cntx.c, as well as their prototypes in bli_cntx.h, and + added equivalent cpp query macros to bli_cntx.h. + - Added 'bli_config.h' to .gitignore. + +commit a30ccbc4c6a6e6460e78af6b5c530ee0d06f98fb +Merge: eb2f18e 0e1a982 +Author: Field G. Van Zee +Date: Tue Apr 19 15:04:33 2016 -0500 + + Merge pull request #66 from devinamatthews/blas-configure + + Add configure options and generate bli_config.h automatically. + +commit eb2f18e4844d985715df20798f50f9cc12e3b5ad +Author: Field G. Van Zee +Date: Tue Apr 19 12:50:32 2016 -0500 + + More compile-time fixes to bgq gemm ukernel code. 
+ +commit 0e1a9821d860f6c1d818baf4c48d21a23726c132 +Author: Devin Matthews +Date: Tue Apr 19 11:44:37 2016 -0500 + + Add configure options and generate bli_config.h automatically. + + Options to configure have been added for: + - Setting the internal BLIS and BLAS/CBLAS integer sizes. + - Enabling and disabling the BLAS and CBLAS layers. + + Additionally, configure options which require defining macros (the above plus the threading model), write their macros to the automatically-generated bli_config.h file in the top-level build directory. The old bli_config.h files in the config dirs were removed, and any kernel-related macros (SIMD size and alignment etc.) were moved to bli_kernel.h. The Makefiles were also modified to find the new bli_config.h file. + + Lastly, support for OMP in clang has been added (closes #56). + +commit ff84469a4575f1ef8a0010046fde52240a312cae +Author: Field G. Van Zee +Date: Mon Apr 18 12:29:09 2016 -0500 + + Applied various compilation fixes to bgq kernels. + +commit cbcd0b739dc54bd14fbb46aeda267c26725cd70f +Author: Tyler Michael Smith +Date: Mon Apr 18 03:12:57 2016 -0500 + + Changing ifdef for OSX pthread barriers + +commit dd62080cea78f3a23616200d6640e52c102b2bb9 +Author: Field G. Van Zee +Date: Fri Apr 15 11:15:41 2016 -0500 + + Compile-time fix to bgq l1f kernels. + + Details: + - Fixed an old reference to bli_daxpyf_fusefac, which no longer exists, + by replacing it with the axpyf fusing factor (8), and cleaned up the + relevant section of config/bgq/bli_kernel.h. + - Removed most of the details of the level-3 kernels from the template + kernel code in config/template/kernels/3 and replaced it with a + reference to the relevant kernel wiki maintained on the BLIS github + website. + +commit d5a915dd8d7a6ead42a68772e4420eb3647e6f1a +Merge: 4320b72 4169467 +Author: Field G. Van Zee +Date: Thu Apr 14 12:56:36 2016 -0500 + + Merge branch 'master' of github.com:flame/blis + +commit 4320b725a1f8fd34101470b6cf52ad504a79c517 +Author: Field G. 
Van Zee +Date: Thu Apr 14 12:51:29 2016 -0500 + + Use kernel CFLAGS on "ukernels" directories. + + Details: + - Updated the top-level Makefile so that the CFLAGS variable designated + for kernel source code is applied not only to source code in + directories named "kernels" but source code in any directory that + contains the substring "kernels", such as "ukernels". + - Formally disabled some code in gen-make-frag.sh script that was already + effectively disabled. The code was related to handling "noopt" and + "kernel" directories, which is now handled independently within the + top-level Makefile without needing to place these source files into + a separate makefile variable. + +commit 41694675e4cb56e2e0323c7a7db48e0819606a31 +Author: Tyler Smith +Date: Wed Apr 13 15:51:08 2016 -0500 + + pthreads bugfixes + + Getting pthreads to work on my Mac + Implemented a pthread barrier when _POSIX_BARRIER isn't defined + Now spawn n-1 threads instead of n threads so that master thread isn't just spinning the whole time + Add -lpthread instead of -pthread to LDFLAGS (for clang) + +commit f756dbfa0d542cbc497724981520c83abf049c4b +Author: Field G. Van Zee +Date: Wed Apr 13 11:25:33 2016 -0500 + + Removed stale #include from bgq configuration. + + Details: + - Removed an old #include statement ("bli_gemm_8x8.h") from the + bli_kernel.h file in the bgq configuration. It turns out this + file was no longer needed even prior to 537a1f4. + +commit 0bd4169ea75f690714e7d2912229932a75d8a7e2 +Author: Field G. Van Zee +Date: Mon Apr 11 18:08:32 2016 -0500 + + Fixed context-broken dunnington/penryn kernels. + + Details: + - Added missing context parameters to several instances where simpler + kernels, or reference kernels, are called instead of executing the + main body code contained in the kernel function in question. + - Renamed axpyv and dotv kernel files to use "opt" instead of "int" + substring, for consistency with level-1f kernels. 
+ +commit 7912af5db45b7372d19a9a3dfeb82df302a05628 +Author: Field G. Van Zee +Date: Mon Apr 11 17:32:13 2016 -0500 + + CHANGELOG update (0.2.0) + +commit 898614a555ea0aa7de4ca07bb3cb8f5708b6a002 (tag: 0.2.0) Author: Field G. Van Zee Date: Mon Apr 11 17:32:09 2016 -0500 @@ -132,7 +1182,7 @@ Date: Mon Apr 11 17:21:28 2016 -0500 that this does not preclude supporting mixed types via the object APIs, where it produces absolutely zero API code bloat. -commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 (origin/master) +commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 Merge: 20af937 c11d28e Author: Field G. Van Zee Date: Tue Apr 5 12:21:27 2016 -0500 @@ -2384,8 +3434,8 @@ Date: Wed Aug 20 14:44:51 2014 -0500 Merge branch 'master' of http://github.com/flame/blis Conflicts: - frame/3/trsm/bli_trsm_blk_var2b.c - frame/3/trsm/bli_trsm_blk_var2f.c + frame/3/trsm/bli_trsm_blk_var2b.c + frame/3/trsm/bli_trsm_blk_var2f.c commit 699a8151ca3d5021e834a1784ef45dcc3a3d17cd Author: Tyler Smith @@ -3492,8 +4542,8 @@ Date: Fri Apr 4 09:54:54 2014 -0500 Merge http://github.com/flame/blis Conflicts: - kernels/bgq/1/bli_axpyv_opt_var1.c - kernels/bgq/1/bli_dotv_opt_var1.c + kernels/bgq/1/bli_axpyv_opt_var1.c + kernels/bgq/1/bli_dotv_opt_var1.c commit 4e3eb39aca4df0b9fdc003d468f368a2f2ba597d Author: Tyler Michael Smith @@ -3793,7 +4843,7 @@ Date: Thu Feb 27 16:46:23 2014 -0600 Merge https://github.com/flame/blis Conflicts: - frame/1m/packm/bli_packm_blk_var1.c + frame/1m/packm/bli_packm_blk_var1.c commit e8757b03a74f9891632242e9a90efb32150826f5 Author: Field G. 
Van Zee diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c index 1c1265661..2287a7222 100644 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ b/frame/1m/packm/bli_packm_thrinfo.c @@ -34,12 +34,11 @@ #include "blis.h" +#if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -51,7 +50,6 @@ thrinfo_t* bli_packm_thrinfo_create ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, FALSE, @@ -60,14 +58,13 @@ thrinfo_t* bli_packm_thrinfo_create return thread; } +#endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -77,7 +74,6 @@ void bli_packm_thrinfo_init ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, FALSE, sub_node @@ -93,13 +89,13 @@ void bli_packm_thrinfo_init_single ( thread, &BLIS_SINGLE_COMM, 0, - &BLIS_SINGLE_COMM, 0, 1, 0, NULL ); } +#if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread @@ -109,4 +105,4 @@ void bli_packm_thrinfo_free thread != &BLIS_PACKM_SINGLE_THREADED ) bli_free_intl( thread ); } - +#endif diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 7b6d7ae4d..5da496f96 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -42,24 +42,22 @@ // thrinfo_t APIs specific to packm. 
// +#if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); +#endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -70,8 +68,10 @@ void bli_packm_thrinfo_init_single thrinfo_t* thread ); +#if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); +#endif diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 36b65b52b..78b2b775c 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -35,12 +35,11 @@ #include "blis.h" #include "assert.h" +#if 0 thrinfo_t* bli_l3_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -49,21 +48,19 @@ thrinfo_t* bli_l3_thrinfo_create return bli_thrinfo_create ( ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, TRUE, sub_node ); } +#endif void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -73,7 +70,6 @@ void bli_l3_thrinfo_init ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, TRUE, @@ -105,14 +101,12 @@ void bli_l3_thrinfo_free // is marked as needing them to be freed. The most common example of // thrinfo_t nodes NOT marked as needing their comms freed are those // associated with packm thrinfo_t nodes. - if ( bli_thrinfo_needs_free_comms( thread ) ) + if ( bli_thrinfo_needs_free_comm( thread ) ) { // The ochief always frees his communicator, and the ichief free its // communicator if we are at the leaf node. if ( bli_thread_am_ochief( thread ) ) bli_thrcomm_free( bli_thrinfo_ocomm( thread ) ); - if ( thrinfo_sub_node == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( bli_thrinfo_icomm( thread ) ); } // Free all children of the current thrinfo_t. 
@@ -124,117 +118,208 @@ void bli_l3_thrinfo_free // ----------------------------------------------------------------------------- -//#define PRINT_THRINFO - -thrinfo_t** bli_l3_thrinfo_create_paths +void bli_l3_thrinfo_create_root ( - opid_t l3_op, - side_t side + dim_t id, + thrcomm_t* gl_comm, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ) { - dim_t jc_in, jc_way; - dim_t kc_in, kc_way; - dim_t ic_in, ic_way; - dim_t jr_in, jr_way; - dim_t ir_in, ir_way; + // Query the global communicator for the total number of threads to use. + dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); -#ifdef BLIS_ENABLE_MULTITHREADING - jc_in = bli_env_read_nway( "BLIS_JC_NT" ); - //kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - kc_in = 1; - ic_in = bli_env_read_nway( "BLIS_IC_NT" ); - jr_in = bli_env_read_nway( "BLIS_JR_NT" ); - ir_in = bli_env_read_nway( "BLIS_IR_NT" ); -#else - jc_in = 1; - kc_in = 1; - ic_in = 1; - jr_in = 1; - ir_in = 1; -#endif + // Use the thread id passed in as the global communicator id. + dim_t gl_comm_id = id; - if ( l3_op == BLIS_TRMM ) - { - // We reconfigure the parallelism for trmm_r due to a dependency in - // the jc loop. (NOTE: This dependency does not exist for trmm3.) - if ( bli_is_right( side ) ) - { - jc_way = 1; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in * jc_in; - ir_way = ir_in; - } - else // if ( bli_is_left( side ) ) - { - jc_way = jc_in; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in; - ir_way = ir_in; - } - } - else if ( l3_op == BLIS_TRSM ) - { - if ( bli_is_right( side ) ) - { + // Use the blocksize id of the current (root) control tree node to + // query the top-most ways of parallelism to obtain. 
+ bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx ); - jc_way = 1; - kc_way = 1; - ic_way = jc_in * ic_in * jr_in; - jr_way = 1; - ir_way = 1; - } - else // if ( bli_is_left( side ) ) - { - jc_way = 1; - kc_way = 1; - ic_way = 1; - jr_way = ic_in * jr_in * ir_in; - ir_way = 1; - } - } - else // all other level-3 operations + // Determine the work id for this thrinfo_t node. + dim_t work_id = gl_comm_id / ( n_threads / xx_way ); + + // Create the root thrinfo_t node. + *thread = bli_thrinfo_create + ( + gl_comm, + gl_comm_id, + xx_way, + work_id, + TRUE, + NULL + ); +} + +// ----------------------------------------------------------------------------- + +void bli_l3_thrinfo_print_paths + ( + thrinfo_t** threads + ) +{ + dim_t n_threads = bli_thread_num_threads( threads[0] ); + dim_t gl_comm_id; + + thrinfo_t* jc_info = threads[0]; + thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info ); + thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info ); + thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info ); + thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info ); + thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info ); + thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info ); + + dim_t jc_way = bli_thread_n_way( jc_info ); + dim_t pc_way = bli_thread_n_way( pc_info ); + dim_t pb_way = bli_thread_n_way( pb_info ); + dim_t ic_way = bli_thread_n_way( ic_info ); + dim_t pa_way = bli_thread_n_way( pa_info ); + dim_t jr_way = bli_thread_n_way( jr_info ); + dim_t ir_way = bli_thread_n_way( ir_info ); + + dim_t gl_nt = bli_thread_num_threads( jc_info ); + dim_t jc_nt = bli_thread_num_threads( pc_info ); + dim_t pc_nt = bli_thread_num_threads( pb_info ); + dim_t pb_nt = bli_thread_num_threads( ic_info ); + dim_t ic_nt = bli_thread_num_threads( pa_info ); + dim_t pa_nt = bli_thread_num_threads( jr_info ); + dim_t jr_nt = bli_thread_num_threads( ir_info ); + + printf( " gl jc kc pb ic pa jr ir\n" ); + printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu 
%4lu %4lu\n", + gl_nt, jc_nt, pc_nt, pb_nt, ic_nt, pa_nt, jr_nt, (dim_t)1 ); + printf( "\n" ); + printf( " jc kc pb ic pa jr ir\n" ); + printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + jc_way, pc_way, pb_way, ic_way, pa_way, jr_way, ir_way ); + printf( "=================================================\n" ); + + for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id ) { - jc_way = jc_in; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in; - ir_way = ir_in; + jc_info = threads[gl_comm_id]; + pc_info = bli_thrinfo_sub_node( jc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + dim_t gl_comm_id = bli_thread_ocomm_id( jc_info ); + dim_t jc_comm_id = bli_thread_ocomm_id( pc_info ); + dim_t pc_comm_id = bli_thread_ocomm_id( pb_info ); + dim_t pb_comm_id = bli_thread_ocomm_id( ic_info ); + dim_t ic_comm_id = bli_thread_ocomm_id( pa_info ); + dim_t pa_comm_id = bli_thread_ocomm_id( jr_info ); + dim_t jr_comm_id = bli_thread_ocomm_id( ir_info ); + + dim_t jc_work_id = bli_thread_work_id( jc_info ); + dim_t pc_work_id = bli_thread_work_id( pc_info ); + dim_t pb_work_id = bli_thread_work_id( pb_info ); + dim_t ic_work_id = bli_thread_work_id( ic_info ); + dim_t pa_work_id = bli_thread_work_id( pa_info ); + dim_t jr_work_id = bli_thread_work_id( jr_info ); + dim_t ir_work_id = bli_thread_work_id( ir_info ); + +printf( " gl jc pb kc pa ic jr \n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id ); +printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n", +jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id ); +printf( "---------------------------------------\n" ); } +} - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( 
global_num_threads != 0 ); +// ----------------------------------------------------------------------------- - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; +#if 0 +thrinfo_t** bli_l3_thrinfo_create_roots + ( + cntx_t* cntx, + cntl_t* cntl + ) +{ + // Query the context for the total number of threads to use. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); + + // Create a global thread communicator for all the threads. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + + // Allocate an array of thrinfo_t pointers, one for each thread. + thrinfo_t** paths = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) ); + + // Use the blocksize id of the current (root) control tree node to + // query the top-most ways of parallelism to obtain. + bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx ); + + dim_t gl_comm_id; + + // Create one thrinfo_t node for each thread in the (global) communicator. + for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id ) + { + dim_t work_id = gl_comm_id / ( n_threads / xx_way ); + + paths[ gl_comm_id ] = bli_thrinfo_create + ( + gl_comm, + gl_comm_id, + xx_way, + work_id, + TRUE, + NULL + ); + } + + return paths; +} + +//#define PRINT_THRINFO + +thrinfo_t** bli_l3_thrinfo_create_full_paths + ( + cntx_t* cntx + ) +{ + dim_t jc_way = bli_cntx_jc_way( cntx ); + dim_t pc_way = bli_cntx_pc_way( cntx ); + dim_t ic_way = bli_cntx_ic_way( cntx ); + dim_t jr_way = bli_cntx_jr_way( cntx ); + dim_t ir_way = bli_cntx_ir_way( cntx ); + + dim_t gl_nt = jc_way * pc_way * ic_way * jr_way * ir_way; + dim_t jc_nt = pc_way * ic_way * jr_way * ir_way; + dim_t pc_nt = ic_way * jr_way * ir_way; dim_t ic_nt = jr_way * ir_way; dim_t jr_nt = ir_way; dim_t ir_nt = 1; + assert( gl_nt != 0 ); + #ifdef PRINT_THRINFO -printf( " jc kc ic jr ir\n" ); -printf( "xx_way: %4lu %4lu %4lu %4lu %4lu\n", - jc_way, kc_way, ic_way, jr_way, ir_way ); +printf( " gl jc kc pb ic pa jr 
ir\n" ); +printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_nt, jc_nt, pc_nt, pc_nt, ic_nt, ic_nt, jr_nt, ir_nt ); printf( "\n" ); -printf( " gl jc kc ic jr ir\n" ); -printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu\n", -global_num_threads, jc_nt, kc_nt, ic_nt, jr_nt, ir_nt ); -printf( "=======================================\n" ); +printf( " jc kc pb ic pa jr ir\n" ); +printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +jc_way, pc_way, (dim_t)0, ic_way, (dim_t)0, jr_way, ir_way ); +printf( "=================================================\n" ); #endif - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); + thrinfo_t** paths = bli_malloc_intl( gl_nt * sizeof( thrinfo_t* ) ); - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( gl_nt ); for( int a = 0; a < jc_way; a++ ) { thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) + for( int b = 0; b < pc_way; b++ ) { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); + thrcomm_t* pc_comm = bli_thrcomm_create( pc_nt ); for( int c = 0; c < ic_way; c++ ) { @@ -246,73 +331,83 @@ printf( "=======================================\n" ); for( int e = 0; e < ir_way; e++ ) { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; + //thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t pc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*pc_nt + pc_comm_id; + dim_t gl_comm_id = a*jc_nt + jc_comm_id; // macro-kernel loops thrinfo_t* ir_info = bli_l3_thrinfo_create( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, ir_way, e, NULL ); thrinfo_t* 
jr_info = bli_l3_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, jr_way, d, ir_info ); // packa - thrinfo_t* pack_ic_in + thrinfo_t* pa_info = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, ic_nt, ic_comm_id, jr_info ); // blk_var1 thrinfo_t* ic_info = - bli_l3_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, + bli_l3_thrinfo_create( pc_comm, pc_comm_id, ic_way, c, - pack_ic_in ); + pa_info ); // packb - thrinfo_t* pack_kc_in + thrinfo_t* pb_info = - bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id, + bli_packm_thrinfo_create( pc_comm, pc_comm_id, + pc_nt, pc_comm_id, ic_info ); // blk_var3 - thrinfo_t* kc_info + thrinfo_t* pc_info = bli_l3_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_in ); + pc_way, b, + pb_info ); // blk_var2 thrinfo_t* jc_info = - bli_l3_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, + bli_l3_thrinfo_create( gl_comm, gl_comm_id, jc_way, a, - kc_info ); + pc_info ); - paths[global_comm_id] = jc_info; + paths[gl_comm_id] = jc_info; #ifdef PRINT_THRINFO -printf( " gl jc kc ic jr ir\n" ); -printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu\n", -global_comm_id, jc_comm_id, kc_comm_id, ic_comm_id, jr_comm_id, ir_comm_id ); -//printf( " a b c d e\n" ); -printf( "work ids: %4ld %4ld %4ld %4ld %4ld\n", (long int)a, (long int)b, (long int)c, (long int)d, (long int)e ); -printf( "---------------------------------------\n" ); +{ +dim_t gl_comm_id = bli_thread_ocomm_id( jc_info ); +dim_t jc_comm_id = bli_thread_ocomm_id( pc_info ); +dim_t pc_comm_id = bli_thread_ocomm_id( pb_info ); +dim_t pb_comm_id = bli_thread_ocomm_id( ic_info ); +dim_t ic_comm_id = bli_thread_ocomm_id( pa_info ); +dim_t pa_comm_id = bli_thread_ocomm_id( jr_info ); +dim_t jr_comm_id = bli_thread_ocomm_id( ir_info ); + +dim_t jc_work_id = bli_thread_work_id( jc_info ); +dim_t pc_work_id = bli_thread_work_id( pc_info ); +dim_t pb_work_id = 
bli_thread_work_id( pb_info ); +dim_t ic_work_id = bli_thread_work_id( ic_info ); +dim_t pa_work_id = bli_thread_work_id( pa_info ); +dim_t jr_work_id = bli_thread_work_id( jr_info ); +dim_t ir_work_id = bli_thread_work_id( ir_info ); + +printf( " gl jc pb kc pa ic jr \n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id ); +printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n", +jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id ); +printf( "-------------------------------------------------\n" ); +} #endif } @@ -330,15 +425,16 @@ exit(1); void bli_l3_thrinfo_free_paths ( - thrinfo_t** threads, - dim_t num + thrinfo_t** threads ) { + dim_t n_threads = bli_thread_num_threads( threads[0] ); dim_t i; - for ( i = 0; i < num; ++i ) + for ( i = 0; i < n_threads; ++i ) bli_l3_thrinfo_free( threads[i] ); bli_free_intl( threads ); } +#endif diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 7eac72298..71dea7645 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -61,24 +61,22 @@ // thrinfo_t APIs specific to level-3 operations. 
// +#if 0 thrinfo_t* bli_l3_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); +#endif void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node @@ -96,15 +94,37 @@ void bli_l3_thrinfo_free // ----------------------------------------------------------------------------- -thrinfo_t** bli_l3_thrinfo_create_paths +void bli_l3_thrinfo_create_root ( - opid_t l3_op, - side_t side + dim_t id, + thrcomm_t* gl_comm, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread + ); + +void bli_l3_thrinfo_print_paths + ( + thrinfo_t** threads + ); + +// ----------------------------------------------------------------------------- + +#if 0 +thrinfo_t** bli_l3_thrinfo_create_roots + ( + cntx_t* cntx, + cntl_t* cntl + ); + +thrinfo_t** bli_l3_thrinfo_create_full_paths + ( + cntx_t* cntx ); void bli_l3_thrinfo_free_paths ( - thrinfo_t** threads, - dim_t num + thrinfo_t** threads ); +#endif diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 7be9c6a58..0148428df 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -84,10 +84,10 @@ void bli_gemm_blk_var3 c, cntx, bli_cntl_sub_node( cntl ), - bli_thrinfo_sub_node( thread) + bli_thrinfo_sub_node( thread ) ); - bli_thread_ibarrier( thread ); + bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. 
Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 3f3773418..b3494b174 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -46,14 +46,21 @@ cntl_t* bli_gemm_cntl_create if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; - // Create a node for the macro-kernel. - cntl_t* gemm_cntl_bp_ke = bli_gemm_cntl_obj_create + // Create two nodes for the macro-kernel. + cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create ( - BLIS_NR, // bszid not used by macro-kernel. - macro_kernel_p, + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); + cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + gemm_cntl_bu_ke + ); + // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create ( @@ -66,7 +73,7 @@ cntl_t* bli_gemm_cntl_create FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, - gemm_cntl_bp_ke + gemm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 0782d7272..324655655 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -85,13 +85,19 @@ void bli_gemm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx ); - // Invoke the internal back-end. 
+ // Create the first node in the thrinfo_t tree for each thread. +//thrinfo_t** infos = bli_l3_thrinfo_create_full_paths( cntx ); +//bli_l3_thrinfo_print_paths( infos ); +//exit(1); +//cntl = bli_gemm_cntl_create( BLIS_GEMM ); + //thrinfo_t** infos = bli_l3_thrinfo_create_roots( cntx, cntl ); + + // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -99,10 +105,12 @@ void bli_gemm_front beta, &c_local, cntx, - cntl, - infos + cntl ); +//bli_l3_thrinfo_print_paths( infos ); +//exit(1); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Free the thrinfo_t structures. + //bli_l3_thrinfo_free_paths( infos ); } diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 18e531879..b24f2a25d 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -50,7 +50,6 @@ void bli_gemm_int obj_t b_local; obj_t c_local; gemm_voft f; - ind_t im; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -102,17 +101,22 @@ void bli_gemm_int bli_obj_scalar_apply_scalar( beta, &c_local ); } + // Create the next node in the thrinfo_t structure. + bli_thrinfo_grow( cntx, cntl, thread ); + // Extract the function pointer from the current control tree node. f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. 
- im = bli_cntx_get_ind_method( cntx ); - - if ( im != BLIS_NAT ) { - if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; - else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; - else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; + ind_t im = bli_cntx_get_ind_method( cntx ); + + if ( im != BLIS_NAT ) + { + if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; + else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; + else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; + } } // Invoke the variant. diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index ed7e03b9c..8bede097b 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -92,13 +92,12 @@ void bli_hemm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -106,10 +105,7 @@ void bli_hemm_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index f72dedf87..7350b5785 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -110,14 +110,14 @@ void bli_her2k_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); - // Invoke herk twice, using beta only the first time. 
- thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx ); - // Invoke the internal back-end. + // Invoke herk twice, using beta only the first time. + + // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -125,13 +125,11 @@ void bli_her2k_front beta, &c_local, cntx, - cntl, - infos + cntl ); bli_l3_thread_decorator ( - n_threads, bli_gemm_int, &alpha_conj, &b_local, @@ -139,12 +137,9 @@ void bli_her2k_front &BLIS_ONE, &c_local, cntx, - cntl, - infos + cntl ); - bli_l3_thrinfo_free_paths( infos, n_threads ); - // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 3abfa9baf..7fcd2d356 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -90,13 +90,12 @@ void bli_herk_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -104,12 +103,9 @@ void bli_herk_front beta, &c_local, cntx, - cntl, - infos + cntl ); - bli_l3_thrinfo_free_paths( infos, n_threads ); - // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. 
Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-k product should always be diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index b864ce06a..cd2f3a20e 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -91,13 +91,12 @@ void bli_symm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_GEMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -105,10 +104,7 @@ void bli_symm_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 936c43635..47ce91795 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -91,14 +91,14 @@ void bli_syr2k_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx ); + // Invoke herk twice, using beta only the first time. - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); // Invoke the internal back-end. 
bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -106,13 +106,11 @@ void bli_syr2k_front beta, &c_local, cntx, - cntl, - infos + cntl ); bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &b_local, @@ -120,10 +118,7 @@ void bli_syr2k_front &BLIS_ONE, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 8b379ab0e..f037eb1c1 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -84,13 +84,12 @@ void bli_syrk_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_HERK, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -98,10 +97,7 @@ void bli_syrk_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 689acbb72..c7231c839 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -134,13 +134,12 @@ void bli_trmm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_TRMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx ); // Invoke the internal back-end. 
bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -148,10 +147,7 @@ void bli_trmm_front &BLIS_ZERO, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index e9e9261f0..cf97bbcf2 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -133,13 +133,12 @@ void bli_trmm3_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_TRMM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_gemm_int, alpha, &a_local, @@ -147,10 +146,7 @@ void bli_trmm3_front beta, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 9d726389f..7b428c8ef 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -87,7 +87,8 @@ void bli_trsm_blk_var3 bli_thrinfo_sub_node( thread ) ); - bli_thread_ibarrier( thread ); + //bli_thread_ibarrier( thread ); + bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index b4f7422ba..78bd5eeb9 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -50,14 +50,21 @@ cntl_t* bli_trsm_l_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; - // Create a node for the macro-kernel. - cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + // Create two nodes for the macro-kernel. 
+ cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create ( - BLIS_NR, // bszid not used by macro-kernel. - macro_kernel_p, + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + trsm_cntl_bu_ke + ); + // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create ( @@ -70,7 +77,7 @@ cntl_t* bli_trsm_l_cntl_create FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, - trsm_cntl_bp_ke + trsm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. @@ -122,14 +129,21 @@ cntl_t* bli_trsm_r_cntl_create { void* macro_kernel_p = bli_trsm_xx_ker_var2; - // Create a node for the macro-kernel. - cntl_t* trsm_cntl_bp_ke = bli_trsm_cntl_obj_create + // Create two nodes for the macro-kernel. + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create ( - BLIS_NR, // bszid not used by macro-kernel. - macro_kernel_p, + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + trsm_cntl_bu_ke + ); + // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create ( @@ -142,7 +156,7 @@ cntl_t* bli_trsm_r_cntl_create FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, - trsm_cntl_bp_ke + trsm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. 
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 3466d2d18..95c2d6aab 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -119,13 +119,12 @@ void bli_trsm_front // Set the operation family id in the context. bli_cntx_set_family( BLIS_TRSM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx ); // Invoke the internal back-end. bli_l3_thread_decorator ( - n_threads, bli_trsm_int, alpha, &a_local, @@ -133,10 +132,7 @@ void bli_trsm_front alpha, &c_local, cntx, - cntl, - infos + cntl ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); } diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index e6614cb3f..796af7866 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -117,6 +117,9 @@ void bli_trsm_int // FGVZ->TMS: Is this barrier still needed? bli_thread_obarrier( thread ); + // Create the next node in the thrinfo_t structure. + bli_thrinfo_grow( cntx, cntl, thread ); + // Extract the function pointer from the current control tree node. f = bli_cntl_var_func( cntl ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index 3b39befe4..2b45a5de3 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -107,9 +107,13 @@ void bli_cntl_free thrinfo_t* thread_sub_node = bli_thrinfo_sub_node( thread ); - // Recursively free all memory associated with the sub-node and its - // children. - bli_cntl_free( cntl_sub_node, thread_sub_node ); + // Only recurse if the current thrinfo_t node has a child. + if ( thread_sub_node != NULL ) + { + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free( cntl_sub_node, thread_sub_node ); + } // Free the current node's params field, if it is non-NULL. 
if ( cntl_params != NULL ) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index f2885cca3..31e995e1b 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -341,6 +341,37 @@ pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ) } #endif +dim_t bli_cntx_get_num_threads( cntx_t* cntx ) +{ + return bli_cntx_jc_way( cntx ) * + bli_cntx_pc_way( cntx ) * + bli_cntx_ic_way( cntx ) * + bli_cntx_jr_way( cntx ) * + bli_cntx_ir_way( cntx ); +} + +dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) +{ + dim_t n_threads_in = 1; + + for ( ; cntl != NULL; cntl = bli_cntl_sub_node( cntl ) ) + { + bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t cur_way; + + // We assume bszid is in {KR,MR,NR,MC,KC,NR} if it is not + // BLIS_NO_PART. + if ( bszid != BLIS_NO_PART ) + cur_way = bli_cntx_way_for_bszid( bszid, cntx ); + else + cur_way = 1; + + n_threads_in *= cur_way; + } + + return n_threads_in; +} + // ----------------------------------------------------------------------------- #if 1 @@ -663,6 +694,96 @@ void bli_cntx_set_pack_schema_c( pack_t schema_c, bli_cntx_set_schema_c( schema_c, cntx ); } +void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx ) +{ + dim_t jc, pc, ic, jr, ir; + +#ifdef BLIS_ENABLE_MULTITHREADING + jc = bli_env_read_nway( "BLIS_JC_NT" ); + //pc = bli_env_read_nway( "BLIS_KC_NT" ); + pc = 1; + ic = bli_env_read_nway( "BLIS_IC_NT" ); + jr = bli_env_read_nway( "BLIS_JR_NT" ); + ir = bli_env_read_nway( "BLIS_IR_NT" ); +#else + jc = 1; + pc = 1; + ic = 1; + jr = 1; + ir = 1; +#endif + + if ( l3_op == BLIS_TRMM ) + { + // We reconfigure the paralelism from trmm_r due to a dependency in + // the jc loop. 
(NOTE: This dependency does not exist for trmm3 ) + if ( bli_is_right( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + pc, + ic, + jr * jc, + ir, + cntx + ); + } + else // if ( bli_is_left( side ) ) + { + bli_cntx_set_thrloop + ( + jc, + pc, + ic, + jr, + ir, + cntx + ); + } + } + else if ( l3_op == BLIS_TRSM ) + { + if ( bli_is_right( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + 1, + jc * ic * jr, + 1, + 1, + cntx + ); + } + else // if ( bli_is_left( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + 1, + 1, + ic * jr * ir, + 1, + cntx + ); + } + } + else // if ( l3_op == BLIS_TRSM ) + { + bli_cntx_set_thrloop + ( + jc, + pc, + ic, + jr, + ir, + cntx + ); + } +} + + // ----------------------------------------------------------------------------- bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 21f9c0fe0..6aed68111 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -59,6 +59,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + dim_t* thrloop; + membrk_t* membrk; } cntx_t; */ @@ -127,6 +129,36 @@ typedef struct cntx_s \ ( (cntx)->membrk ) +#define bli_cntx_thrloop( cntx ) \ +\ + ( (cntx)->thrloop ) + +#if 1 +#define bli_cntx_jc_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_NC ] ) + +#define bli_cntx_pc_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_KC ] ) + +#define bli_cntx_ic_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_MC ] ) + +#define bli_cntx_jr_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_NR ] ) + +#define bli_cntx_ir_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_MR ] ) +#endif + +#define bli_cntx_way_for_bszid( bszid, cntx ) \ +\ + ( (cntx)->thrloop[ bszid ] ) + // cntx_t modification (fields only) #define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \ @@ -199,6 +231,16 @@ typedef struct cntx_s (cntx_p)->membrk = _membrk; \ } +#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \ +{ \ + (cntx_p)->thrloop[ BLIS_NC ] = jc_; \ + (cntx_p)->thrloop[ BLIS_KC ] = pc_; \ + 
(cntx_p)->thrloop[ BLIS_MC ] = ic_; \ + (cntx_p)->thrloop[ BLIS_NR ] = jr_; \ + (cntx_p)->thrloop[ BLIS_MR ] = ir_; \ + (cntx_p)->thrloop[ BLIS_KR ] = 1; \ +} + // cntx_t query (complex) #define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \ @@ -356,6 +398,8 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx ); +dim_t bli_cntx_get_num_threads( cntx_t* cntx ); +dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ); // set functions @@ -390,6 +434,9 @@ void bli_cntx_set_pack_schema_b( pack_t schema_b, cntx_t* cntx ); void bli_cntx_set_pack_schema_c( pack_t schema_c, cntx_t* cntx ); +void bli_cntx_set_thrloop_from_env( opid_t l3_op, + side_t side, + cntx_t* cntx ); // other query functions diff --git a/frame/base/bli_malloc.c b/frame/base/bli_malloc.c index 191db4834..3a36378ae 100644 --- a/frame/base/bli_malloc.c +++ b/frame/base/bli_malloc.c @@ -145,6 +145,10 @@ void bli_free_align int8_t* p_byte; void** p_addr; + // If the pointer to free is NULL, it was obviously not aligned and + // does not need to be freed. + if ( p == NULL ) return; + // Since the bli_malloc_pool() function returned the aligned pointer, // we have to first recover the original pointer before we can free // the memory. diff --git a/frame/base/bli_mem.c b/frame/base/bli_mem.c deleted file mode 100644 index 83b936aae..000000000 --- a/frame/base/bli_mem.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2016 Hewlett Packard Enterprise Development LP - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_PTHREADS -pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; -#endif - -static membrk_t global_membrk; - -// ----------------------------------------------------------------------------- - -membrk_t* bli_mem_global_membrk( void ) -{ - return &global_membrk; -} - -siz_t bli_mem_pool_size( packbuf_t buf_type ) -{ - siz_t r_val; - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // We don't (yet) track the amount of general-purpose - // memory that is currently allocated. - r_val = 0; - } - else - { - dim_t pool_index; - pool_t* pool; - - // Acquire the pointer to the pool corresponding to the buf_type - // provided. - pool_index = bli_packbuf_index( buf_type ); - pool = bli_membrk_pool( pool_index, &global_membrk ); - - // Compute the pool "size" as the product of the block size - // and the number of blocks in the pool. - r_val = bli_pool_block_size( pool ) * - bli_pool_num_blocks( pool ); - } - - return r_val; -} - -// ----------------------------------------------------------------------------- - -static bool_t bli_mem_is_init = FALSE; - -void bli_mem_init( void ) -{ - cntx_t cntx; - - // If the initialization flag is TRUE, we know the API is already - // initialized, so we can return early. - if ( bli_mem_is_init == TRUE ) return; - - // Create and initialize a context for gemm so we have something - // to pass into bli_mem_init_pools(). - bli_gemm_cntx_init( &cntx ); - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // Here, we test the initialization flag again. NOTE: THIS IS NOT - // REDUNDANT. This additional test is needed so that other threads - // that may be waiting to acquire the lock do not perform any - // initialization actions once they are finally allowed into this - // critical section. 
- if ( bli_mem_is_init == FALSE ) - { - // Initialize the global membrk_t object and its memory pools. - bli_membrk_init( &cntx, &global_membrk ); - - // After initialization, mark the API as initialized. - bli_mem_is_init = TRUE; - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - - // Finalize the temporary gemm context. - bli_gemm_cntx_finalize( &cntx ); -} - -void bli_mem_reinit( cntx_t* cntx ) -{ -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // If for some reason the memory pools have not yet been - // initialized (unlikely), we emulate the body of bli_mem_init(). - if ( bli_mem_is_init == FALSE ) - { - // Initialize the global membrk_t object and its memory pools. - bli_membrk_init( cntx, &global_membrk ); - - // After initialization, mark the API as initialized. - bli_mem_is_init = TRUE; - } - else - { - // Reinitialize the global membrk_t object's memory pools. - bli_membrk_reinit_pools( cntx, &global_membrk ); - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif -} - -void bli_mem_finalize( void ) -{ - // If the initialization flag is FALSE, we know the API is already - // uninitialized, so we can return early. - if ( bli_mem_is_init == FALSE ) return; - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // Here, we test the initialization flag again. NOTE: THIS IS NOT - // REDUNDANT. This additional test is needed so that other threads - // that may be waiting to acquire the lock do not perform any - // finalization actions once they are finally allowed into this - // critical section. 
- if ( bli_mem_is_init == TRUE ) - { - // Finalize the global membrk_t object and its memory pools. - bli_membrk_finalize( &global_membrk ); - - // After finalization, mark the API as uninitialized. - bli_mem_is_init = FALSE; - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif -} - -bool_t bli_mem_is_initialized( void ) -{ - return bli_mem_is_init; -} - diff --git a/frame/base/bli_mem.c.prev b/frame/base/bli_mem.c.prev deleted file mode 100644 index 7a16e8732..000000000 --- a/frame/base/bli_mem.c.prev +++ /dev/null @@ -1,366 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#ifdef BLIS_ENABLE_PTHREADS -extern pthread_mutex_t mem_manager_mutex; -#endif - -// Declare one memory pool structure for each block size/shape we want to -// be able to allocate. - -static pool_t pools[3]; - - -// Physically contiguous memory for each pool. -// -// Generally speaking, the pool sizes are computed in a sub-header of blis.h -// as follows: -// -// BLIS_MK_POOL_SIZE = BLIS_MAXIMUM_MC_? * BLIS_MAXIMUM_KC_? * BLIS_SIZEOF_? -// -// where "?" is the datatype that results in the largest pool size. The -// constants BLIS_KN_POOL_SIZE and BLIS_MN_POOL_SIZE are computed in a -// similar manner. All constants are computed with appropriate "padding" -// to ensure enough space given the alignments required by bli_config.h. 
-// - -static void* pool_mk_blk_ptrs[ BLIS_NUM_MC_X_KC_BLOCKS ]; -static void* pool_kn_blk_ptrs[ BLIS_NUM_KC_X_NC_BLOCKS ]; -static void* pool_mn_blk_ptrs[ BLIS_NUM_MC_X_NC_BLOCKS ]; - -#define BLIS_USE_HEAP - -#ifdef BLIS_USE_HEAP -static char* pool_mk_mem = NULL; -static char* pool_kn_mem = NULL; -static char* pool_mn_mem = NULL; -#else -static char pool_mk_mem[ BLIS_MK_POOL_SIZE ]; -static char pool_kn_mem[ BLIS_KN_POOL_SIZE ]; -static char pool_mn_mem[ BLIS_MN_POOL_SIZE ]; -#endif - - - -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ) -{ - siz_t block_size; - dim_t pool_index; - pool_t* pool; - void** block_ptrs; - void* block; - gint_t i; - - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // For general-use buffer requests, such as those used by level-2 - // operations, using bli_malloc() is sufficient, since using - // physically contiguous memory is not as important there. - block = bli_malloc( req_size ); - - // Initialize the mem_t object with: - // - the address of the memory block, - // - the buffer type (a packbuf_t value), and - // - the size of the requested region. - // NOTE: We do not initialize the pool field since this block did not - // come from a contiguous memory pool. - bli_mem_set_buffer( block, mem ); - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_size( req_size, mem ); - } - else - { - // This branch handles cases where the memory block needs to come - // from one of the contiguous memory pools. - - // Map the requested packed buffer type to a zero-based index, which - // we then use to select the corresponding memory pool. - pool_index = bli_packbuf_index( buf_type ); - pool = &pools[ pool_index ]; - - // Unconditionally perform error checking on the memory pool. - { - err_t e_val; - - // Make sure that the requested matrix size fits inside of a block - // of the corresponding pool. 
- e_val = bli_check_requested_block_size_for_pool( req_size, pool ); - bli_check_error_code( e_val ); - - // Make sure that the pool contains at least one block to check out - // to the thread. - e_val = bli_check_if_exhausted_pool( pool ); - bli_check_error_code( e_val ); - } - - // Access the block pointer array from the memory pool data structure. - block_ptrs = bli_pool_block_ptrs( pool ); - - - // BEGIN CRITICAL SECTION -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - { - - // Query the index of the contiguous memory block that resides at the - // "top" of the pool. - i = bli_pool_top_index( pool ); - - // Extract the address of the top block from the block pointer array. - block = block_ptrs[i]; - - // Clear the entry from the block pointer array. (This is actually not - // necessary.) - //block_ptrs[i] = NULL; - - // Decrement the top of the memory pool. - bli_pool_dec_top_index( pool ); - - - // END CRITICAL SECTION - } -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - - // Query the size of the blocks in the pool so we can store it in the - // mem_t object. - block_size = bli_pool_block_size( pool ); - - // Initialize the mem_t object with: - // - the address of the memory block, - // - the buffer type (a packbuf_t value), - // - the address of the memory pool to which it belongs, and - // - the size of the contiguous memory block (NOT the size of the - // requested region). - bli_mem_set_buffer( block, mem ); - bli_mem_set_buf_type( buf_type, mem ); - bli_mem_set_pool( pool, mem ); - bli_mem_set_size( block_size, mem ); - } -} - - -void bli_mem_release( mem_t* mem ) -{ - packbuf_t buf_type; - pool_t* pool; - void** block_ptrs; - void* block; - gint_t i; - - // Extract the address of the memory block we are trying to - // release. 
- block = bli_mem_buffer( mem ); - - // Extract the buffer type so we know what kind of memory was allocated. - buf_type = bli_mem_buf_type( mem ); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) - { - // For general-use buffers, we allocate with bli_malloc(), and so - // here we need to call bli_free(). - bli_free( block ); - } - else - { - // This branch handles cases where the memory block came from one - // of the contiguous memory pools. - - // Extract the pool from which the block was allocated. - pool = bli_mem_pool( mem ); - - // Extract the block pointer array associated with the pool. - block_ptrs = bli_pool_block_ptrs( pool ); - - - // BEGIN CRITICAL SECTION -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - { - - // Increment the top of the memory pool. - bli_pool_inc_top_index( pool ); - - // Query the newly incremented top index. - i = bli_pool_top_index( pool ); - - // Place the address of the block back onto the top of the memory pool. - block_ptrs[i] = block; - - - // END CRITICAL SECTION - } -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - } - - - // Clear the mem_t object so that it appears unallocated. We clear: - // - the buffer field, - // - the pool field, and - // - the size field. - // NOTE: We do not clear the buf_type field since there is no - // "uninitialized" value for packbuf_t. 
- bli_mem_set_buffer( NULL, mem ); - bli_mem_set_pool( NULL, mem ); - bli_mem_set_size( 0, mem ); -} - - -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ) -{ - bli_mem_acquire_m( req_size, - BLIS_BUFFER_FOR_GEN_USE, - mem ); -} - - - -void bli_mem_init() -{ - dim_t index_a; - dim_t index_b; - dim_t index_c; - -#ifdef BLIS_USE_HEAP - pool_mk_mem = bli_malloc( BLIS_MK_POOL_SIZE ); - pool_kn_mem = bli_malloc( BLIS_KN_POOL_SIZE ); - pool_mn_mem = bli_malloc( BLIS_MN_POOL_SIZE ); -#endif - - // Map each of the packbuf_t values to an index starting at zero. - index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); - index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); - index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); - - // Initialize contiguous memory pool for MC x KC blocks. - bli_mem_init_pool( pool_mk_mem, - BLIS_MK_BLOCK_SIZE, - BLIS_NUM_MC_X_KC_BLOCKS, - pool_mk_blk_ptrs, - &pools[ index_a ] ); - - // Initialize contiguous memory pool for KC x NC blocks. - bli_mem_init_pool( pool_kn_mem, - BLIS_KN_BLOCK_SIZE, - BLIS_NUM_KC_X_NC_BLOCKS, - pool_kn_blk_ptrs, - &pools[ index_b ] ); - - // Initialize contiguous memory pool for MC x NC blocks. - bli_mem_init_pool( pool_mn_mem, - BLIS_MN_BLOCK_SIZE, - BLIS_NUM_MC_X_NC_BLOCKS, - pool_mn_blk_ptrs, - &pools[ index_c ] ); -} - - -void bli_mem_init_pool( char* pool_mem, - siz_t block_size, - dim_t num_blocks, - void** block_ptrs, - pool_t* pool ) -{ - const siz_t align_size = BLIS_CONTIG_ADDR_ALIGN_SIZE; - dim_t i; - - // If the pool starting address is not already aligned, advance it - // accordingly. - if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) - { - // Notice that this works even if the alignment is not a power of two. 
- pool_mem += ( ( uintptr_t )align_size - - ( ( uintptr_t )pool_mem % align_size ) ); - } - - // Step through the memory pool, beginning with the aligned address - // determined above, assigning pointers to the beginning of each block_size - // bytes to the ith element of the block_ptrs array. - for ( i = 0; i < num_blocks; ++i ) - { - // Save the address of pool, which is guaranteed to be aligned. - block_ptrs[i] = pool_mem; - - // Advance pool by one block. - pool_mem += block_size; - - // Advance pool a bit further if needed in order to get to the - // beginning of an alignment boundary. - if ( bli_is_unaligned_to( ( uintptr_t )pool_mem, ( uintptr_t )align_size ) ) - { - pool_mem += ( ( uintptr_t )align_size - - ( ( uintptr_t )pool_mem % align_size ) ); - } - } - - // Now that we have initialized the array of pointers to the individual - // blocks in the pool, we initialize a pool_t data structure so that we - // can easily manage this pool. - bli_pool_init( num_blocks, - block_size, - block_ptrs, - pool ); -} - - - -void bli_mem_finalize() -{ - // Nothing to do. - -#ifdef BLIS_USE_HEAP - bli_free( pool_mk_mem ); - bli_free( pool_kn_mem ); - bli_free( pool_mn_mem ); -#endif - -} - diff --git a/frame/include/bli_auxinfo_macro_defs.h b/frame/include/bli_auxinfo_macro_defs.h deleted file mode 100644 index aee1869a0..000000000 --- a/frame/include/bli_auxinfo_macro_defs.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifndef BLIS_AUXINFO_MACRO_DEFS_H -#define BLIS_AUXINFO_MACRO_DEFS_H - - -// auxinfo_t field query - -#define bli_auxinfo_schema_a( auxinfo ) ( (auxinfo)->schema_a ) -#define bli_auxinfo_schema_b( auxinfo ) ( (auxinfo)->schema_b ) - -#define bli_auxinfo_next_a( auxinfo ) ( (auxinfo)->a_next ) -#define bli_auxinfo_next_b( auxinfo ) ( (auxinfo)->b_next ) - -#define bli_auxinfo_is_a( auxinfo ) ( (auxinfo)->is_a ) -#define bli_auxinfo_is_b( auxinfo ) ( (auxinfo)->is_b ) - - -// auxinfo_t field modification - -#define bli_auxinfo_set_schema_a( schema, auxinfo ) { (auxinfo).schema_a = schema; } -#define bli_auxinfo_set_schema_b( schema, auxinfo ) { (auxinfo).schema_b = schema; } - -#define bli_auxinfo_set_next_a( a_p, auxinfo ) { (auxinfo).a_next = a_p; } -#define bli_auxinfo_set_next_b( b_p, auxinfo ) { (auxinfo).b_next = b_p; } - -#define bli_auxinfo_set_next_ab( a_p, b_p, auxinfo ) \ -{ \ - bli_auxinfo_set_next_a( a_p, auxinfo ); \ - bli_auxinfo_set_next_b( b_p, auxinfo ); \ -} - -#define bli_auxinfo_set_is_a( is, auxinfo ) { (auxinfo).is_a = is; } -#define bli_auxinfo_set_is_b( is, auxinfo ) { (auxinfo).is_b = is; } - - -#endif - diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index f4e3e4aa0..99b2c601d 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -639,6 +639,21 @@ typedef enum #define BLIS_NUM_UKR_IMPL_TYPES 4 +#if 0 +typedef enum +{ + BLIS_JC_IDX = 0, + BLIS_PC_IDX, + BLIS_IC_IDX, + BLIS_JR_IDX, + BLIS_IR_IDX, + BLIS_PR_IDX, +} thridx_t; +#endif + +#define BLIS_NUM_LOOPS 6 + + // -- Operation ID type -- typedef enum @@ -950,6 +965,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + dim_t thrloop[ BLIS_NUM_LOOPS ]; + membrk_t* membrk; } cntx_t; diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 6b4d2de1a..593f8d7fa 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -41,6 +41,12 @@ #include "bli_thrcomm_openmp.h" #include
"bli_thrcomm_pthreads.h" + +// thrcomm_t query (field only) + +#define bli_thrcomm_num_threads( comm ) ( (comm)->n_threads ) + + // Thread communicator prototypes. thrcomm_t* bli_thrcomm_create( dim_t n_threads ); void bli_thrcomm_free( thrcomm_t* communicator ); diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 7c1fe69f9..68d9d7a29 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -201,7 +201,6 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) void bli_l3_thread_decorator ( - dim_t n_threads, l3int_t func, obj_t* alpha, obj_t* a, @@ -209,20 +208,28 @@ void bli_l3_thread_decorator obj_t* beta, obj_t* c, cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + cntl_t* cntl ) { + // Query the total number of threads from the context. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + _Pragma( "omp parallel num_threads(n_threads)" ) { - dim_t omp_id = omp_get_thread_num(); - thrinfo_t* thread_i = thread[omp_id]; + dim_t id = omp_get_thread_num(); cntl_t* cntl_use; + thrinfo_t* thread; // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); + func ( alpha, @@ -232,12 +239,19 @@ void bli_l3_thread_decorator c, cntx, cntl_use, - thread[omp_id] + thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure.
+ bli_l3_thrinfo_free( thread ); } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). } #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 0f2707d91..230b63905 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -136,7 +136,8 @@ typedef struct thread_data obj_t* c; cntx_t* cntx; cntl_t* cntl; - thrinfo_t* thread; + dim_t id; + thrcomm_t* gl_comm; } thread_data_t; // Entry point for additional threads @@ -151,13 +152,18 @@ void* bli_l3_thread_entry( void* data_void ) obj_t* c = data->c; cntx_t* cntx = data->cntx; cntl_t* cntl = data->cntl; - thrinfo_t* thread_i = data->thread; + dim_t id = data->id; + thrcomm_t* gl_comm = data->gl_comm; cntl_t* cntl_use; + thrinfo_t* thread; // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); + data->func ( alpha, @@ -171,14 +177,16 @@ void* bli_l3_thread_entry( void* data_void ) ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure. + bli_l3_thrinfo_free( thread ); return NULL; } void bli_l3_thread_decorator ( - dim_t n_threads, l3int_t func, obj_t* alpha, obj_t* a, @@ -186,50 +194,51 @@ void bli_l3_thread_decorator obj_t* beta, obj_t* c, cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + cntl_t* cntl ) { - pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + // Query the total number of threads from the context. 
+ dim_t n_threads = bli_cntx_get_num_threads( cntx ); - for ( int i = 1; i < n_threads; i++ ) + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions. + pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t id = n_threads - 1; 0 <= id; id-- ) { // Set up thread data for additional threads (beyond thread 0). - datas[i].func = func; - datas[i].alpha = alpha; - datas[i].a = a; - datas[i].b = b; - datas[i].beta = beta; - datas[i].c = c; - datas[i].cntx = cntx; - datas[i].cntl = cntl; - datas[i].thread = thread[i]; + datas[id].func = func; + datas[id].alpha = alpha; + datas[id].a = a; + datas[id].b = b; + datas[id].beta = beta; + datas[id].c = c; + datas[id].cntx = cntx; + datas[id].cntl = cntl; + datas[id].id = id; + datas[id].gl_comm = gl_comm; - // Spawn additional threads. - pthread_create( &pthreads[i], NULL, &bli_l3_thread_entry, &datas[i] ); - } - - - // The main thread executes this. - { - cntl_t* cntl_use; - - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); - - // Thread 0 simply executes func. - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); - - // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread[0] ); + // Spawn additional threads for ids greater than 0.
+ if ( id != 0 ) + pthread_create( &pthreads[id], NULL, &bli_l3_thread_entry, &datas[id] ); + else + bli_l3_thread_entry( ( void* )(&datas[0]) ); } + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). // Thread 0 waits for additional threads to finish. - for ( int i = 1; i < n_threads; i++) + for ( dim_t id = 1; id < n_threads; id++ ) { - pthread_join( pthreads[i], NULL ); + pthread_join( pthreads[id], NULL ); } bli_free_intl( pthreads ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index 99de67220..c038f59a0 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -73,7 +73,6 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) void bli_l3_thread_decorator ( - dim_t n_threads, l3int_t func, obj_t* alpha, obj_t* a, @@ -81,17 +80,25 @@ void bli_l3_thread_decorator obj_t* beta, obj_t* c, cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + cntl_t* cntl ) { - thrinfo_t* thread_i = thread[0]; + // For sequential execution, we use only one thread. + dim_t n_threads = 1; + dim_t id = 0; + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); cntl_t* cntl_use; + thrinfo_t* thread; // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + // Create the root node of the thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); + func ( alpha, @@ -101,11 +108,18 @@ void bli_l3_thread_decorator c, cntx, cntl_use, - thread[0] + thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread_i ); + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure.
+ bli_l3_thrinfo_free( thread ); + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). } diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 43f0eaf8b..d42744162 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -78,8 +78,8 @@ void bli_thread_get_range_sub dim_t* end ) { - dim_t n_way = thread->n_way; - dim_t work_id = thread->work_id; + dim_t n_way = bli_thread_n_way( thread ); + dim_t work_id = bli_thread_work_id( thread ); dim_t all_start = 0; dim_t all_end = n; @@ -511,8 +511,8 @@ siz_t bli_thread_get_range_weighted_sub dim_t* j_end_thr ) { - dim_t n_way = thread->n_way; - dim_t my_id = thread->work_id; + dim_t n_way = bli_thread_n_way( thread ); + dim_t my_id = bli_thread_work_id( thread ); dim_t bf_left = n % bf; diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 10097c39e..5b9443587 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -173,16 +173,14 @@ typedef void (*l3int_t) // Level-3 thread decorator prototype void bli_l3_thread_decorator ( - dim_t n_threads, - l3int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t** thread + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl ); // Miscellaneous prototypes diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 4cf55b3d4..bad5c2772 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -38,11 +38,9 @@ thrinfo_t* bli_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t free_comm, thrinfo_t* sub_node ) { @@ -52,9 +50,8 @@ thrinfo_t* bli_thrinfo_create ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, - free_comms, + free_comm, sub_node 
); @@ -66,23 +63,19 @@ void bli_thrinfo_init thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t free_comm, thrinfo_t* sub_node ) { - thread->ocomm = ocomm; - thread->ocomm_id = ocomm_id; - thread->icomm = icomm; - thread->icomm_id = icomm_id; - thread->n_way = n_way; - thread->work_id = work_id; - thread->free_comms = free_comms; + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->free_comm = free_comm; - thread->sub_node = sub_node; + thread->sub_node = sub_node; } void bli_thrinfo_init_single @@ -94,7 +87,6 @@ void bli_thrinfo_init_single ( thread, &BLIS_SINGLE_COMM, 0, - &BLIS_SINGLE_COMM, 0, 1, 0, FALSE, @@ -102,3 +94,178 @@ void bli_thrinfo_init_single ); } +// ----------------------------------------------------------------------------- + +#include "assert.h" + +#define BLIS_NUM_STATIC_COMMS 18 + +thrinfo_t* bli_thrinfo_create_for_cntl + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_chl, + thrinfo_t* thread_par + ) +{ + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; + thrcomm_t** new_comms = NULL; + + thrinfo_t* thread_chl; + + bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); + + dim_t parent_nt_in = bli_thread_num_threads( thread_par ); + dim_t parent_n_way = bli_thread_n_way( thread_par ); + dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); + dim_t parent_work_id = bli_thread_work_id( thread_par ); + + dim_t child_nt_in; + dim_t child_comm_id; + dim_t child_n_way; + dim_t child_work_id; + + // Sanity check: make sure the number of threads in the parent's + // communicator is divisible by the number of new sub-groups. 
+ assert( parent_nt_in % parent_n_way == 0 ); + + // Compute: + // - the number of threads inside the new child comm, + // - the current thread's id within the new communicator, + // - the current thread's work id, given the ways of parallelism + // to be obtained within the next loop. + child_nt_in = bli_cntx_get_num_threads_in( cntx, cntl_chl ); + child_n_way = bli_cntx_way_for_bszid( bszid_chl, cntx ); + child_comm_id = parent_comm_id % child_nt_in; + child_work_id = child_comm_id / ( child_nt_in / child_n_way ); + + // The parent's chief thread creates a temporary array of thrcomm_t + // pointers. + if ( bli_thread_am_ochief( thread_par ) ) + { + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ) ); + else + new_comms = static_comms; + } + + // Broadcast the temporary array to all threads in the parent's + // communicator. + new_comms = bli_thread_obroadcast( thread_par, new_comms ); + + // Chiefs in the child communicator allocate the communicator + // object and store it in the array element corresponding to the + // parent's work id. + if ( child_comm_id == 0 ) + new_comms[ parent_work_id ] = bli_thrcomm_create( child_nt_in ); + + bli_thread_obarrier( thread_par ); + + // All threads create a new thrinfo_t node using the communicator + // that was created by their chief, as identified by parent_work_id. + thread_chl = bli_thrinfo_create + ( + new_comms[ parent_work_id ], + child_comm_id, + child_n_way, + child_work_id, + TRUE, + NULL + ); + + bli_thread_obarrier( thread_par ); + + // The parent's chief thread frees the temporary array of thrcomm_t + // pointers. 
+ if ( bli_thread_am_ochief( thread_par ) ) + { + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + bli_free_intl( new_comms ); + } + + return thread_chl; +} + +void bli_thrinfo_grow + ( + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // If the sub-node of the thrinfo_t object is non-NULL, we don't + // need to create it, and will just use the existing sub-node as-is. + if ( bli_thrinfo_sub_node( thread ) != NULL ) return; + + // Create a new node (or, if needed, multiple nodes) and return the + // pointer to the (eldest) child. + thrinfo_t* thread_child = bli_thrinfo_rgrow + ( + cntx, + cntl, + bli_cntl_sub_node( cntl ), + thread + ); + + // Attach the child thrinfo_t node to its parent structure. + bli_thrinfo_set_sub_node( thread_child, thread ); +} + +thrinfo_t* bli_thrinfo_rgrow + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_cur, + thrinfo_t* thread_par + ) +{ + thrinfo_t* thread_cur; + + // We must handle two cases: those where the next node in the + // control tree is a partitioning node, and those where it is + // a non-partitioning (ie: packing) node. + if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) + { + // Create the child thrinfo_t node corresponding to cntl_cur, + // with cntl_par being the parent. + thread_cur = bli_thrinfo_create_for_cntl + ( + cntx, + cntl_par, + cntl_cur, + thread_par + ); + } + else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) + { + // Recursively grow the thread structure and return the top-most + // thrinfo_t node of that segment. + thrinfo_t* thread_seg = bli_thrinfo_rgrow + ( + cntx, + cntl_par, + bli_cntl_sub_node( cntl_cur ), + thread_par + ); + + // Create a thrinfo_t node corresponding to cntl_cur. Notice that + // the free_comm field is set to FALSE, since cntl_cur is a + // non-partitioning node. The communicator used here will be + // freed when thread_seg, or one of its descendents, is freed. 
+ thread_cur = bli_thrinfo_create + ( + bli_thrinfo_ocomm( thread_seg ), + bli_thread_ocomm_id( thread_seg ), + bli_cntx_get_num_threads_in( cntx, cntl_cur ), + bli_thread_ocomm_id( thread_seg ), + FALSE, + thread_seg + ); + + // Attach the child thrinfo_t node to its parent structure. + bli_thrinfo_set_sub_node( thread_cur, thread_par ); + } + + return thread_cur; +} + diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 9c0b28575..93bf19e50 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -45,13 +45,6 @@ struct thrinfo_s // Our thread id within the ocomm thread communicator. dim_t ocomm_id; - // The thread communicator for the other threads sharing the same work - // at this level. - thrcomm_t* icomm; - - // Our thread id within the icomm thread communicator. - dim_t icomm_id; - // The number of distinct threads used to parallelize the loop. dim_t n_way; @@ -62,7 +55,7 @@ struct thrinfo_s // this is field is true, but when nodes are created that share the same // communicators as other nodes (such as with packm nodes), this is set // to false. - bool_t free_comms; + bool_t free_comm; struct thrinfo_s* sub_node; }; @@ -71,30 +64,40 @@ typedef struct thrinfo_s thrinfo_t; // // thrinfo_t macros // NOTE: The naming of these should be made consistent at some point. +// (ie: bli_thrinfo_ vs. 
bli_thread_) // -#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads ) +// thrinfo_t query (field only) -#define bli_thread_n_way( t ) ( (t)->n_way ) -#define bli_thread_work_id( t ) ( (t)->work_id ) +#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads ) -#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 ) -#define bli_thread_am_ichief( t ) ( (t)->icomm_id == 0 ) +#define bli_thread_n_way( t ) ( (t)->n_way ) +#define bli_thread_work_id( t ) ( (t)->work_id ) +#define bli_thread_ocomm_id( t ) ( (t)->ocomm_id ) + +#define bli_thrinfo_ocomm( t ) ( (t)->ocomm ) +#define bli_thrinfo_needs_free_comm( t ) ( (t)->free_comm ) + +#define bli_thrinfo_sub_node( t ) ( (t)->sub_node ) + +// thrinfo_t query (complex) + +#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 ) + +// thrinfo_t modification + +#define bli_thrinfo_set_sub_node( _sub_node, thread ) \ +{ \ + (thread)->sub_node = _sub_node; \ +} + +// other thrinfo_t-related macros #define bli_thread_obroadcast( t, p ) bli_thrcomm_bcast( (t)->ocomm, \ (t)->ocomm_id, p ) -#define bli_thread_ibroadcast( t, p ) bli_thrcomm_bcast( (t)->icomm, \ - (t)->icomm_id, p ) #define bli_thread_obarrier( t ) bli_thrcomm_barrier( (t)->ocomm, \ (t)->ocomm_id ) -#define bli_thread_ibarrier( t ) bli_thrcomm_barrier( (t)->icomm, \ - (t)->icomm_id ) -#define bli_thrinfo_ocomm( t ) ( (t)->ocomm ) -#define bli_thrinfo_icomm( t ) ( (t)->icomm ) -#define bli_thrinfo_needs_free_comms( t ) ( (t)->free_comms ) - -#define bli_thrinfo_sub_node( t ) ( (t)->sub_node ) // // Prototypes for level-3 thrinfo functions not specific to any operation. 
@@ -104,11 +107,9 @@ thrinfo_t* bli_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t free_comm, thrinfo_t* sub_node ); @@ -117,11 +118,9 @@ void bli_thrinfo_init thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - bool_t free_comms, + bool_t free_comm, thrinfo_t* sub_node ); @@ -130,9 +129,29 @@ void bli_thrinfo_init_single thrinfo_t* thread ); -void bli_thrinfo_free +// ----------------------------------------------------------------------------- + +thrinfo_t* bli_thrinfo_create_for_cntl ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_chl, + thrinfo_t* thread_par + ); + +void bli_thrinfo_grow + ( + cntx_t* cntx, + cntl_t* cntl, thrinfo_t* thread ); +thrinfo_t* bli_thrinfo_rgrow + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_cur, + thrinfo_t* thread_par + ); + #endif diff --git a/version b/version index 2bfe0beaa..0c62199f1 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.0-37 +0.2.1