diff --git a/CHANGELOG b/CHANGELOG index 539067456..a361ceac3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,1054 @@ -commit 898614a555ea0aa7de4ca07bb3cb8f5708b6a002 (HEAD -> master, tag: 0.2.0) +commit 866b2dde3f41760121115fb25f096d4344e8b4f9 (HEAD -> master, tag: 0.2.1) +Author: Field G. Van Zee +Date: Wed Oct 5 14:41:34 2016 -0500 + + Version file update (0.2.1) + +commit 87fddeab3c8a5ccb1bbf02e5f89db1464e459ba9 (origin/master) +Merge: 8696987 6f71cd3 +Author: Field G. Van Zee +Date: Wed Oct 5 13:35:01 2016 -0500 + + Merge branch 'compose' + +commit 6f71cd344951854e4cff9ea21bbdfe536e72611d (origin/compose) +Merge: c0630c4 8d55033 +Author: Field G. Van Zee +Date: Tue Oct 4 15:53:46 2016 -0500 + + Merge pull request #94 from flame/distcomm + + Implemented distributed thrinfo_t management. + +commit 86969873b5b861966d717d8f9f370af39e3d9de6 +Author: Field G. Van Zee +Date: Tue Oct 4 14:24:59 2016 -0500 + + Reclassified amaxv operation as a level-1v kernel. + + Details: + - Moved amaxv from being a utility operation to being a level-1v operation. + This includes the establishment of a new amaxv kernel to live beside all + of the other level-1v kernels. + - Added two new functions to bli_part.c: + bli_acquire_mij() + bli_acquire_vi() + The first acquires a scalar object for the (i,j) element of a matrix, + and the second acquires a scalar object for the ith element of a vector. + - Added integer support to bli_getsc level-0 operation. This involved + adding integer support to the bli_*gets level-0 scalar macros. + - Added a new test module to test amaxv as a level-1v operation. The test + module works by comparing the value identified by bli_amaxv() to the + value found from a reference-like code local to the test module + source file. In other words, it (intentionally) does not guarantee the + same index is found; only the same value.
This allows for different + implementations in the case where a vector contains two or more elements + containing exactly the same floating point value (or values, in the case + of the complex domain). + - Removed the directory frame/include/old/. + +commit 8d55033c966feed99fcca2a58017c3ab5b1646dc (origin/distcomm) +Author: Field G. Van Zee +Date: Tue Sep 27 15:20:58 2016 -0500 + + Implemented distributed thrinfo_t management. + + Details: + - Implemented Ricardo Magana's distributed thread info/communicator + management. Rather than fully construct the thrinfo_t structures, from + root to leaf, prior to spawning threads, the threads individually + construct their thrinfo_t trees (or, chains), and do so incrementally, + as needed, reusing the same structure nodes during subsequent blocked + variant iterations. This required moving the initial creation of the + thrinfo_t structure (now, the root nodes) from the _front() functions + to the bli_l3_thread_decorator(). The incremental "growing" of the tree + is performed in the internal back-end (ie: _int()) function, and so + mostly invisible. Also, the incremental growth of the thrinfo_t tree is + done as a function of the current and parent control tree nodes (as well + as the parent thrinfo_t node), further reinforcing the parallel + relationship between the two data structures. + - Removed the "inner" communicator from thrinfo_t structure definition, + as well as its id. Changed all APIs accordingly. Renamed + bli_thrinfo_needs_free_comms() to bli_thrinfo_needs_free_comm(). + - Defined bli_l3_thrinfo_print_paths(), which prints the information + in an array of thrinfo_t* structure pointers. (Used only as a + debugging/verification tool.) + - Deprecated the following thrinfo_t creation functions: + bli_packm_thrinfo_create() + bli_l3_thrinfo_create() + because they are no longer used. bli_thrinfo_create() is now called + directly when creating thrinfo_t nodes.
+ +commit fd04869ae4d4a3b0ebb9052557c296456bce7c0d +Author: Field G. Van Zee +Date: Tue Sep 27 14:14:11 2016 -0500 + + Changed configure's 'omp' threading to 'openmp'. + + Details: + - Changed the configure script so that the expected string argument to the + -t (or --enable-threading=) option that enables OpenMP multithreading is + 'openmp'. The previous expected string, 'omp', is still supported but + should be considered deprecated. + +commit 9424af87209e4e435e2e742430945152690170b0 +Merge: efa7341 c0630c4 +Author: Field G. Van Zee +Date: Tue Sep 27 12:51:08 2016 -0500 + + Merge branch 'compose' + +commit efa7341df0b0115926aa8a6e8a4ebfb24fdbf11e +Merge: 121c39d e1453f6 +Author: Field G. Van Zee +Date: Fri Sep 16 11:01:57 2016 -0500 + + Merge pull request #92 from ShadenSmith/readme_fix + + Fixes broken URL in README.md + +commit e1453f68f6afd90ae9a29b7a5faa46aa79bbf741 +Author: Shaden Smith +Date: Fri Sep 16 09:29:28 2016 -0500 + + Fixes broken URL in README.md + +commit c0630c4024b08750043a2942a3e8a037aa6b6259 (compose) +Author: Field G. Van Zee +Date: Mon Sep 12 13:59:02 2016 -0500 + + Added debugging printf()'s to bli_l3_thrinfo.c. + + Details: + - Added optional printf() statements to print out thread communicator + info as the thrinfo_t structure is built in bli_l3_thrinfo.c. + - Minor changes to frame/thread/bli_thrinfo.h. + +commit 7b3bf1ffcd7160ccbf6c2518af6d88f6742e4977 +Merge: 3550981 121c39d +Author: Field G. Van Zee +Date: Tue Sep 6 15:47:13 2016 -0500 + + Merge branch 'master' into compose + +commit 121c39d455f2db6f7ce6802ba7f73ad5e088c68c +Author: Field G. Van Zee +Date: Mon Sep 5 13:11:42 2016 -0500 + + Added complex gemm micro-kernels for haswell. + + Details: + - Defined cgemm (3x8) and zgemm (3x4) micro-kernels for haswell-based + architectures. 
As with their real domain brethren, these kernels prefer + row storage, (though this doesn't affect most users due to high-level + optimizations in most level-3 operations that induce a transpose to + whatever storage preference the kernel may have). + +commit 35509818cbea1598b123421f81c42120889a03c3 +Author: Field G. Van Zee +Date: Wed Aug 31 17:34:15 2016 -0500 + + Added, moved some thread barriers. + + Details: + - Removed thread barriers from the end of the loop bodies of + bli_gemm_blk_var1(), bli_gemm_blk_var2(), bli_trsm_blk_var1(), + and bli_trsm_blk_var2(). + - Moved the thread barrier at the end of bli_packm_int() to the + end of bli_l3_packm(), and added missing barriers to that function. + - Removed the no longer necessary (and now incorrect) ochief guard + in bli_gemm3m3_packa() on the bli_obj_scalar_reset() on C. + - Thanks to Tyler Smith for help with these changes. + +commit abd61f9fa75d77a96d1491b3e035451ee73238fe +Author: Field G. Van Zee +Date: Tue Aug 30 12:34:19 2016 -0500 + + Updated BLIS4 TOMS citation in README.md. + +commit 701b9aa3ff028decbf90efac0dca5bd64fe26269 +Author: Field G. Van Zee +Date: Fri Aug 26 19:04:45 2016 -0500 + + Redesigned control tree infrastructure. + + Details: + - Altered control tree node struct definitions so that all nodes have the + same struct definition, whose primary fields consist of a blocksize id, + a variant function pointer, a pointer to an optional parameter struct, + and a pointer to a (single) sub-node. This unified control tree type is + now named cntl_t. + - Changed the way control tree nodes are connected, and what computation + they represent, such that, for example, packing operations are now + associated with nodes that are "inline" in the tree, rather than off- + shoot branches.
The original tree for the classic Goto gemm algorithm was + expressed (roughly) as: + + blk_var2 -> blk_var3 -> blk_var1 -> ker_var2 + | | + -> packb -> packa + + and now, the same tree would look like: + + blk_var2 -> blk_var3 -> packb -> blk_var1 -> packa -> ker_var2 + + Specifically, the packb and packa nodes perform their respective packing + operations and then recurse (without any loop) to a subproblem. This means + there are now two kinds of level-3 control tree nodes: partitioning and + non-partitioning. The blocked variants are members of the former, because + they iteratively partition off submatrices and perform suboperations on + those partitions, while the packing variants belong to the latter group. + (This change has the effect of allowing greatly simplified initialization + of the nodes, which previously involved setting many unused node fields to + NULL.) + - Changed the way thrinfo_t tree nodes are arranged to mirror the new + connective structure of control trees. That is, packm nodes are no longer + off-shoot branches of the main algorithmic nodes, but rather connected + "inline". + - Simplified control tree creation functions. Partitioning nodes are created + concisely with just a few fields needing initialization. By contrast, the + packing nodes require additional parameters, which are stored in a + packm-specific struct that is tracked via the optional parameters pointer + within the control tree struct. (This parameter struct must always begin + with a uint64_t that contains the byte size of the struct. This allows + us to use a generic function to recursively copy control trees.) gemm, + herk, and trmm control tree creation continues to be consolidated into + a single function, with the operation family being used to select + among the parameter-agnostic macro-kernel wrappers. 
A single routine, + bli_cntl_free(), is provided to free control trees recursively, whereby + the chief thread within a group releases the blocks associated with + mem_t entries back to the memory broker from which they were acquired. + - Updated internal back-ends, e.g. bli_gemm_int(), to query and call the + function pointer stored in the current control tree node (rather than + index into a local function pointer array). Before being invoked, these + function pointers are first cast to a gemm_voft (for gemm, herk, or trmm + families) or trsm_voft (for trsm family) type, which is defined in + frame/3/bli_l3_var_oft.h. + - Retired herk and trmm internal back-ends, since all execution now flows + through gemm or trsm blocked variants. + - Merged forwards- and backwards-moving variants by querying the direction + from routines as a function of the variant's matrix operands. gemm and + herk always move forward, while trmm and trsm move in a direction that + is dependent on which operand (a or b) is triangular. + - Added functions bli_thread_get_range_mdim(), bli_thread_get_range_ndim(), + each of which takes additional arguments and hides complexity in managing + the difference between the way ranges are computed for the four families + of operations. + - Simplified level-3 blocked variants according to the above changes, so that + the only steps taken are: + 1. Query partitioning direction (forwards or backwards). + 2. Prune unreferenced regions, if they exist. + 3. Determine the thread partitioning sub-ranges. + + 4. Determine the partitioning blocksize (passing in the partitioning + direction) + 5. Acquire the current iteration's partitions for the matrices affected + by the current variant's partitioning dimension (m, k, n). + 6. Call the subproblem. + + - Instantiate control trees once per thread, per operation invocation.
+ (This is a change from the previous regime in which control trees were + treated as stateless objects, initialized with the library, and shared + as read-only objects between threads.) This once-per-thread allocation + is done primarily to allow threads to use the control tree as a place + to cache certain data for use in subsequent loop iterations. Presently, + the only application of this caching is a mem_t entry for the packing + blocks checked out from the memory broker (allocator). If a non-NULL + control tree is passed in by the (expert) user, then the tree is copied + by each thread. This is done in bli_l3_thread_decorator(), in + bli_thrcomm_*.c. + - Added a new field to the context, an opid_t which tracks the "family" + of the operation being executed. For example, gemm, hemm, and symm are + all part of the gemm family, while herk, syrk, her2k, and syr2k are + all part of the herk family. Knowing the operation's family is necessary + when conditionally executing the internal (beta) scalar reset on + C in blocked variant 3, which is needed for gemm and herk families, + but must not be performed for the trmm family (because beta has only + been applied to the current row-panel of C after the first rank-kc + iteration). + - Reexpressed 3m3 induced method blocked variant in frame/3/gemm/ind + to conform with the new control tree design, and renamed the macro- + kernel codes corresponding to 3m2 and 4m1b. + - Renamed bli_mem.c (and its APIs) to bli_memsys.c, and renamed/relocated + bli_mem_macro_defs.h from frame/include to frame/base/bli_mem.h. + - Renamed/relocated bli_auxinfo_macro_defs.h from frame/include to + frame/base/bli_auxinfo.h. + - Fixed a minor bug whereby the storage-to-ukr-preference matching + optimization in the various level-3 front-ends was not being applied + properly when the context indicated that execution would be via an + induced method.
(Before, we always checked the native micro-kernel + corresponding to the datatype being executed, whereas now we check + the native micro-kernel corresponding to the datatype's real projection, + since that is the micro-kernel that is actually used by induced methods.) + - Added an option to the testsuite to skip the testing of native level-3 + complex implementations. Previously, it was always tested, provided that + the c/z datatypes were enabled. However, some configurations use + reference micro-kernels for complex datatypes, and testing these + implementations can slow down the testsuite considerably. + +commit 73517f522b69de429dd7f3df60a70c068149ab28 +Merge: c6f5c21 50293da +Author: Field G. Van Zee +Date: Tue Aug 23 13:46:59 2016 -0500 + + Merge branch 'master' into compose + +commit 50293da38d5f2b7be9bbc94b9e85aacb6a10f672 +Author: Field G. Van Zee +Date: Tue Aug 23 13:38:36 2016 -0500 + + Avoid compiling BLAS/CBLAS files when disabled. + + Details: + - Updated the top-level Makefile, build/config.mk.in template, and + configure script so that object files corresponding to source files + belonging to the BLAS compatibility layer are not compiled (or archived) + when the compatibility layer is disabled. (Same for CBLAS.) Thanks + to Devin Matthews for suggesting this optimization. + - Slight change to the way configure handles internal variables. Instead + of converting (overwriting) some, such as enable_blas2blis and + enable_cblas, from a "yes" or "no" to a "1" or "0" value, the latter are + now stored in new variables that live alongside the originals (with the + suffix "_01"). This is convenient since some values need to be + sed-substituted into the config.mk.in template, which requires "yes" or + "no", while some need to be written to the bli_config.h.in template, + which requires "0" or "1". + +commit c6f5c215ee793d03ea834469fc2adc53feaffc42 +Merge: d52cb76 16a4c7a +Author: Field G.
Van Zee +Date: Mon Aug 22 17:33:02 2016 -0500 + + Merge branch 'master' into compose + +commit 16a4c7a823d60707ed9272f5d36e5c5d54c0ba4b +Author: Field G. Van Zee +Date: Fri Aug 19 11:38:36 2016 -0500 + + Fixed bugs in bli_mutex_init() and friends. + + Details: + - Fixed a couple of bugs that affected OpenMP and POSIX threads + configurations that resulted in compiler errors and warnings due + to type mismatch, and in the case of pthreads, a missing function + argument. The bugs are fairly recent, introduced in a017062. + +commit d52cb7671509592a8078729477b40b60380518a2 +Merge: 95abea4 c31b1e7 +Author: Field G. Van Zee +Date: Wed Jul 27 16:04:55 2016 -0500 + + Merge branch 'master' into compose + +commit c31b1e7b9d659b96433a87e5aecb90e457a104cc +Author: Field G. Van Zee +Date: Wed Jul 27 15:58:07 2016 -0500 + + Relax alignment restrictions for sandybridge ukrs. + + Details: + - Relaxed the base pointer and leading dimension alignment restrictions + in the sandybridge gemm microkernels, allowing the use of vmovups/vmovupd + instead of vmovaps/vmovapd. These changes mimic those made to the haswell + microkernels in e0d2fa0 and ee2c139. + - Updated testsuite modules as well as standalone test drivers in 'test' + directory to use DBL_MAX as the initial time candidate. Thanks to Devin + Matthews for suggesting this change. + - Inserted #include "float.h" into bli_system.h (to gain access to DBL_MAX). + - Minor update (vis-a-vis contexts) to driver code in test/3m4m. + +commit 95abea46f86816fddfc9ff0abfa52880801461be +Merge: d0dfe5b a017062 +Author: Field G. Van Zee +Date: Sat Jul 23 15:38:33 2016 -0500 + + Merge branch 'master' into compose + +commit a017062fdf763037da9d971a028bb07d47aa1c8a +Author: Field G. Van Zee +Date: Fri Jul 22 17:02:59 2016 -0500 + + Integrated "memory broker" (membrk_t) abstraction. + + Details: + - Integrated a patch originally authored and submitted by Ricardo Magana + of HP Enterprise.
The changeset inserts use of a new object type, membrk_t, + (memory broker) that allows multiple sets of memory pools on, for example, + separate NUMA nodes, each of which has a separate memory space. + - Added membrk field to cntx_t and defined corresponding accessor macros. + - Added membrk field to mem_t object and defined corresponding accessor macros. + - Created new bli_membrk.c file, which contains the new memory broker API, + including: + bli_membrk_init(), bli_membrk_finalize() + bli_membrk_acquire_[mv](), bli_membrk_release(), + bli_membrk_init_pools(), bli_membrk_reinit_pools(), + bli_membrk_finalize_pools(), + bli_membrk_pool_size() + - In bli_mem.c, changed function calls to + bli_mem_init_pools() -> bli_membrk_init() + bli_mem_reinit_pools() -> bli_membrk_reinit() + bli_mem_finalize_pools() -> bli_membrk_finalize() + - In bli_packv_init.c, bli_packm_init.c, changed function calls to: + bli_mem_acquire_[mv]() -> bli_membrk_acquire_[mv]() + bli_mem_release() -> bli_membrk_release() + - Added bli_mutex.c and related files to frame/thread. These files define + abstract mutexes (locks) and corresponding APIs for pthreads, openmp, or + single-threaded execution. This new API is employed within functions + such as bli_membrk_acquire_[mv]() and bli_membrk_release(). + +commit ce59f81108ec9aea918a7e77030da8acfdd397ce +Merge: ff41153 707a2b7 +Author: Field G. Van Zee +Date: Fri Jul 22 14:48:14 2016 -0500 + + Merge pull request #88 from devinamatthews/32bit-dim_t + + Handle 32-bit dim_t in 64-bit microkernels. + +commit 707a2b7faca137cca7cab7b11a12c44ddaf7ad53 +Author: Devin Matthews +Date: Fri Jul 22 13:49:44 2016 -0500 + + Somehow forgot the most important microkernel. 
+ +commit 47ec045056351ac4f0791c071fa0daaa81699c8c +Merge: 08f1d6b ff41153 +Author: Devin Matthews +Date: Fri Jul 22 13:45:23 2016 -0500 + + Merge remote-tracking branch 'upstream/master' into 32bit-dim_t + +commit 08f1d6b6fa344275de0f675f69737145ccf6646a +Author: Devin Matthews +Date: Fri Jul 22 13:44:37 2016 -0500 + + Use 64-bit intermediate variable for k for architectures that do 64-bit loads in case dim_t is 32-bit. + +commit ff41153f4eb7f38ed94bdd9a3fd81fb979f3f401 +Merge: f9214ce e0d2fa0 +Author: Field G. Van Zee +Date: Fri Jul 22 13:21:03 2016 -0500 + + Merge pull request #86 from devinamatthews/haswell-vmovups + + Remove alignment restrictions on C in haswell kernel. + +commit e0d2fa0d835ab49366aeb790363bb2b571d36ed8 +Author: Devin Matthews +Date: Fri Jul 22 12:56:51 2016 -0500 + + Relax alignment restrictions for haswell sgemm. + +commit f9214ced97392861f5a0ea72abfcf6f41faf674c +Merge: 413d62a 08666ea +Author: Field G. Van Zee +Date: Fri Jul 22 12:16:39 2016 -0500 + + Merge pull request #85 from devinamatthews/qopenmp + + Change -openmp to -fopenmp for icc. + +commit ee2c139df6ad53c6aec8a67ab23b3b1912e8d259 +Author: Devin Matthews +Date: Fri Jul 22 12:06:03 2016 -0500 + + Remove alignment restrictions on C in haswell kernel. + +commit 08666eaa20d8a31f2f92f944e5bfa7c1558c53e4 +Author: Devin Matthews +Date: Fri Jul 22 11:07:34 2016 -0500 + + Change -openmp to -fopenmp for icc. + +commit d0dfe5b5372cc7558ee9c4104b29f82eecc7ed61 +Merge: 31def12 413d62a +Author: Field G. Van Zee +Date: Thu Jul 14 11:01:06 2016 -0500 + + Merge branch 'master' into compose + +commit 413d62aca28edabba56605a9f87d5b715831e1db +Author: Field G. Van Zee +Date: Tue Jul 12 15:02:52 2016 -0500 + + README update (use official ACM TOMS links). + +commit dfa431f696db2df4065ea454df268a2e0bc02eac +Author: Field G. Van Zee +Date: Tue Jul 12 14:21:19 2016 -0500 + + README update (BLIS2 TOMS article now in-print). + +commit 31def12e2629f187e40f93f6bae9e26a6c2660e2 +Author: Field G. 
Van Zee +Date: Thu Jun 30 15:19:20 2016 -0500 + + First phase of control tree redesign. + + Details: + - These changes constitute the first set of changes in preparation to + revamping the structure and use of control trees in BLIS. Modifications + in this commit don't affect the control tree code yet, but rather lay + the groundwork. + - Defined wrappers for the following functions, where the wrappers + each take a direction parameter of a new enumerated type (BLIS_BWD or + BLIS_FWD), dir_t, and execute the correct underlying function. + - bli_acquire_mpart_*() and _vpart_*() + - bli_*_determine_kc_[fb]() + - bli_thread_get_range_*() and bli_thread_get_range_weighted_*() + - Consolidated all 'f' (forwards-moving) and 'b' (backwards-moving) + blocked variants for trmm and trsm, and renamed gemm and herk variants + accordingly. The direction is now queried via routines such as + bli_trmm_direct(), which determines the direction from the implied side + and uplo parameters. For gemm and herk, it is unconditionally BLIS_FWD. + - Defined wrappers to parameter-specific macrokernels for herk, trmm, and + trsm, e.g. bli_trmm_xx_ker_var2(), that execute the correct underlying + macrokernel based on the implied parameters. The same logic used to + choose the dir_t in _direct() functions is used here. + - Simplified the function pointer arrays in _int() functions given the + consolidation and dir_t querying mentioned above. + - Function signature (whitespace) reformatting for various functions. + - Removed old code in various 'old' directories. + +commit 232754feecf29452987666b9f5ebba2619bfd0b0 +Author: Field G. Van Zee +Date: Tue Jun 21 14:25:39 2016 -0500 + + Fixed compiler warning in rand[vm], randn[vm]. + + Details: + - Fixed compiler warnings about unused variables related to the disabling + of normalization in the structured cases of the rand[vm] and randn[vm] + operations. + +commit a89555d1605574f3685813dcc972b636dd61264d +Author: Field G.
Van Zee +Date: Fri Jun 17 14:08:35 2016 -0500 + + Added randn[vm] operations, support in testsuite. + + Details: + - Defined a new randomization operation, randn, on vectors and matrices. + The randnv and randnm operations randomize each element of the target + object with values from a narrow range of values. Presently, those + values are all integer powers of two, but they do not need to be powers + of two in order to achieve the primary goal, which is to initialize + objects that can be operated on with plenty of precision "slack" + available to allow computations that avoid roundoff. Using this method + of randomization makes it much more likely that testsuite residuals of + properly-functioning operations are close to zero, if not exactly zero. + - Updated existing randomization operations randv and randm to skip + special diagonal handling and normalization for matrices with structure. + This is now handled by the testsuite modules by explicitly calling a + testsuite function that loads the diagonal (and scales off-diagonal + elements). + - Added support for randnv and randnm in the testsuite with a new switch + in input.general that universally toggles between use of the classic + randv/randm, which use real values on the interval [-1,1], and + randnv/randnm, which use only values from a narrow range. Currently, + the narrow range is: +/-{2^0, 2^-1, 2^-2, 2^-3, 2^-4, 2^-5, 2^-6}, as + well as 0.0. + - Updated testsuite modules so that a testsuite wrapper function is called + instead of directly calling the randomization operations (such as + bli_randv() and bli_randm()). This wrapper also takes a bool_t that + indicates whether the object's elements should be normalized. (NOTE: As + alluded to above, in the test modules of triangular solve operations such + as trsv and trsm, we perform the extra step of loading the diagonal.) + - Defined a new level-0 operation, invertsc, which inverts a scalar.
+ - Updated the abval2ris and sqrt2ris level-0 macros to avoid an unlikely + but possible divide-by-zero. + - Updated function signature and prototype formatting in testsuite. + +commit 096895c5d538a7f8817603d7cf28c52e99340def +Author: Field G. Van Zee +Date: Mon Jun 6 13:32:04 2016 -0500 + + Reorganized code, APIs related to multithreading. + + Details: + - Reorganized code and renamed files defining APIs related to multithreading. + All code that is not specific to a particular operation is now located in a + new directory: frame/thread. Code is now organized, roughly, by the + namespace to which it belongs (see below). + - Consolidated all operation-specific *_thrinfo_t object types into a single + thrinfo_t object type. Operation-specific level-3 *_thrinfo_t APIs were + also consolidated, leaving bli_l3_thrinfo_*() and bli_packm_thrinfo_*() + functions (aside from a few general purpose bli_thrinfo_*() functions). + - Renamed thread_comm_t object type to thrcomm_t. + - Renamed many of the routines and functions (and macros) for multithreading. + We now have the following API namespaces: + - bli_thrinfo_*(): functions related to thrinfo_t objects + - bli_thrcomm_*(): functions related to thrcomm_t objects. + - bli_thread_*(): general-purpose functions, such as initialization, + finalization, and computing ranges. (For now, some macros, such as + bli_thread_[io]broadcast() and bli_thread_[io]barrier() use the + bli_thread_ namespace prefix, even though bli_thrinfo_ may be more + appropriate.) + - Renamed thread-related macros so that they use a bli_ prefix. + - Renamed control tree-related macros so that they use a bli_ prefix (to be + consistent with the thread-related macros that were also renamed). + - Removed #undef BLIS_SIMD_ALIGN_SIZE from dunnington's bli_kernel.h. This + #undef was a temporary fix to some macro defaults which were being applied + in the wrong order, which was recently fixed. 
+ +commit 232530e88ff99f37abcae5b6fb5319a9a375a45f +Merge: 4bcabd1 eef37f8 +Author: Tyler Michael Smith +Date: Wed Jun 1 15:14:10 2016 -0500 + + Merge commit 'refs/pull/81/head' of https://github.com/flame/blis + + Conflicts: + frame/base/bli_threading_pthreads.c + frame/base/bli_threading_pthreads.h + +commit 4bcabd1bf60688c38cf562459fc5e8be8b831756 +Author: Tyler Michael Smith +Date: Wed Jun 1 13:27:28 2016 -0500 + + Use spin locks instead of pthread barriers + +commit eef37f8b4d81845a6ba4bf25586d32b50c3e8a68 +Author: Jeff Hammond +Date: Sun May 29 22:28:13 2016 -0700 + + use GCC intrinsic instead of pthread_mutex for atomic increment and fetch + +commit 9dcd6f05c4c3ff2ce7cd87a9951a96ebef22681e +Author: Field G. Van Zee +Date: Tue May 24 13:15:32 2016 -0500 + + Implemented developer-configurable malloc()/free(). + + Details: + - Replaced all instances of bli_malloc() and bli_free() with one of: + - bli_malloc_pool()/bli_free_pool() + - bli_malloc_user()/bli_free_user() + - bli_malloc_intl()/bli_free_intl() + each of which can be configured to call malloc()/free() substitutes, + so long as the substitute functions have the same function type + signatures as malloc() and free() defined by C's stdlib.h. The _pool() + function is called when allocating blocks for the memory pools (used + for packing buffers, primarily), the _user() function is called when + obj_t's are created (via bli_obj_create() and friends), and the _intl() + function is called for internal use by BLIS, such as when creating + control tree nodes or temporary buffers for manipulating internal data + structures. Substitutes for any of the three types of bli_malloc() may + be specified by #defining the following pairs of cpp macros in + bli_kernel.h: + - BLIS_MALLOC_POOL/BLIS_FREE_POOL + - BLIS_MALLOC_USER/BLIS_FREE_USER + - BLIS_MALLOC_INTL/BLIS_FREE_INTL + to be the name of the substitute functions. (Obviously, the object + code that contains these functions must be provided at link-time.) 
+ These macros default to malloc() and free(). Substitute functions are + also automatically prototyped by BLIS (in bli_malloc_prototypes.h). + - Removed definitions for bli_malloc() and bli_free(). + - Note that bli_malloc_pool() and bli_malloc_user() are now defined in + terms of a new function, bli_malloc_align(), which aligns memory to an + arbitrary (power of two) alignment boundary, but does so manually, + whereas before alignment was performed behind the scenes by + posix_memalign(). Currently, bli_malloc_intl() is defined in terms + of bli_malloc_noalign(), which serves as a simple wrapper to the + designated function that is passed in (e.g. BLIS_MALLOC_INTL). + Similarly, there are bli_free_align() and bli_free_noalign(), which + are used in concert with their bli_malloc_*() counterparts. + +commit 9dd440109a9d964f5cd286e9f83c487ad703e1e4 +Author: Jeff Hammond +Date: Sat May 21 15:21:58 2016 -0700 + + fix 404 link to BuildSystem + + Google Code is dead. Long live GitHub! + +commit d309f20b7376a68efa3b864ad790c2021c071655 +Author: Field G. Van Zee +Date: Wed May 18 15:13:53 2016 -0500 + + Added alignment switch to testsuite. + + Details: + - Added a new input parameter to input.general that globally toggles + whether testsuite tests are performed on objects whose buffers and + leading dimensions have been aligned, and changed the implementation + of libblis_test_mobj_create() to employ alignment (or not) regardless + of whether row, column, or general storage is being tested. + - Updated configure script's "--help" text to indicate default behavior + for internal integer type size and BLAS/CBLAS integer type size + options. + +commit 32db0adc218ea4ae370164dbe8d23b41cd3526d3 +Author: Field G. Van Zee +Date: Tue May 17 15:20:16 2016 -0500 + + Generate prototypes for user-defined packm kernels.
+ + Details: + - Created template prototypes for packm kernels (in bli_l1m_ker.h), and + then redefined reference packm kernels' prototyping headers in terms of + this template, as is already done for level-1v, -1f, and -3 kernels. + - Automatically generate prototypes for user-defined packm kernels in + bli_kernel_prototypes.h (using the new template prototypes in + bli_l1m_ker.h). + - Defined packm kernel function types in bli_l1m_ft.h, including for + packm kernels specific to induced methods, which are now used in + bli_packm_cxk.c and friends rather than using a locally-defined + function type. + - In bli_packm_cxk.c, extended function pointer for packm kernels array + from out to index 31 (from previous maximum of 17). This allows us to + store the unrolled 30xk kernel in the array for use (on knc, for + example). Note: This should have been done a long time ago. + +commit 4bcf1b35abea3f3dfc8f2fe462dcf155cf199e55 +Author: Field G. Van Zee +Date: Wed May 11 16:09:49 2016 -0500 + + Fixed bli_get_range_*() bugs in trsm variants. + + Details: + - Fixed incorrect calls to bli_get_range_*() from within trsm blocked + variants 1f, 2b, and 2f. The bug somehow went undetected since the + big commit (537a1f4), and, strangely, did not manifest via the BLIS + testsuite. The bug finally came to our attention when running the + libflame test suite while linking to BLIS. Thanks to Kiran Varaganti + for submitting the initial report that led to this bug. + +commit 9cfa33023f123a6c17e987f72fba174ce073f0b6 +Author: Field G. Van Zee +Date: Wed May 11 16:02:30 2016 -0500 + + Minor updates to bli_f2c.h. + + Details: + - Added #undef guards to certain #define statements in bli_f2c.h, + and renamed the file guard to BLIS_F2C_H. This helps when + #including "blis.h" from an application or library that already + #includes an "f2c.h" header.
+ +commit a09a2e23eacf5328858c8318bb637c5ff3b71d08 +Merge: 4dcd37e 7c604e1 +Author: Tyler Michael Smith +Date: Wed May 11 10:47:11 2016 -0500 + + Merge pull request #76 from devinamatthews/move_simd_defs + + Move default SIMD-related definitions to bli_kernel_macro_defs.h + +commit 4dcd37eb1b12a6e08cc13df7b61391ef8363f5d8 +Author: Tyler Smith +Date: Tue May 10 16:28:59 2016 -0500 + + fixing knc simd align size + +commit 7c604e1cbc1609b6e12d3ee973c08b7af5035be4 +Author: Devin Matthews +Date: Tue May 10 12:11:55 2016 -0500 + + Move default SIMD-related definitions to bli_kernel_macro_defs.h. Otherwise, configurations which customize these fail as these are now defined in bli_kernel.h. + +commit a7be2d28e8930b154d0da1d6929b54a96e210af6 +Merge: 97b512e 4b1e55e +Author: Field G. Van Zee +Date: Tue May 10 11:48:51 2016 -0500 + + Merge pull request #74 from devinamatthews/fix_common_symbols + + Default-initialize all extern global variables to avoid generating common symbols. + +commit 4b1e55edbfe0e1cb2e7b9428424903497cb7a841 +Author: Devin Matthews +Date: Tue May 10 10:08:47 2016 -0500 + + Default-initialize all extern global variables to avoid generating common symbols. Fixes #73. + +commit 97b512ef62c7e25c97ed5e9eca81cd7015b2ac91 +Author: Field G. Van Zee +Date: Fri May 6 10:24:30 2016 -0500 + + Include headers from cblas.h to pull in f77_int. + + Details: + - Added #include statements for certain key BLIS headers so that the + definition of f77_int is pulled in when a user compiles application + code with only #include "cblas.h" (and no other BLIS header). This + is necessary since f77_int is now used within the cblas API. + +commit c3a4d39d03665135f1616588b5ef7c3e9ef5688d +Author: Field G. Van Zee +Date: Wed May 4 17:22:56 2016 -0500 + + Updates to haswell gemm micro-kernels. + + Details: + - Added two new sets of [sd]gemm micro-kernels for haswell architectures, + one that is 4x24/4x12 (s and d) and one that is 6x16/6x8. 
+ - Changed the haswell configuration to use the 6x16/6x8 micro-kernels + by default. + - Updated various Makefiles, in test, test/3m4m, and testsuite. + +commit 0b01d355ae861754ae2da6c9a545474af010f02e +Author: Field G. Van Zee +Date: Wed Apr 27 15:21:10 2016 -0500 + + Miscellaneous cleanups, fixes to recent commits. + + Details: + - Fixed a typo in bli_l1f_ref.h, introduced into bbb8569, that only + manifested when non-reference level-1f kernels were used. + - Added an #undef BLIS_SIMD_ALIGN_SIZE to bli_kernel.h of dunnington + configuration to prevent a compile-time warning until I can figure out + the proper permanent fix. + - Moved frame/1f/kernels/bli_dotxaxpyf_ref_var1.c out of the compilation + path (into 'other' directory). _ref_var2 is used by default, which is + the variant that is built on axpyf and dotxf instead of dotaxpyv. + - Removed section of frame/include/bli_config_macro_defs.h pertaining to + mixed datatype support. + +commit ed7326c836f427e2f8420b015220ce293207b10c +Author: Field G. Van Zee +Date: Wed Apr 27 14:57:40 2016 -0500 + + Added 'restrict' to l1v/l1f code in 'kernels' dir. + + Details: + - Added 'restrict' keyword to existing kernel definitions in 'kernels' + directory. These changes were meant for inclusion in bbb8569. + +commit bbb8569b2a08c3bcd631d5a05eb389d01d94ac07 +Author: Field G. Van Zee +Date: Wed Apr 27 14:13:46 2016 -0500 + + Use 'restrict' in all kernel APIs; wspace changes. + + Details: + - Updated level-1v, level-1f kernel function types (bli_l1?_ft.h) and + generic kernel prototypes (bli_l1?_ker.h) to use 'restrict' for all + numerical operand pointers (ie: all pointers except the cntx_t). + - Updated level-1f reference kernel definitions to use 'restrict' for + all numerical operand pointers. (Level-1v reference kernel definitions + were already updated in bdbda6e.) 
+ - Rewrote the level-1v and level-1f reference kernel prototypes in + bli_l1v_ref.h and bli_l1f_ref.h, respectively, to simply #include + bli_l1v_ker.h and bli_l1f_ker.h with redefined function base names + (as was already being done for the level-3 micro-kernel prototypes + in bli_l3_ref.h), rather than duplicate the signatures from the + _ker.h files. + - Added definitions to frame/include/bli_kernel_prototypes.h for axpbyv + and xpbyv, which were probably meant for inclusion in bdbda6e. + - Converted a number of instances of four spaces, as introduced in + bdbda6e, to tabs. + +commit 4ea419c72c789825e1f93a1eee88219bbf873930 +Merge: f1e9be2 bdbda6e +Author: Field G. Van Zee +Date: Tue Apr 26 12:50:45 2016 -0500 + + Merge pull request #70 from devinamatthews/daxpby + + Give the level1v operations some love + +commit bdbda6e6acc682ab1b6ca680edebd09ae12a832c +Author: Devin Matthews +Date: Mon Apr 25 11:05:57 2016 -0500 + + Give the level1v operations some love: + + - Add missing axpby and xpby operations (plus test cases). + - Add special case for scal2v with alpha=1. + - Add restrict qualifiers. + - Add special-case algorithms for incx=incy=1. + +commit f1e9be2aba1a057eedb947bbae96848597777408 +Author: Field G. Van Zee +Date: Fri Apr 22 15:34:02 2016 -0500 + + Minor tweak to test/Makefile. + + Details: + - Just committing a minor change to test/Makefile that has been lingering + in my local working copy for longer than I can remember. + +commit aa0bceec277938328dabeb744680623f24fb0b61 +Merge: 4136553 e2784b4 +Author: Field G. Van Zee +Date: Fri Apr 22 12:01:31 2016 -0500 + + Merge branch 'master' of github.com:flame/blis + +commit 4136553f0d0661a668dfdb9edcd7ce1c5773dde7 +Author: Field G. Van Zee +Date: Fri Apr 22 11:53:53 2016 -0500 + + Clear level-3 cntx_t's via memset() before use. 
+ + Details: + - In all level-3 operations' _cntx_init() functions, replaced calls to + bli_cntx_obj_init() with calls to bli_cntx_obj_clear(), and in all + level-3 operations' _cntx_finalize() functions, removed calls to + bli_cntx_obj_finalize(), leaving those function definitions empty. + - Changed the definition of bli_cntx_obj_clear() so that the clearing + occurs via a single call to memset(). + +commit e2784b4c921f706e756df3e146e20a4cb63f53e3 +Merge: dd0ab1d a9b6c3a +Author: Field G. Van Zee +Date: Wed Apr 20 18:34:09 2016 -0500 + + Merge pull request #67 from devinamatthews/cblas-f77-int + + Change CBLAS integer type to f77_int + +commit a9b6c3abda6222a8b240361643932e83cf726c4f +Merge: e4c54c8 dd0ab1d +Author: Devin Matthews +Date: Wed Apr 20 16:00:10 2016 -0500 + + Merge remote-tracking branch 'origin/master' into cblas-f77-int + + # Conflicts: + # config/haswell/bli_config.h + +commit e4c54c81463c2a19c9bb6b1f0f1be3fa9d018a45 +Author: Devin Matthews +Date: Wed Apr 20 15:56:46 2016 -0500 + + Change integer type in CBLAS function signatures to f77_int, and add proper const-correctness to BLAS layer. + +commit dd0ab1d93f33abca6af9edd7b8e52da62dcfa5b1 +Author: Field G. Van Zee +Date: Wed Apr 20 14:38:23 2016 -0500 + + Converted some bli_cntx query functions to macros. + + Details: + - Commented out several datatype-aware query functions (those ending in + _dt) from bli_cntx.c, as well as their prototypes in bli_cntx.h, and + added equivalent cpp query macros to bli_cntx.h. + - Added 'bli_config.h' to .gitignore. + +commit a30ccbc4c6a6e6460e78af6b5c530ee0d06f98fb +Merge: eb2f18e 0e1a982 +Author: Field G. Van Zee +Date: Tue Apr 19 15:04:33 2016 -0500 + + Merge pull request #66 from devinamatthews/blas-configure + + Add configure options and generate bli_config.h automatically. + +commit eb2f18e4844d985715df20798f50f9cc12e3b5ad +Author: Field G. Van Zee +Date: Tue Apr 19 12:50:32 2016 -0500 + + More compile-time fixes to bgq gemm ukernel code. 
+ +commit 0e1a9821d860f6c1d818baf4c48d21a23726c132 +Author: Devin Matthews +Date: Tue Apr 19 11:44:37 2016 -0500 + + Add configure options and generate bli_config.h automatically. + + Options to configure have been added for: + - Setting the internal BLIS and BLAS/CBLAS integer sizes. + - Enabling and disabling the BLAS and CBLAS layers. + + Additionally, configure options which require defining macros (the above plus the threading model), write their macros to the automatically-generated bli_config.h file in the top-level build directory. The old bli_config.h files in the config dirs were removed, and any kernel-related macros (SIMD size and alignment etc.) were moved to bli_kernel.h. The Makefiles were also modified to find the new bli_config.h file. + + Lastly, support for OMP in clang has been added (closes #56). + +commit ff84469a4575f1ef8a0010046fde52240a312cae +Author: Field G. Van Zee +Date: Mon Apr 18 12:29:09 2016 -0500 + + Applied various compilation fixes to bgq kernels. + +commit cbcd0b739dc54bd14fbb46aeda267c26725cd70f +Author: Tyler Michael Smith +Date: Mon Apr 18 03:12:57 2016 -0500 + + Changing ifdef for OSX pthread barriers + +commit dd62080cea78f3a23616200d6640e52c102b2bb9 +Author: Field G. Van Zee +Date: Fri Apr 15 11:15:41 2016 -0500 + + Compile-time fix to bgq l1f kernels. + + Details: + - Fixed an old reference to bli_daxpyf_fusefac, which no longer exists, + by replacing it with the axpyf fusing factor (8), and cleaned up the + relevant section of config/bgq/bli_kernel.h. + - Removed most of the details of the level-3 kernels from the template + kernel code in config/template/kernels/3 and replaced it with a + reference to the relevant kernel wiki maintained on the BLIS github + website. + +commit d5a915dd8d7a6ead42a68772e4420eb3647e6f1a +Merge: 4320b72 4169467 +Author: Field G. Van Zee +Date: Thu Apr 14 12:56:36 2016 -0500 + + Merge branch 'master' of github.com:flame/blis + +commit 4320b725a1f8fd34101470b6cf52ad504a79c517 +Author: Field G. 
Van Zee +Date: Thu Apr 14 12:51:29 2016 -0500 + + Use kernel CFLAGS on "ukernels" directories. + + Details: + - Updated the top-level Makefile so that the CFLAGS variable designated + for kernel source code is applied not only to source code in + directories named "kernels" but source code in any directory that + contains the substring "kernels", such as "ukernels". + - Formally disabled some code in gen-make-frag.sh script that was already + effectively disabled. The code was related to handling "noopt" and + "kernel" directories, which is now handled independently within the + top-level Makefile without needing to place these source files into + a separate makefile variable. + +commit 41694675e4cb56e2e0323c7a7db48e0819606a31 +Author: Tyler Smith +Date: Wed Apr 13 15:51:08 2016 -0500 + + pthreads bugfixes + + Getting pthreads to work on my Mac + Implemented a pthread barrier when _POSIX_BARRIER isn't defined + Now spawn n-1 threads instead of n threads so that master thread isn't just spinning the whole time + Add -lpthread instead of -pthread to LDFLAGS (for clang) + +commit f756dbfa0d542cbc497724981520c83abf049c4b +Author: Field G. Van Zee +Date: Wed Apr 13 11:25:33 2016 -0500 + + Removed stale #include from bgq configuration. + + Details: + - Removed an old #include statement ("bli_gemm_8x8.h") from the + bli_kernel.h file in the bgq configuration. It turns out this + file was no longer needed even prior to 537a1f4. + +commit 0bd4169ea75f690714e7d2912229932a75d8a7e2 +Author: Field G. Van Zee +Date: Mon Apr 11 18:08:32 2016 -0500 + + Fixed context-broken dunnington/penryn kernels. + + Details: + - Added missing context parameters to several instances where simpler + kernels, or reference kernels, are called instead of executing the + main body code contained in the kernel function in question. + - Renamed axpyv and dotv kernel files to use "opt" instead of "int" + substring, for consistency with level-1f kernels.
+ +commit 7912af5db45b7372d19a9a3dfeb82df302a05628 +Author: Field G. Van Zee +Date: Mon Apr 11 17:32:13 2016 -0500 + + CHANGELOG update (0.2.0) + +commit 898614a555ea0aa7de4ca07bb3cb8f5708b6a002 (tag: 0.2.0) Author: Field G. Van Zee Date: Mon Apr 11 17:32:09 2016 -0500 @@ -132,7 +1182,7 @@ Date: Mon Apr 11 17:21:28 2016 -0500 that this does not preclude supporting mixed types via the object APIs, where it produces absolutely zero API code bloat. -commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 (origin/master) +commit d1f8e5d9b2ecd054ed103f4d642d748db2d4f173 Merge: 20af937 c11d28e Author: Field G. Van Zee Date: Tue Apr 5 12:21:27 2016 -0500 @@ -2384,8 +3434,8 @@ Date: Wed Aug 20 14:44:51 2014 -0500 Merge branch 'master' of http://github.com/flame/blis Conflicts: - frame/3/trsm/bli_trsm_blk_var2b.c - frame/3/trsm/bli_trsm_blk_var2f.c + frame/3/trsm/bli_trsm_blk_var2b.c + frame/3/trsm/bli_trsm_blk_var2f.c commit 699a8151ca3d5021e834a1784ef45dcc3a3d17cd Author: Tyler Smith @@ -3492,8 +4542,8 @@ Date: Fri Apr 4 09:54:54 2014 -0500 Merge http://github.com/flame/blis Conflicts: - kernels/bgq/1/bli_axpyv_opt_var1.c - kernels/bgq/1/bli_dotv_opt_var1.c + kernels/bgq/1/bli_axpyv_opt_var1.c + kernels/bgq/1/bli_dotv_opt_var1.c commit 4e3eb39aca4df0b9fdc003d468f368a2f2ba597d Author: Tyler Michael Smith @@ -3793,7 +4843,7 @@ Date: Thu Feb 27 16:46:23 2014 -0600 Merge https://github.com/flame/blis Conflicts: - frame/1m/packm/bli_packm_blk_var1.c + frame/1m/packm/bli_packm_blk_var1.c commit e8757b03a74f9891632242e9a90efb32150826f5 Author: Field G. Van Zee diff --git a/Makefile b/Makefile index 5ac386fec..1a4868eaa 100644 --- a/Makefile +++ b/Makefile @@ -312,6 +312,19 @@ MK_BLIS_CONFIG_OBJS += $(patsubst $(CONFIG_PATH)/%.c, $(BASE_OBJ_CONFIG_PATH)/% MK_ALL_BLIS_OBJS := $(MK_BLIS_CONFIG_OBJS) \ $(MK_BLIS_FRAME_OBJS) +# Optionally filter out the BLAS and CBLAS compatibility layer object files. 
+# This is not actually necessary, since each affected file is guarded by C +# preprocessor macros, but it prevents "empty" object files from being +# added into the library (and reduces compilation time). +BASE_OBJ_BLAS_PATH := $(BASE_OBJ_FRAME_PATH)/compat +BASE_OBJ_CBLAS_PATH := $(BASE_OBJ_FRAME_PATH)/compat/cblas +ifeq ($(BLIS_ENABLE_CBLAS),no) +MK_ALL_BLIS_OBJS := $(filter-out $(BASE_OBJ_CBLAS_PATH)/%.o, $(MK_ALL_BLIS_OBJS) ) +endif +ifeq ($(BLIS_ENABLE_BLAS2BLIS),no) +MK_ALL_BLIS_OBJS := $(filter-out $(BASE_OBJ_BLAS_PATH)/%.o, $(MK_ALL_BLIS_OBJS) ) +endif + # diff --git a/README.md b/README.md index 7142a1329..9bfa84285 100644 --- a/README.md +++ b/README.md @@ -6,16 +6,17 @@ Introduction ------------ BLIS is a portable software framework for instantiating high-performance -BLAS-like dense linear algebra libraries. The framework was designed to -isolate essential kernels of computation that, when optimized, immediately -enable optimized implementations of most of its commonly used and -computationally intensive operations. BLIS is written in [ISO +BLAS-like dense linear algebra libraries. The framework was designed to isolate +essential kernels of computation that, when optimized, immediately enable +optimized implementations of most of its commonly used and computationally +intensive operations. BLIS is written in [ISO C99](http://en.wikipedia.org/wiki/C99) and available under a [new/modified/3-clause BSD license](http://opensource.org/licenses/BSD-3-Clause). While BLIS exports a -[new BLAS-like API](), it also includes a BLAS compatibility layer which gives -application developers access to BLIS implementations via traditional [BLAS -routine calls](http://www.netlib.org/lapack/lug/node145.html).
+[new BLAS-like API](https://github.com/flame/blis/wiki/BLISAPIQuickReference), +it also includes a BLAS compatibility layer which gives application developers +access to BLIS implementations via traditional [BLAS routine +calls](http://www.netlib.org/lapack/lug/node145.html). For a thorough presentation of our framework, please read our recently accepted journal article, ["BLIS: A Framework for Rapidly Instantiating BLAS @@ -262,11 +263,17 @@ A fourth paper, submitted to ACM TOMS, also exists, which proposes an ``` @article{BLIS4, - author = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and Enrique S. Quintana-Ort\'{\i}}, - title = {Analytical Models for the {BLIS} Framework}, + author = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and + Enrique S. Quintana-Ort\'{\i}}, + title = {Analytical Modeling Is Enough for High-Performance {BLIS}}, journal = {ACM Transactions on Mathematical Software}, - year = 2016, - note = {Accepted}, + volume = {43}, + number = {2}, + pages = {12:1--12:18}, + month = aug, + year = {2016}, + issue_date = {August 2016}, + url = {http://doi.acm.org/10.1145/2925987}, } ``` diff --git a/build/config.mk.in b/build/config.mk.in index 8bdb427a0..9d92f7fb4 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -64,5 +64,9 @@ BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := @enable_verbose@ BLIS_ENABLE_STATIC_BUILD := @enable_static@ BLIS_ENABLE_DYNAMIC_BUILD := @enable_dynamic@ +# The status of BLAS and CBLAS compatibility layers +BLIS_ENABLE_BLAS2BLIS := @enable_blas2blis@ +BLIS_ENABLE_CBLAS := @enable_cblas@ + # end of ifndef CONFIG_MK_INCLUDED conditional block endif diff --git a/common.mk b/common.mk index 458cdcc03..683d0b0e9 100644 --- a/common.mk +++ b/common.mk @@ -153,9 +153,9 @@ endif ifeq ($(CC_VENDOR),gcc) ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := omp +THREADING_MODEL := openmp endif -ifeq ($(THREADING_MODEL),omp) +ifeq ($(THREADING_MODEL),openmp) CTHREADFLAGS := -fopenmp LDFLAGS += -fopenmp endif @@ -167,9 
+167,9 @@ endif ifeq ($(CC_VENDOR),icc) ifeq ($(THREADING_MODEL),auto) -THREADING_MODEL := omp +THREADING_MODEL := openmp endif -ifeq ($(THREADING_MODEL),omp) +ifeq ($(THREADING_MODEL),openmp) CTHREADFLAGS := -fopenmp LDFLAGS += -fopenmp endif @@ -183,7 +183,7 @@ ifeq ($(CC_VENDOR),clang) ifeq ($(THREADING_MODEL),auto) THREADING_MODEL := pthreads endif -ifeq ($(THREADING_MODEL),omp) +ifeq ($(THREADING_MODEL),openmp) CTHREADFLAGS := -fopenmp LDFLAGS += -fopenmp endif diff --git a/config/haswell/bli_kernel.h b/config/haswell/bli_kernel.h index 24a3c68b1..d23a00a5d 100644 --- a/config/haswell/bli_kernel.h +++ b/config/haswell/bli_kernel.h @@ -49,26 +49,27 @@ // (b) MR (for zero-padding purposes when MR and NR are "swapped") // -#if 0 +// sgemm micro-kernel +#if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_24x4 #define BLIS_DEFAULT_MC_S 264 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 24 #define BLIS_DEFAULT_NR_S 4 +#endif -#else - -/* +#if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 -*/ +#endif +#if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 @@ -77,29 +78,29 @@ #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS - #endif -#if 0 +// dgemm micro-kernel +#if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_12x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 192 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 12 #define BLIS_DEFAULT_NR_D 4 +#endif -#else - -/* +#if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 -*/ +#endif +#if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 @@ -108,10 
+109,33 @@ #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS - - #endif +// cgemm micro-kernel + +#if 1 +#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 +#define BLIS_DEFAULT_MC_C 144 +#define BLIS_DEFAULT_KC_C 256 +#define BLIS_DEFAULT_NC_C 4080 +#define BLIS_DEFAULT_MR_C 3 +#define BLIS_DEFAULT_NR_C 8 + +#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif + +// zgemm micro-kernel + +#if 1 +#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 +#define BLIS_DEFAULT_MC_Z 72 +#define BLIS_DEFAULT_KC_Z 256 +#define BLIS_DEFAULT_NC_Z 4080 +#define BLIS_DEFAULT_MR_Z 3 +#define BLIS_DEFAULT_NR_Z 4 + +#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#endif diff --git a/configure b/configure index 8af3bde66..3a1e296a7 100755 --- a/configure +++ b/configure @@ -91,7 +91,7 @@ print_usage() echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" echo " " echo " Enable threading in the library, using threading model" - echo " MODEL={omp,pthreads,no}. If MODEL=no or " + echo " MODEL={openmp,pthreads,no}. If MODEL=no or " echo " --disable-threading is specified, threading will be" echo " disabled. The default is 'no'." echo " " @@ -424,6 +424,14 @@ main() echo "${script_name}: manual configuration requested." config_name=$1 + + # Ensure configuration is valid. + if [ ! -d "${config_dirpath}/${config_name}" ]; then + echo "${script_name}: " + echo "${script_name}: *** configuration '${config_name}' does not exist. ***" + echo "${script_name}: " + exit 1; + fi fi echo "${script_name}: configuring with '${config_name}' configuration sub-directory." @@ -486,17 +494,26 @@ main() # Check the threading model flag. - enable_openmp=0 - enable_pthreads=0 + # NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred. + enable_openmp='no' + enable_openmp_01=0 + enable_pthreads='no' + enable_pthreads_01=0 if [ "x${threading_model}" = "xauto" ]; then echo "${script_name}: determining the threading model automatically." 
- elif [ "x${threading_model}" = "xomp" ]; then + elif [ "x${threading_model}" = "xopenmp" ] || + [ "x${threading_model}" = "xomp" ]; then echo "${script_name}: using OpenMP for threading." - enable_openmp=1 - elif [ "x${threading_model}" = "xpthreads" ]; then + enable_openmp='yes' + enable_openmp_01=1 + elif [ "x${threading_model}" = "xpthreads" ] || + [ "x${threading_model}" = "xpthread" ] || + [ "x${threading_model}" = "xposix" ]; then echo "${script_name}: using Pthreads for threading." - enable_pthreads=1 - elif [ "x${threading_model}" = "xno" ]; then + enable_pthreads='yes' + enable_pthreads_01=1 + elif [ "x${threading_model}" = "xno" ] || + [ "x${threading_model}" = "xnone" ]; then echo "${script_name}: threading is disabled." else echo "Unsupported threading model: ${threading_model}." @@ -507,19 +524,19 @@ main() # Convert 'yes' and 'no' flags to booleans. if [ "x${enable_cblas}" = "xyes" ]; then echo "${script_name}: the CBLAS compatibility layer is enabled." - enable_cblas=1 + enable_cblas_01=1 # Force BLAS layer when CBLAS is enabled - enable_blas='yes' + enable_blas2blis='yes' else echo "${script_name}: the CBLAS compatibility layer is disabled." - enable_cblas=0 + enable_cblas_01=0 fi if [ "x${enable_blas2blis}" = "xyes" ]; then echo "${script_name}: the BLAS compatibility layer is enabled." - enable_blas2blis=1 + enable_blas2blis_01=1 else echo "${script_name}: the BLAS compatibility layer is disabled." - enable_blas2blis=0 + enable_blas2blis_01=0 fi @@ -561,6 +578,8 @@ main() | sed "s/@enable_static@/${enable_static}/g" \ | sed "s/@enable_dynamic@/${enable_shared}/g" \ | sed "s/@threading_model@/${threading_model}/g" \ + | sed "s/@enable_blas2blis@/${enable_blas2blis}/g" \ + | sed "s/@enable_cblas@/${enable_cblas}/g" \ > "${config_mk_out_path}" @@ -568,12 +587,12 @@ main() # to bli_config_h_out. 
echo "${script_name}: creating ${bli_config_h_out_path} from ${bli_config_h_in_path}" cat "${bli_config_h_in_path}" \ - | sed "s/@enable_openmp@/${enable_openmp}/g" \ - | sed "s/@enable_pthreads@/${enable_pthreads}/g" \ + | sed "s/@enable_openmp@/${enable_openmp_01}/g" \ + | sed "s/@enable_pthreads@/${enable_pthreads_01}/g" \ | sed "s/@int_type_size@/${int_type_size}/g" \ | sed "s/@blas2blis_int_type_size@/${blas2blis_int_type_size}/g" \ - | sed "s/@enable_blas2blis@/${enable_blas2blis}/g" \ - | sed "s/@enable_cblas@/${enable_cblas}/g" \ + | sed "s/@enable_blas2blis@/${enable_blas2blis_01}/g" \ + | sed "s/@enable_cblas@/${enable_cblas_01}/g" \ > "${bli_config_h_out_path}" diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c index da47a6fd5..fc1c4c71a 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -99,8 +99,8 @@ void bli_getsc_check // Check object datatypes. - e_val = bli_check_noninteger_object( chi ); - bli_check_error_code( e_val ); + //e_val = bli_check_noninteger_object( chi ); + //bli_check_error_code( e_val ); // Check object dimensions. @@ -125,8 +125,8 @@ void bli_setsc_check // Check object datatypes. - e_val = bli_check_floating_object( chi ); - bli_check_error_code( e_val ); + //e_val = bli_check_floating_object( chi ); + //bli_check_error_code( e_val ); // Check object dimensions. diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index d20f8ea45..3858e05b7 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -198,8 +198,8 @@ void PASTEMAC0(opname) \ if ( bli_is_constant( dt_chi ) ) dt_use = dt_def; \ else dt_use = dt_chi; \ \ - /* Invoke the typed function. */ \ - bli_call_ft_3 \ + /* Invoke the typed function (with integer support). */ \ + bli_call_ft_3i \ ( \ dt_use, \ opname, \ @@ -229,8 +229,8 @@ void PASTEMAC0(opname) \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \ \ - /* Invoke the typed function. 
*/ \ - bli_call_ft_3 \ + /* Invoke the typed function (with integer support). */ \ + bli_call_ft_3i \ ( \ dt_chi, \ opname, \ diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index 53f5be271..028a12cbd 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -227,3 +227,25 @@ void PASTEMAC(ch,opname) \ INSERT_GENTFUNCR_BASIC0( zipsc ) +// ----------------------------------------------------------------------------- + +void bli_igetsc + ( + dim_t* chi, + double* zeta_r, + double* zeta_i + ) +{ + PASTEMAC2(i,d,gets)( *chi, *zeta_r, *zeta_i ); +} + +void bli_isetsc + ( + double zeta_r, + double zeta_i, + dim_t* chi + ) +{ + PASTEMAC2(d,i,sets)( zeta_r, zeta_i, *chi ); +} + diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index 678e27292..36b282824 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -141,3 +141,19 @@ void PASTEMAC(ch,opname) \ INSERT_GENTPROTR_BASIC( zipsc ) +// ----------------------------------------------------------------------------- + +void bli_igetsc + ( + dim_t* chi, + double* zeta_r, + double* zeta_i + ); + +void bli_isetsc + ( + double zeta_r, + double zeta_i, + dim_t* chi + ); + diff --git a/frame/1/bli_l1v.h b/frame/1/bli_l1v.h index f557118f0..bd4879247 100644 --- a/frame/1/bli_l1v.h +++ b/frame/1/bli_l1v.h @@ -46,12 +46,14 @@ #include "bli_l1v_tapi.h" // Pack-related -#include "bli_packv.h" -#include "bli_unpackv.h" +// NOTE: packv and unpackv are temporarily disabled. +//#include "bli_packv.h" +//#include "bli_unpackv.h" // Other -#include "bli_scalv_cntl.h" -#include "bli_scalv_int.h" +// NOTE: scalv control tree code is temporarily disabled. 
+//#include "bli_scalv_cntl.h" +//#include "bli_scalv_int.h" // Reference kernel headers #include "bli_l1v_ref.h" diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c index b998a65fb..54c856b45 100644 --- a/frame/1/bli_l1v_check.c +++ b/frame/1/bli_l1v_check.c @@ -56,6 +56,21 @@ GENFRONT( subv ) GENFRONT( swapv ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* index \ + ) \ +{ \ + bli_l1v_xi_check( x, index ); \ +} + +GENFRONT( amaxv ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ @@ -481,3 +496,39 @@ void bli_l1v_ax_check bli_check_error_code( e_val ); } +void bli_l1v_xi_check + ( + obj_t* x, + obj_t* index + ) +{ + err_t e_val; + + // Check object datatypes. + + e_val = bli_check_floating_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_integer_object( index ); + bli_check_error_code( e_val ); + + e_val = bli_check_nonconstant_object( index ); + bli_check_error_code( e_val ); + + // Check object dimensions. + + e_val = bli_check_vector_object( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_scalar_object( index ); + bli_check_error_code( e_val ); + + // Check object buffers (for non-NULLness). 
+ + e_val = bli_check_object_buffer( x ); + bli_check_error_code( e_val ); + + e_val = bli_check_object_buffer( index ); + bli_check_error_code( e_val ); +} + diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h index d4a1e9ff9..ddfe6a050 100644 --- a/frame/1/bli_l1v_check.h +++ b/frame/1/bli_l1v_check.h @@ -44,7 +44,7 @@ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ - ); + ); GENTPROT( addv ) GENTPROT( copyv ) @@ -52,6 +52,18 @@ GENTPROT( subv ) GENTPROT( swapv ) +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,_check) \ + ( \ + obj_t* x, \ + obj_t* index \ + ); + +GENTPROT( amaxv ) + + #undef GENTPROT #define GENTPROT( opname ) \ \ @@ -74,7 +86,7 @@ void PASTEMAC(opname,_check) \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ - ); + ); GENTPROT( axpyv ) GENTPROT( scal2v ) @@ -88,7 +100,7 @@ void PASTEMAC(opname,_check) \ obj_t* x, \ obj_t* y, \ obj_t* rho \ - ); + ); GENTPROT( dotv ) @@ -103,7 +115,7 @@ void PASTEMAC(opname,_check) \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ - ); + ); GENTPROT( dotxv ) @@ -114,7 +126,7 @@ GENTPROT( dotxv ) void PASTEMAC(opname,_check) \ ( \ obj_t* x \ - ); + ); GENTPROT( invertv ) @@ -126,7 +138,7 @@ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ - ); + ); GENTPROT( scalv ) GENTPROT( setv ) @@ -196,3 +208,9 @@ void bli_l1v_ax_check obj_t* x ); +void bli_l1v_xi_check + ( + obj_t* x, + obj_t* index + ); + diff --git a/frame/1/bli_l1v_cntx.c b/frame/1/bli_l1v_cntx.c index a1bba0354..bdbb0063f 100644 --- a/frame/1/bli_l1v_cntx.c +++ b/frame/1/bli_l1v_cntx.c @@ -55,6 +55,7 @@ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ) \ } GENFRONT( addv, BLIS_ADDV_KER ) +GENFRONT( amaxv, BLIS_AMAXV_KER ) GENFRONT( copyv, BLIS_COPYV_KER ) GENFRONT( dotv, BLIS_DOTV_KER ) GENFRONT( dotxv, BLIS_DOTXV_KER ) diff --git a/frame/1/bli_l1v_cntx.h b/frame/1/bli_l1v_cntx.h index a8c16d342..95cd4a131 100644 --- a/frame/1/bli_l1v_cntx.h +++ b/frame/1/bli_l1v_cntx.h @@ -44,6 +44,7 @@ void 
PASTEMAC(opname,_cntx_init)( cntx_t* cntx ); \ void PASTEMAC(opname,_cntx_finalize)( cntx_t* cntx ); GENPROT( addv ) +GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( copyv ) diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h index c4e206df7..b2b80e016 100644 --- a/frame/1/bli_l1v_ft.h +++ b/frame/1/bli_l1v_ft.h @@ -58,6 +58,21 @@ INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) +// amaxv + +#undef GENTDEF +#define GENTDEF( ctype, ch, opname, tsuf ) \ +\ +typedef void (*PASTECH2(ch,opname,tsuf)) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); + +INSERT_GENTDEF( amaxv ) + // axpbyv #undef GENTDEF diff --git a/frame/1/bli_l1v_ker.h b/frame/1/bli_l1v_ker.h index cf80eda46..8039905b7 100644 --- a/frame/1/bli_l1v_ker.h +++ b/frame/1/bli_l1v_ker.h @@ -54,6 +54,20 @@ INSERT_GENTPROT_BASIC( copyv_ker_name ) INSERT_GENTPROT_BASIC( subv_ker_name ) +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); \ + +INSERT_GENTPROT_BASIC( amaxv_ker_name ) + + #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c index cebc3bfb5..67525d68c 100644 --- a/frame/1/bli_l1v_oapi.c +++ b/frame/1/bli_l1v_oapi.c @@ -82,6 +82,44 @@ GENFRONT( copyv ) GENFRONT( subv ) +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* index \ + BLIS_OAPI_CNTX_PARAM \ + ) \ +{ \ + BLIS_OAPI_CNTX_DECL \ +\ + num_t dt = bli_obj_datatype( *x ); \ +\ + dim_t n = bli_obj_vector_dim( *x ); \ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_index = bli_obj_buffer_at_off( *index ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + PASTEMAC(opname,_check)( x, index ); \ +\ + /* Invoke the typed function. 
*/ \ + bli_call_ft_5 \ + ( \ + dt, \ + opname, \ + n, \ + buf_x, incx, \ + buf_index, \ + cntx \ + ); \ +} + +GENFRONT( amaxv ) + + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h index ff277421c..1c7e534da 100644 --- a/frame/1/bli_l1v_oapi.h +++ b/frame/1/bli_l1v_oapi.h @@ -52,6 +52,19 @@ GENTPROT( copyv ) GENTPROT( subv ) +#undef GENTPROT +#define GENTPROT( opname ) \ +\ +void PASTEMAC(opname,EX_SUF) \ + ( \ + obj_t* x, \ + obj_t* index \ + BLIS_OAPI_CNTX_PARAM \ + ); + +GENTPROT( amaxv ) + + #undef GENTPROT #define GENTPROT( opname ) \ \ diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 4cf6be24e..74a548eea 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -74,6 +74,38 @@ INSERT_GENTFUNC_BASIC( copyv, BLIS_COPYV_KER ) INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER ) +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, kerid ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* index, \ + cntx_t* cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + cntx_t* cntx_p; \ +\ + bli_cntx_init_local_if( opname, cntx, cntx_p ); \ +\ + PASTECH2(ch,opname,_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx_p ); \ +\ + f \ + ( \ + n, \ + x, incx, \ + index, \ + cntx_p \ + ); \ +\ + bli_cntx_finalize_local_if( opname, cntx ); \ +} + +INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER ) + + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h index b4b36b059..86cdf416d 100644 --- a/frame/1/bli_l1v_tapi.h +++ b/frame/1/bli_l1v_tapi.h @@ -40,6 +40,9 @@ #undef addv_ker_name #define addv_ker_name addv +#undef amaxv_ker_name +#define amaxv_ker_name amaxv + #undef axpbyv_ker_name #define axpbyv_ker_name axpbyv diff --git a/frame/1/kernels/bli_amaxv_ref.c b/frame/1/kernels/bli_amaxv_ref.c new file mode 100644 index 000000000..f207b799f --- /dev/null +++ b/frame/1/kernels/bli_amaxv_ref.c 
@@ -0,0 +1,134 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// +// Define BLAS-like interfaces with typed operands. 
+// + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* i_max, \ + cntx_t* cntx \ + ) \ +{ \ + ctype_r* minus_one = PASTEMAC(chr,m1); \ + dim_t* zero_i = PASTEMAC(i,0); \ +\ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ +\ + /* Initialize the index of the maximum absolute value to zero. */ \ + PASTEMAC(i,copys)( *zero_i, *i_max ); \ +\ + /* If the vector length is zero, return early. This directly emulates + the behavior of netlib BLAS's i?amax() routines. */ \ + if ( bli_zero_dim1( n ) ) return; \ +\ + /* Initialize the maximum absolute value search candidate with + -1, which is guaranteed to be less than all values we will + compute. */ \ + PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ +\ + if ( incx == 1 ) \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + /* Get the real and imaginary components of chi1. */ \ + PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was larger than any previously seen, and keep the + first such NaN so that later NaNs do not move the index. */ \ + if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ + else \ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + ctype* chi1 = x + (i )*incx; \ +\ + /* Get the real and imaginary components of chi1.
*/ \ + PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was larger than any previously seen, and keep the + first such NaN so that later NaNs do not move the index. */ \ + if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCR_BASIC0( amaxv_ref ) + diff --git a/frame/1/packv/bli_packv.c b/frame/1/other/packv/bli_packv.c similarity index 100% rename from frame/1/packv/bli_packv.c rename to frame/1/other/packv/bli_packv.c diff --git a/frame/1/packv/bli_packv.h b/frame/1/other/packv/bli_packv.h similarity index 100% rename from frame/1/packv/bli_packv.h rename to frame/1/other/packv/bli_packv.h diff --git a/frame/1/packv/bli_packv_check.c b/frame/1/other/packv/bli_packv_check.c similarity index 100% rename from frame/1/packv/bli_packv_check.c rename to frame/1/other/packv/bli_packv_check.c diff --git a/frame/1/packv/bli_packv_check.h b/frame/1/other/packv/bli_packv_check.h similarity index 100% rename from frame/1/packv/bli_packv_check.h rename to frame/1/other/packv/bli_packv_check.h diff --git a/frame/1/packv/bli_packv_cntl.c b/frame/1/other/packv/bli_packv_cntl.c similarity index 75% rename from frame/1/packv/bli_packv_cntl.c rename to frame/1/other/packv/bli_packv_cntl.c index ac068ce71..13f90a429 100644 --- a/frame/1/packv/bli_packv_cntl.c +++ b/frame/1/other/packv/bli_packv_cntl.c @@ -34,6 +34,7 @@ #include 
"blis.h" +#if 0 packv_t* packv_cntl = NULL;
void bli_packv_cntl_init( void ) @@ -77,4 +78,41 @@ void bli_packv_cntl_obj_init( packv_t* cntl, cntl->bmid = bmid; cntl->pack_schema = pack_schema; } +#endif + +cntl_t* bli_packv_cntl_obj_create + ( + void* var_func, + void* packv_var_func, + bszid_t bmid, + pack_t pack_schema, + cntl_t* sub_node + ) +{ + cntl_t* cntl; + packv_params_t* params; + + // Allocate a packv_params_t struct. + params = bli_malloc_intl( sizeof( packv_params_t ) ); + + // Initialize the packv_params_t struct. + params->size = sizeof( packv_params_t ); + params->packv_var_func = packv_var_func; + params->bmid = bmid; + params->pack_schema = pack_schema; + + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); + + return cntl; +} diff --git a/frame/1/other/packv/bli_packv_cntl.h b/frame/1/other/packv/bli_packv_cntl.h new file mode 100644 index 000000000..1fc265338 --- /dev/null +++ b/frame/1/other/packv/bli_packv_cntl.h @@ -0,0 +1,67 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +struct packv_params_s +{ + uint64_t size; + packv_voft* packv_var_func; + bszid_t bmid; + pack_t pack_schema; +}; +typedef struct packv_params_s packv_params_t; + + +#define bli_cntl_packv_params_var_func( cntl ) \ +\ + ( ( (packv_params_t*)( cntl )->params )->packv_var_func ) + +#define bli_cntl_packv_params_bmid( cntl ) \ +\ + ( ( (packv_params_t*)( cntl )->params )->bmid ) + +#define bli_cntl_packv_params_pack_schema( cntl ) \ +\ + ( ( (packv_params_t*)( cntl )->params )->pack_schema ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_packv_cntl_obj_create + ( + void* var_func, + void* packv_var_func, + bszid_t bmid, + pack_t pack_schema, + cntl_t* sub_node + ); + diff --git a/frame/1/packv/bli_packv_init.c b/frame/1/other/packv/bli_packv_init.c similarity index 65% rename from frame/1/packv/bli_packv_init.c rename to frame/1/other/packv/bli_packv_init.c index 5d8a10b98..01b8f3cdd 100644 --- a/frame/1/packv/bli_packv_init.c +++
b/frame/1/other/packv/bli_packv_init.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -43,6 +44,7 @@ void bli_packv_init ) { // The purpose of packm_init() is to initialize an object P so that + // a source object A can be packed into P via one of the packv // implementations. This initialization includes acquiring a suitable // block of memory from the memory allocator, if such a block of memory @@ -50,7 +52,6 @@ void bli_packv_init pack_t pack_schema; bszid_t bmult_id; - obj_t c; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -82,26 +83,6 @@ void bli_packv_init // left is whether we are to typecast vector a before packing. if ( bli_obj_datatype( *a ) != bli_obj_target_datatype( *a ) ) bli_abort(); -/* - { - // Initialize an object c for the intermediate typecast vector. - bli_packv_init_cast( a, - p, - &c ); - - // Copy/typecast vector a to vector c. - bli_copyv( a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // vector serves as a minor optimization. This causes the packv - // implementation to pack directly from vector a. - bli_obj_alias_to( *a, c ); - } - // Extract various fields from the control tree and pass them in // explicitly into _init_pack(). 
This allows external code generators @@ -114,7 +95,7 @@ void bli_packv_init ( pack_schema, bmult_id, - &c, + a, p, cntx ); @@ -123,42 +104,42 @@ void bli_packv_init } -void bli_packv_init_pack +siz_t bli_packv_init_pack ( - pack_t pack_schema, + pack_t schema, bszid_t bmult_id, - obj_t* c, + obj_t* a, obj_t* p, cntx_t* cntx ) { - num_t dt = bli_obj_datatype( *c ); - dim_t dim_c = bli_obj_vector_dim( *c ); - dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); + num_t dt = bli_obj_datatype( *a ); + dim_t dim_a = bli_obj_vector_dim( *a ); + dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); - mem_t* mem_p; - dim_t m_p_pad; - siz_t size_p; - inc_t rs_p, cs_p; - void* buf; + membrk_t* membrk = bli_cntx_membrk( cntx ); + +#if 0 + mem_t* mem_p; +#endif + dim_t m_p_pad; + siz_t size_p; + inc_t rs_p, cs_p; + void* buf; // We begin by copying the basic fields of c. - bli_obj_alias_to( *c, *p ); + bli_obj_alias_to( *a, *p ); // Update the dimensions. - bli_obj_set_dims( dim_c, 1, *p ); + bli_obj_set_dims( dim_a, 1, *p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, *p ); // Set the pack schema in the p object to the value in the control tree // node. - bli_obj_set_pack_schema( pack_schema, *p ); - - // Extract the address of the mem_t object within p that will track - // properties of the packed buffer. - mem_p = bli_obj_pack_mem( *p ); + bli_obj_set_pack_schema( schema, *p ); // Compute the dimensions padded by the dimension multiples. m_p_pad = bli_align_dim_to_mult( bli_obj_vector_dim( *p ), bmult ); @@ -166,12 +147,18 @@ void bli_packv_init_pack // Compute the size of the packed buffer. size_p = m_p_pad * 1 * bli_obj_elem_size( *p ); +#if 0 + // Extract the address of the mem_t object within p that will track + // properties of the packed buffer. + mem_p = bli_obj_pack_mem( *p ); + if ( bli_mem_is_unalloc( mem_p ) ) { // If the mem_t object of p has not yet been allocated, then acquire // a memory block suitable for a vector.
- bli_mem_acquire_v( size_p, - mem_p ); + bli_membrk_acquire_v( membrk, + size_p, + mem_p ); } else { @@ -179,26 +166,27 @@ void bli_packv_init_pack // re-acquire the memory so there is sufficient space. if ( bli_mem_size( mem_p ) < size_p ) { - bli_mem_release( mem_p ); + bli_membrk_release( mem_p ); - bli_mem_acquire_v( size_p, - mem_p ); + bli_membrk_acquire_v( membrk, + size_p, + mem_p ); } } - // Save the padded (packed) dimensions into the packed object. - bli_obj_set_padded_dims( m_p_pad, 1, *p ); - // Grab the buffer address from the mem_t object and copy it to the // main object buffer field. (Sometimes this buffer address will be // copied when the value is already up-to-date, because it persists // in the main object buffer field across loop iterations.) buf = bli_mem_buffer( mem_p ); bli_obj_set_buffer( buf, *p ); +#endif + // Save the padded (packed) dimensions into the packed object. + bli_obj_set_padded_dims( m_p_pad, 1, *p ); // Set the row and column strides of p based on the pack schema. - if ( pack_schema == BLIS_PACKED_VECTOR ) + if ( schema == BLIS_PACKED_VECTOR ) { // Set the strides to reflect a column-stored vector. 
Note that the // column stride may never be used, and is only useful to determine @@ -209,8 +197,11 @@ void bli_packv_init_pack bli_obj_set_strides( rs_p, cs_p, *p ); } + + return size_p; } +#if 0 void bli_packv_release ( obj_t* p, @@ -218,54 +209,6 @@ void bli_packv_release ) { if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); + bli_obj_release_pack( p ); } - - -/* -void bli_packv_init_cast( obj_t* a, - obj_t* p, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) object c is marked as being stored in a standard, contiguous - // format (ie: a column vector), - // (3) the view offset of c is reset to (0,0), and - // (4) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available. (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t dim_a = bli_obj_vector_dim( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Update the dimensions. - bli_obj_set_dims( dim_a, 1, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. (Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. 
- bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect a column storage. - // Note that the column stride should never be used. - bli_obj_set_strides( 1, dim_a, *c ); -} -*/ - +#endif diff --git a/frame/1/packv/bli_packv_init.h b/frame/1/other/packv/bli_packv_init.h similarity index 88% rename from frame/1/packv/bli_packv_init.h rename to frame/1/other/packv/bli_packv_init.h index 03d12903c..6104bbdc7 100644 --- a/frame/1/packv/bli_packv_init.h +++ b/frame/1/other/packv/bli_packv_init.h @@ -40,23 +40,12 @@ void bli_packv_init packv_t* cntl ); -void bli_packv_init_pack +siz_t bli_packv_init_pack ( pack_t pack_schema, bszid_t bmult_id, - obj_t* c, + obj_t* a, obj_t* p, cntx_t* cntx ); -void bli_packv_release - ( - obj_t* p, - packv_t* cntl - ); - -/* -void bli_packv_init_cast( obj_t* a, - obj_t* p, - obj_t* c ); -*/ diff --git a/frame/1/packv/bli_packv_int.c b/frame/1/other/packv/bli_packv_int.c similarity index 85% rename from frame/1/packv/bli_packv_int.c rename to frame/1/other/packv/bli_packv_int.c index d22f0113e..75cbd193c 100644 --- a/frame/1/packv/bli_packv_int.c +++ b/frame/1/other/packv/bli_packv_int.c @@ -47,27 +47,23 @@ static FUNCPTR_T vars[1][3] = { bli_packv_unb_var1, NULL, NULL } }; -void bli_packv_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packv_t* cntl ) +void bli_packv_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ) { - // The packv operation consists of an optional typecasting pre-process. - // Here are the following possible ways packv can execute: - // 1. cast and pack: When typecasting and packing are both - // precribed, typecast a to temporary vector c and then pack - // c to p. - // 2. pack only: Typecasting is skipped when it is not needed; - // simply pack a directly to p. - // 3. cast only: Not yet supported / not used. - // 4. no-op: The control tree sometimes directs us to skip the - // pack operation entirely. Alias p to a and return. 
- - //obj_t c; - +#if 0 varnum_t n; impl_t i; - FUNCPTR_T f; +#endif + packv_voft f; + +// !!! +// DEFINE packv_voft type. +// !!! // Check parameters. if ( bli_error_checking_is_enabled() ) diff --git a/frame/1/packv/bli_packv_int.h b/frame/1/other/packv/bli_packv_int.h similarity index 100% rename from frame/1/packv/bli_packv_int.h rename to frame/1/other/packv/bli_packv_int.h diff --git a/frame/1/packv/bli_packv_unb_var1.c b/frame/1/other/packv/bli_packv_unb_var1.c similarity index 100% rename from frame/1/packv/bli_packv_unb_var1.c rename to frame/1/other/packv/bli_packv_unb_var1.c diff --git a/frame/1/packv/bli_packv_unb_var1.h b/frame/1/other/packv/bli_packv_unb_var1.h similarity index 100% rename from frame/1/packv/bli_packv_unb_var1.h rename to frame/1/other/packv/bli_packv_unb_var1.h diff --git a/frame/1/scalv/bli_scalv_cntl.c b/frame/1/other/scalv/bli_scalv_cntl.c similarity index 100% rename from frame/1/scalv/bli_scalv_cntl.c rename to frame/1/other/scalv/bli_scalv_cntl.c diff --git a/frame/1/scalv/bli_scalv_cntl.h b/frame/1/other/scalv/bli_scalv_cntl.h similarity index 100% rename from frame/1/scalv/bli_scalv_cntl.h rename to frame/1/other/scalv/bli_scalv_cntl.h diff --git a/frame/1/scalv/bli_scalv_int.c b/frame/1/other/scalv/bli_scalv_int.c similarity index 100% rename from frame/1/scalv/bli_scalv_int.c rename to frame/1/other/scalv/bli_scalv_int.c diff --git a/frame/1/scalv/bli_scalv_int.h b/frame/1/other/scalv/bli_scalv_int.h similarity index 100% rename from frame/1/scalv/bli_scalv_int.h rename to frame/1/other/scalv/bli_scalv_int.h diff --git a/frame/1/unpackv/bli_unpackv.c b/frame/1/other/unpackv/bli_unpackv.c similarity index 100% rename from frame/1/unpackv/bli_unpackv.c rename to frame/1/other/unpackv/bli_unpackv.c diff --git a/frame/1/unpackv/bli_unpackv.h b/frame/1/other/unpackv/bli_unpackv.h similarity index 100% rename from frame/1/unpackv/bli_unpackv.h rename to frame/1/other/unpackv/bli_unpackv.h diff --git 
a/frame/1/unpackv/bli_unpackv_check.c b/frame/1/other/unpackv/bli_unpackv_check.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_check.c rename to frame/1/other/unpackv/bli_unpackv_check.c diff --git a/frame/1/unpackv/bli_unpackv_check.h b/frame/1/other/unpackv/bli_unpackv_check.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_check.h rename to frame/1/other/unpackv/bli_unpackv_check.h diff --git a/frame/1/unpackv/bli_unpackv_cntl.c b/frame/1/other/unpackv/bli_unpackv_cntl.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_cntl.c rename to frame/1/other/unpackv/bli_unpackv_cntl.c diff --git a/frame/1/unpackv/bli_unpackv_cntl.h b/frame/1/other/unpackv/bli_unpackv_cntl.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_cntl.h rename to frame/1/other/unpackv/bli_unpackv_cntl.h diff --git a/frame/1/unpackv/bli_unpackv_int.c b/frame/1/other/unpackv/bli_unpackv_int.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_int.c rename to frame/1/other/unpackv/bli_unpackv_int.c diff --git a/frame/1/unpackv/bli_unpackv_int.h b/frame/1/other/unpackv/bli_unpackv_int.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_int.h rename to frame/1/other/unpackv/bli_unpackv_int.h diff --git a/frame/1/unpackv/bli_unpackv_unb_var1.c b/frame/1/other/unpackv/bli_unpackv_unb_var1.c similarity index 100% rename from frame/1/unpackv/bli_unpackv_unb_var1.c rename to frame/1/other/unpackv/bli_unpackv_unb_var1.c diff --git a/frame/1/unpackv/bli_unpackv_unb_var1.h b/frame/1/other/unpackv/bli_unpackv_unb_var1.h similarity index 100% rename from frame/1/unpackv/bli_unpackv_unb_var1.h rename to frame/1/other/unpackv/bli_unpackv_unb_var1.h diff --git a/frame/1m/bli_l1m.h b/frame/1m/bli_l1m.h index ff9c98459..5c55b97d3 100644 --- a/frame/1m/bli_l1m.h +++ b/frame/1m/bli_l1m.h @@ -36,6 +36,7 @@ #include "bli_l1m_check.h" #include "bli_l1m_ft.h" +#include "bli_l1m_voft.h" // Prototype object APIs with and without contexts. 
#include "bli_oapi_w_cntx.h" @@ -51,6 +52,5 @@ #include "bli_unpackm.h" // Other -#include "bli_scalm_cntl.h" -#include "bli_scalm_int.h" +#include "bli_scalm.h" diff --git a/frame/1m/bli_l1m_voft.h b/frame/1m/bli_l1m_voft.h new file mode 100644 index 000000000..f5fdf5b65 --- /dev/null +++ b/frame/1m/bli_l1m_voft.h @@ -0,0 +1,75 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_L1M_VAR_OFT_H +#define BLIS_L1M_VAR_OFT_H + + +// +// -- Level-1m variant function types ------------------------------------------ +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* p, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( packm ) + + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* p, \ + obj_t* a, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( unpackm ) + + + +#endif + diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index cc8e84b2d..4ce7b1504 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -93,10 +93,14 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = }; -void bli_packm_blk_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ) +void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -140,7 +144,7 @@ void bli_packm_blk_var1( obj_t* c, // whether we are executing an induced method. if ( bli_is_nat_packed( schema ) ) { - // This branch if for native execution, where we assume that + // This branch is for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the // higher-level operation. Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform @@ -156,28 +160,25 @@ void bli_packm_blk_var1( obj_t* c, // real domain micro-kernels. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.)
- if( bli_thread_am_ochief( t ) ) + if ( bli_obj_scalar_has_nonzero_imag( p ) ) { - if ( bli_obj_scalar_has_nonzero_imag( p ) ) - { -//printf( "applying non-zero imag kappa\n" ); - // Detach the scalar. - bli_obj_scalar_detach( p, &kappa ); - - // Reset the attached scalar (to 1.0). - bli_obj_scalar_reset( p ); - - kappa_p = &kappa; - } - else - { - // If the internal scalar of A has only a real component, then - // we will apply it later (in the micro-kernel), and so we will - // use BLIS_ONE to indicate no scaling during packing. - kappa_p = &BLIS_ONE; - } + //printf( "applying non-zero imag kappa\n" ); + + // Detach the scalar. + bli_obj_scalar_detach( p, &kappa ); + + // Reset the attached scalar (to 1.0). + bli_obj_scalar_reset( p ); + + kappa_p = &kappa; + } + else + { + // If the internal scalar of A has only a real component, then + // we will apply it later (in the micro-kernel), and so we will + // use BLIS_ONE to indicate no scaling during packing. + kappa_p = &BLIS_ONE; } - kappa_p = bli_thread_obroadcast( t, kappa_p ); // Acquire the buffer to the kappa chosen above. buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p ); @@ -194,7 +195,12 @@ void bli_packm_blk_var1( obj_t* c, bli_is_rpi_packed( schema ) ) packm_kers = packm_struc_cxk_rih_kers; else packm_kers = packm_struc_cxk_kers; #else - func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); + // The original idea here was to read the packm_ukr from the context + // if it is non-NULL. The problem is, it requires that we be able to + // assume that the packm_ukr field is initialized to NULL, which it + // currently is not. + + //func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx ); //if ( bli_func_is_null_dt( dt_cp, cntx_packm_kers ) ) { @@ -203,7 +209,6 @@ void bli_packm_blk_var1( obj_t* c, // we use the default lookup table to determine the right func_t // for the current schema.
const dim_t i = bli_pack_schema_index( schema ); -//printf( "bli_packm_blk_var1: pack schema index = %lu (schema = %x)\n", i, schema ); packm_kers = &packm_struc_cxk_kers[ i ]; } @@ -221,11 +226,6 @@ void bli_packm_blk_var1( obj_t* c, // Query the datatype-specific function pointer from the func_t object. packm_ker = bli_func_get_dt( dt_cp, packm_kers ); - -//bli_cntx_print( cntx ); -//printf( "bli_packm_blk_var1: packm_ker = %p\n", packm_ker ); -//printf( "bli_packm_blk_var1: cntx_packm_ker = %p\n", cntx_packm_kers ); -//printf( "bli_packm_blk_var1: local_table_entry = %p\n", &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ] ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; @@ -598,6 +598,57 @@ PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \ p_inc = ps_p; \ } \ \ +/* +if ( col_stored ) { \ + if ( bli_thread_work_id( thread ) == 0 ) \ + { \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ + if ( bli_thread_work_id( thread ) == 1 ) \ + { \ + printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ +} \ +else { \ + if ( bli_thread_work_id( thread ) == 0 ) \ + { \ + printf( "packm_blk_var1: thread %lu (b = %p, 
bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ + if ( bli_thread_work_id( thread ) == 1 ) \ + { \ + printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ + fflush( stdout ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ + ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ + PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ + ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ + fflush( stdout ); \ + } \ +bli_thread_obarrier( thread ); \ +} \ +*/ \ +\ /* if ( bli_is_4mi_packed( schema ) ) { \ printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \ diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 8971da5c0..4e04f86f9 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -32,10 +32,14 @@ */ -void bli_packm_blk_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ); +void bli_packm_blk_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* t + ); #undef GENTPROT diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c index 6a56b8676..f8c66eee5 100644 --- a/frame/1m/packm/bli_packm_check.c +++ b/frame/1m/packm/bli_packm_check.c @@ -35,9 +35,12 @@ #include "blis.h" -void bli_packm_init_check( obj_t* a, - obj_t* p, - cntx_t* cntx ) +void bli_packm_init_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { err_t e_val; @@ -54,9 +57,12 @@ void bli_packm_init_check( obj_t* a, //bli_check_error_code( e_val ); } -void bli_packm_int_check( obj_t* a, - obj_t* p, - cntx_t* cntx ) 
+void bli_packm_int_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { err_t e_val; diff --git a/frame/1m/packm/bli_packm_check.h b/frame/1m/packm/bli_packm_check.h index 9974ced6b..9b2e8a66e 100644 --- a/frame/1m/packm/bli_packm_check.h +++ b/frame/1m/packm/bli_packm_check.h @@ -32,10 +32,17 @@ */ -void bli_packm_init_check( obj_t* a, - obj_t* p, - cntx_t* cntx ); +void bli_packm_init_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ); + +void bli_packm_int_check + ( + obj_t* a, + obj_t* p, + cntx_t* cntx + ); -void bli_packm_int_check( obj_t* a, - obj_t* p, - cntx_t* cntx ); diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index f0f674615..67b01fffb 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -34,109 +34,49 @@ #include "blis.h" -packm_t* packm_cntl_row = NULL; -packm_t* packm_cntl_col = NULL; - -packm_t* packm_cntl = NULL; - -void bli_packm_cntl_init() +cntl_t* bli_packm_cntl_obj_create + ( + void* var_func, + void* packm_var_func, + bszid_t bmid_m, + bszid_t bmid_n, + bool_t does_invert_diag, + bool_t rev_iter_if_upper, + bool_t rev_iter_if_lower, + pack_t pack_schema, + packbuf_t pack_buf_type, + cntl_t* sub_node + ) { - // Generally speaking, the BLIS_PACKED_ROWS and BLIS_PACKED_COLUMNS - // are used by the level-2 operations. These schemas amount to simple - // copies to row or column storage. These simple schemas may be used - // by level-3 operations, but they should never be used for matrices - // with structure (since they do not densify). - // The BLIS_PACKED_ROW_PANELS and BLIS_PACKED_COL_PANELS schemas are - // used only in level-3 operations. They pack to (typically) skinny - // row and column panels, where the width of the panel is determined - // by register blocksizes. It is assumed that matrices with structure - // will be densified. + cntl_t* cntl; + packm_params_t* params; - // Create control trees to pack by rows. 
- packm_cntl_row - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to rows: - BLIS_VF, // used for m dimension - BLIS_VF, // used for n dimension - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_ROWS, - BLIS_BUFFER_FOR_GEN_USE ); + // Allocate a packm_params_t struct. + params = bli_malloc_intl( sizeof( packm_params_t ) ); + // Initialize the packm_params_t struct. + params->size = sizeof( packm_params_t ); + params->var_func = packm_var_func; + params->bmid_m = bmid_m; + params->bmid_n = bmid_n; + params->does_invert_diag = does_invert_diag; + params->rev_iter_if_upper = rev_iter_if_upper; + params->rev_iter_if_lower = rev_iter_if_lower; + params->pack_schema = pack_schema; + params->pack_buf_type = pack_buf_type; - // Create control trees to pack by columns. - packm_cntl_col - = - bli_packm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, // When packing to columns: - BLIS_VF, // used for m dimension - BLIS_VF, // used for n dimension - FALSE, // do NOT invert diagonal - FALSE, // do NOT iterate backwards if upper - FALSE, // do NOT iterate backwards if lower - BLIS_PACKED_COLUMNS, - BLIS_BUFFER_FOR_GEN_USE ); - - - // Set defaults when we don't care whether the packing is by rows or - // by columns. 
- packm_cntl = packm_cntl_col; -} - -void bli_packm_cntl_finalize() -{ - bli_cntl_obj_free( packm_cntl_row ); - bli_cntl_obj_free( packm_cntl_col ); -} - -packm_t* bli_packm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ) -{ - packm_t* cntl; - - cntl = ( packm_t* ) bli_malloc_intl( sizeof(packm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bmid_m = bmid_m; - cntl->bmid_n = bmid_n; - cntl->does_invert_diag = does_invert_diag; - cntl->rev_iter_if_upper = rev_iter_if_upper; - cntl->rev_iter_if_lower = rev_iter_if_lower; - cntl->pack_schema = pack_schema; - cntl->pack_buf_type = pack_buf_type; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. 
+ cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); return cntl; } -void bli_packm_cntl_obj_init( packm_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bmid_m = bmid_m; - cntl->bmid_n = bmid_n; - cntl->does_invert_diag = does_invert_diag; - cntl->rev_iter_if_upper = rev_iter_if_upper; - cntl->rev_iter_if_lower = rev_iter_if_lower; - cntl->pack_schema = pack_schema; - cntl->pack_buf_type = pack_buf_type; -} - diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 1dc31c543..057a512ed 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -32,56 +32,65 @@ */ -struct packm_s +struct packm_params_s { - impl_t impl_type; - varnum_t var_num; - bszid_t bmid_m; - bszid_t bmid_n; - bool_t does_invert_diag; - bool_t rev_iter_if_upper; - bool_t rev_iter_if_lower; - pack_t pack_schema; - packbuf_t pack_buf_type; + uint64_t size; // size field must be present and come first. 
+ packm_voft var_func; + bszid_t bmid_m; + bszid_t bmid_n; + bool_t does_invert_diag; + bool_t rev_iter_if_upper; + bool_t rev_iter_if_lower; + pack_t pack_schema; + packbuf_t pack_buf_type; }; -typedef struct packm_s packm_t; +typedef struct packm_params_s packm_params_t; -#define cntl_bmid_m( cntl ) cntl->bmid_m -#define cntl_bmid_n( cntl ) cntl->bmid_n +#define bli_cntl_packm_params_var_func( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->var_func ) -#define cntl_does_invert_diag( cntl ) cntl->does_invert_diag -#define cntl_rev_iter_if_upper( cntl ) cntl->rev_iter_if_upper -#define cntl_rev_iter_if_lower( cntl ) cntl->rev_iter_if_lower -#define cntl_pack_schema( cntl ) cntl->pack_schema -#define cntl_pack_buf_type( cntl ) cntl->pack_buf_type +#define bli_cntl_packm_params_bmid_m( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->bmid_m ) -#define bli_cntl_sub_packm( cntl ) cntl->sub_packm -#define bli_cntl_sub_packm_a( cntl ) cntl->sub_packm_a -#define bli_cntl_sub_packm_a11( cntl ) cntl->sub_packm_a11 -#define bli_cntl_sub_packm_b( cntl ) cntl->sub_packm_b -#define bli_cntl_sub_packm_b11( cntl ) cntl->sub_packm_b11 -#define bli_cntl_sub_packm_c( cntl ) cntl->sub_packm_c -#define bli_cntl_sub_packm_c11( cntl ) cntl->sub_packm_c11 +#define bli_cntl_packm_params_bmid_n( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->bmid_n ) -void bli_packm_cntl_init( void ); -void bli_packm_cntl_finalize( void ); -packm_t* bli_packm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ); -void bli_packm_cntl_obj_init( packm_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid_m, - bszid_t bmid_n, - bool_t does_invert_diag, - bool_t rev_iter_if_upper, - bool_t rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type ); +#define bli_cntl_packm_params_does_invert_diag( cntl ) \ +\ 
+ ( ( (packm_params_t*)(cntl)->params )->does_invert_diag ) + +#define bli_cntl_packm_params_rev_iter_if_upper( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_upper ) + +#define bli_cntl_packm_params_rev_iter_if_lower( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->rev_iter_if_lower ) + +#define bli_cntl_packm_params_pack_schema( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->pack_schema ) + +#define bli_cntl_packm_params_pack_buf_type( cntl ) \ +\ + ( ( (packm_params_t*)(cntl)->params )->pack_buf_type ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_packm_cntl_obj_create + ( + void* var_func, + void* packm_var_func, + bszid_t bmid_m, + bszid_t bmid_n, + bool_t does_invert_diag, + bool_t rev_iter_if_upper, + bool_t rev_iter_if_lower, + pack_t pack_schema, + packbuf_t pack_buf_type, + cntl_t* sub_node + ); diff --git a/frame/1m/packm/bli_packm_cntx.c b/frame/1m/packm/bli_packm_cntx.c index 787531f41..4f570400a 100644 --- a/frame/1m/packm/bli_packm_cntx.c +++ b/frame/1m/packm/bli_packm_cntx.c @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -49,6 +50,9 @@ void bli_packm_cntx_init( cntx_t* cntx ) bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx ); bli_gks_cntx_set_l1v_ker( BLIS_SCAL2V_KER, cntx ); bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx ); + + // Initialize the context with the global membrk object. + bli_cntx_set_membrk( bli_memsys_global_membrk(), cntx ); } void bli_packm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index cb6f28fe2..ccf88f3cb 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,38 +35,43 @@ #include "blis.h" -void bli_packm_init( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl ) +siz_t bli_packm_init + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ) { // The purpose of packm_init() is to initialize an object P so that // a source object A can be packed into P via one of the packm - // implementations. This initialization includes acquiring a suitable - // block of memory from the memory allocator, if such a block of memory - // has not already been allocated previously. + // implementations. This initialization precedes the acquisition of a + // suitable block of memory from the memory allocator (if such a block + // of memory has not already been allocated previously). - invdiag_t invert_diag; - pack_t schema; - packord_t pack_ord_if_up; - packord_t pack_ord_if_lo; - packbuf_t pack_buf_type; bszid_t bmult_id_m; bszid_t bmult_id_n; - obj_t c; + bool_t does_invert_diag; + bool_t rev_iter_if_upper; + bool_t rev_iter_if_lower; + //pack_t pack_schema; + packbuf_t pack_buf_type; + siz_t size_needed; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_packm_init_check( a, p, cntx ); - // First check if we are to skip this operation because the control tree - // is NULL, and if so, simply alias the object to its packed counterpart. - if ( bli_cntl_is_noop( cntl ) ) - { - bli_obj_alias_to( *a, *p ); - return; - } + // Extract various fields from the control tree. 
+ bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); + bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); + does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); + rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); + rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); + //pack_schema = bli_cntl_packm_params_pack_schema( cntl ); + pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); +#if 0 // Let us now check to see if the object has already been packed. First // we check if it has been packed to an unspecified (row or column) // format, in which case we can alias the object and return. @@ -78,177 +84,150 @@ void bli_packm_init( obj_t* a, if ( bli_obj_pack_schema( *a ) == BLIS_PACKED_UNSPEC ) { bli_obj_alias_to( *a, *p ); - return; + return 0; } - // At this point, we can be assured that cntl is not NULL. Now we check - // if the object has already been packed to the desired schema (as en- - // coded in the control tree). If so, we can alias and return, as above. + // Now we check if the object has already been packed to the desired + // schema (as encoded in the control tree). If so, we can alias and + // return 0. // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED // and thus packing will be called for (but in some cases packing has // already taken place, or does not need to take place, and so that will // be indicated by the pack status). Also, not all combinations of // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ) + if ( bli_obj_pack_schema( *a ) == pack_schema ) { bli_obj_alias_to( *a, *p ); - return; + return 0; } +#endif // If the object is marked as being filled with zeros, then we can skip - // the packm operation entirely and alias. Notice that we use pack-aware - // aliasing. 
This is needed because the object may have been packed in - // a previous iteration, which means the object currently contains the - // mem_t entry of an already-allocated block. bli_obj_alias_for_packing() - // will avoid overwriting that mem_t entry, which means it can be - // properly released later on. + // the packm operation entirely and alias. if ( bli_obj_is_zeros( *a ) ) { - bli_obj_alias_for_packing( *a, *p ); - return; + bli_obj_alias_to( *a, *p ); + return 0; } - // Now, if we are not skipping the pack operation, then the only question - // left is whether we are to typecast matrix a before packing. - if ( bli_obj_datatype( *a ) != bli_obj_target_datatype( *a ) ) - bli_abort(); -/* - { - // Initialize an object c for the intermediate typecast matrix. - bli_packm_init_cast( a, - p, - &c ); - - // Copy/typecast matrix a to matrix c. - bli_copym( a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // matrix serves as a minor optimization. This causes the packm - // implementation to pack directly from matrix a. - bli_obj_alias_to( *a, c ); - } - - - // Extract various fields from the control tree. - pack_buf_type = cntl_pack_buf_type( cntl ); - bmult_id_m = cntl_bmid_m( cntl ); - bmult_id_n = cntl_bmid_n( cntl ); - - // Extract the schema from the context, depending on whether we are + // We now ignore the pack_schema field in the control tree and + // extract the schema from the context, depending on whether we are // preparing to pack a block of A or panel of B. For A and B, we must // obtain the schema from the context since the induced methods reuse // the same control trees used by native execution, and those induced // methods specify the schema used by the current execution phase // within the context (whereas the control tree does not change). 
+ pack_t schema; + if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) { schema = bli_cntx_get_pack_schema_a( cntx ); -//printf( "bli_packm_init: pack schema a = %x\n", schema ); } else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) { schema = bli_cntx_get_pack_schema_b( cntx ); -//printf( "bli_packm_init: pack schema b = %x\n", schema ); } else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { // If we get a request to pack C for some reason, it is likely // not part of an induced method, and so it would be safe (and // necessary) to read the pack schema from the control tree. - schema = cntl_pack_schema( cntl ); -//printf( "bli_packm_init: pack schema c = %x\n", schema ); + schema = bli_cntl_packm_params_pack_schema( cntl ); } // Prepare a few other variables based on properties of the control // tree. - if ( cntl_does_invert_diag( cntl ) ) invert_diag = BLIS_INVERT_DIAG; - else invert_diag = BLIS_NO_INVERT_DIAG; + invdiag_t invert_diag; + packord_t pack_ord_if_up; + packord_t pack_ord_if_lo; - if ( cntl_rev_iter_if_upper( cntl ) ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; - else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; + if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG; + else invert_diag = BLIS_NO_INVERT_DIAG; - if ( cntl_rev_iter_if_lower( cntl ) ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; - else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; + if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER; + else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER; + + if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER; + else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER; // Initialize object p for the final packed matrix. - bli_packm_init_pack( invert_diag, - schema, - pack_ord_if_up, - pack_ord_if_lo, - pack_buf_type, - bmult_id_m, - bmult_id_n, - &c, - p, - cntx ); + size_needed + = + bli_packm_init_pack + ( + invert_diag, + schema, + pack_ord_if_up, + pack_ord_if_lo, + bmult_id_m, + bmult_id_n, + a, + p, + cntx + ); - // Now p is ready to be packed. 
+ // Return the size needed for memory allocation of the packed buffer. + return size_needed; } -void bli_packm_init_pack( invdiag_t invert_diag, - pack_t schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - packbuf_t pack_buf_type, - bszid_t bmult_id_m, - bszid_t bmult_id_n, - obj_t* c, - obj_t* p, - cntx_t* cntx ) +siz_t bli_packm_init_pack + ( + invdiag_t invert_diag, + pack_t schema, + packord_t pack_ord_if_up, + packord_t pack_ord_if_lo, + bszid_t bmult_id_m, + bszid_t bmult_id_n, + obj_t* a, + obj_t* p, + cntx_t* cntx + ) { - num_t dt = bli_obj_datatype( *c ); - trans_t transc = bli_obj_onlytrans_status( *c ); - dim_t m_c = bli_obj_length( *c ); - dim_t n_c = bli_obj_width( *c ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); - dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); + num_t dt = bli_obj_datatype( *a ); + trans_t transa = bli_obj_onlytrans_status( *a ); + dim_t m_a = bli_obj_length( *a ); + dim_t n_a = bli_obj_width( *a ); + dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx ); + dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx ); + dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx ); + dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx ); - mem_t* mem_p; - dim_t m_p, n_p; - dim_t m_p_pad, n_p_pad; - siz_t size_p; - siz_t elem_size_p; - inc_t rs_p, cs_p; - inc_t is_p; - void* buf; + dim_t m_p, n_p; + dim_t m_p_pad, n_p_pad; + siz_t size_p; + siz_t elem_size_p; + inc_t rs_p, cs_p; + inc_t is_p; - // We begin by copying the basic fields of c. We do NOT copy the - // pack_mem entry from c because the entry in p may be cached from - // a previous iteration, and thus we don't want to overwrite it. - bli_obj_alias_for_packing( *c, *p ); + // We begin by copying the fields of A. 
+ bli_obj_alias_to( *a, *p ); // Update the dimension fields to explicitly reflect a transposition, // if needed. // Then, clear the conjugation and transposition fields from the object // since matrix packing in BLIS is deemed to take care of all conjugation // and transposition necessary. - // Then, we adjust the properties of p when c needs a transposition. - // We negate the diagonal offset, and if c is upper- or lower-stored, - // we either toggle the uplo of p. - // Finally, if we mark p as dense since we assume that all matrices, + // Then, we adjust the properties of P when A needs a transposition. + // We negate the diagonal offset, and if A is upper- or lower-stored, + // we either toggle the uplo of P. + // Finally, if we mark P as dense since we assume that all matrices, // regardless of structure, will be densified. - bli_obj_set_dims_with_trans( transc, m_c, n_c, *p ); + bli_obj_set_dims_with_trans( transa, m_a, n_a, *p ); bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, *p ); - if ( bli_does_trans( transc ) ) + if ( bli_does_trans( transa ) ) { bli_obj_negate_diag_offset( *p ); - if ( bli_obj_is_upper_or_lower( *c ) ) + if ( bli_obj_is_upper_or_lower( *a ) ) bli_obj_toggle_uplo( *p ); } - // If we are packing micro-panels, mark p as dense. Otherwise, we are + // If we are packing micro-panels, mark P as dense. Otherwise, we are // probably being called in the context of a level-2 operation, in - // which case we do not want to overwrite the uplo field of p (inherited - // from c) with BLIS_DENSE because that information may be needed by + // which case we do not want to overwrite the uplo field of P (inherited + // from A) with BLIS_DENSE because that information may be needed by // the level-2 operation's unblocked variant to decide whether to // execute a "lower" or "upper" branch of code. if ( bli_is_panel_packed( schema ) ) @@ -262,7 +241,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, // Set the invert diagonal field. 
bli_obj_set_invert_diag( invert_diag, *p ); - // Set the pack status of p to the pack schema prescribed in the control + // Set the pack status of P to the pack schema prescribed in the control // tree node. bli_obj_set_pack_schema( schema, *p ); @@ -270,15 +249,11 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_obj_set_pack_order_if_upper( pack_ord_if_up, *p ); bli_obj_set_pack_order_if_lower( pack_ord_if_lo, *p ); - // Extract the address of the mem_t object within p that will track - // properties of the packed buffer. - mem_p = bli_obj_pack_mem( *p ); - // Compute the dimensions padded by the dimension multiples. These // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. - // We compute them by starting with the effective dimensions of c (now - // in p) and aligning them to the dimension multiples (typically equal + // We compute them by starting with the effective dimensions of A (now + // in P) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. m_p = bli_obj_length( *p ); @@ -292,9 +267,9 @@ void bli_packm_init_pack( invdiag_t invert_diag, bli_obj_set_padded_dims( m_p_pad, n_p_pad, *p ); // Now we prepare to compute strides, align them, and compute the - // total number of bytes needed for the packed buffer. After that, - // we will acquire an appropriate block of memory from the memory - // allocator. + // total number of bytes needed for the packed buffer. The caller + // will then use that value to acquire an appropriate block of memory + // from the memory allocator. // Extract the element size for the packed object. elem_size_p = bli_obj_elem_size( *p ); @@ -317,7 +292,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, rs_p = bli_align_dim_to_size( rs_p, elem_size_p, BLIS_HEAP_STRIDE_ALIGN_SIZE ); - // Store the strides in p. 
+ // Store the strides in P. bli_obj_set_strides( rs_p, cs_p, *p ); // Compute the size of the packed buffer. @@ -340,7 +315,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, cs_p = bli_align_dim_to_size( cs_p, elem_size_p, BLIS_HEAP_STRIDE_ALIGN_SIZE ); - // Store the strides in p. + // Store the strides in P. bli_obj_set_strides( rs_p, cs_p, *p ); // Compute the size of the packed buffer. @@ -428,7 +403,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel ); else is_p = 1; - // Store the strides and panel dimension in p. + // Store the strides and panel dimension in P. bli_obj_set_strides( rs_p, cs_p, *p ); bli_obj_set_imag_stride( is_p, *p ); bli_obj_set_panel_dim( m_panel, *p ); @@ -521,7 +496,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel ); else is_p = 1; - // Store the strides and panel dimension in p. + // Store the strides and panel dimension in P. bli_obj_set_strides( rs_p, cs_p, *p ); bli_obj_set_imag_stride( is_p, *p ); bli_obj_set_panel_dim( n_panel, *p ); @@ -544,97 +519,6 @@ void bli_packm_init_pack( invdiag_t invert_diag, size_p = 0; } - - if ( bli_mem_is_unalloc( mem_p ) ) - { - // If the mem_t object of p has not yet been allocated, then acquire - // a memory block of type pack_buf_type. - bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); - } - else - { - // If the mem_t object is currently allocated and smaller than is - // needed, then it must have been allocated for a different type - // of object (a different pack_buf_type value), so we must first - // release it and then re-acquire it using the new size and new - // pack_buf_type value. 
- if ( bli_mem_size( mem_p ) < size_p ) - { - bli_mem_release( mem_p ); - bli_mem_acquire_m( size_p, - pack_buf_type, - mem_p ); - } - } - - // Grab the buffer address from the mem_t object and copy it to the - // main object buffer field. (Sometimes this buffer address will be - // copied when the value is already up-to-date, because it persists - // in the main object buffer field across loop iterations.) - buf = bli_mem_buffer( mem_p ); - bli_obj_set_buffer( buf, *p ); - + return size_p; } -void bli_packm_release( obj_t* p, - packm_t* cntl ) -{ - if ( !bli_cntl_is_noop( cntl ) ) - bli_obj_release_pack( p ); -} - - -/* -void bli_packm_init_cast( obj_t* a, - obj_t* p, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) the view offset of c is reset to (0,0), - // (3) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available, (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - // (4) object c is marked as being stored in a standard, contiguous - // format (ie: a column-major order). - // Any transposition encoded within object a will not be handled here, - // but rather will be handled in the packm implementation. That way, - // the only thing castm needs to do is cast. - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t m_a = bli_obj_length( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - inc_t rs_c, cs_c; - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Reset the view offsets to (0,0). 
- bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. (Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. - bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect column-major order - // storage. We start the leading dimension out as m(a) and increment it if - // necessary so that the beginning of each column is aligned. - cs_c = bli_align_dim_to_size( m_a, elem_size_c, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - rs_c = 1; - bli_obj_set_strides( rs_c, cs_c, *c ); -} -*/ - diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index a21956ba2..fe0de52fc 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -32,28 +32,24 @@ */ -void bli_packm_init( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl ); +siz_t bli_packm_init + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl + ); -void bli_packm_init_pack( invdiag_t invert_diag, - pack_t pack_schema, - packord_t pack_ord_if_up, - packord_t pack_ord_if_lo, - packbuf_t pack_buf_type, - bszid_t mr_id, - bszid_t nr_id, - obj_t* c, - obj_t* p, - cntx_t* cntx ); - -/* -void bli_packm_init_cast( obj_t* a, - obj_t* p, - obj_t* c ); -*/ - -void bli_packm_release( obj_t* p, - packm_t* cntl ); +siz_t bli_packm_init_pack + ( + invdiag_t invert_diag, + pack_t schema, + packord_t pack_ord_if_up, + packord_t pack_ord_if_lo, + bszid_t bmult_id_m, + bszid_t bmult_id_n, + obj_t* a, + obj_t* p, + cntx_t* cntx + ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 7d55c2a64..22ce70a44 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -34,33 +34,16 @@ #include "blis.h" -#define 
FUNCPTR_T packm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* p, - cntx_t* cntx, - thrinfo_t* t ); - -static FUNCPTR_T vars[6][3] = +void bli_packm_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - // unblocked optimized unblocked blocked - { bli_packm_unb_var1, NULL, bli_packm_blk_var1 }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL, }, -}; - -void bli_packm_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl, - thrinfo_t* thread ) -{ - varnum_t n; - impl_t i; - FUNCPTR_T f; + packm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -70,14 +53,6 @@ void bli_packm_int( obj_t* a, // it, then we should fold it into the next alias-and-early-exit block. //if ( bli_obj_has_zero_dim( *a ) ) bli_abort(); - // First check if we are to skip this operation because the control tree - // is NULL. We return without taking any action because a was already - // aliased to p in packm_init(). - if ( bli_cntl_is_noop( cntl ) ) - { - return; - } - // Let us now check to see if the object has already been packed. First // we check if it has been packed to an unspecified (row or column) // format, in which case we can return, since by now aliasing has already @@ -101,7 +76,7 @@ void bli_packm_int( obj_t* a, // already taken place, or does not need to take place, and so that will // be indicated by the pack status). Also, not all combinations of // current pack status and desired pack schema are valid. - if ( bli_obj_pack_schema( *a ) == cntl_pack_schema( cntl ) ) + if ( bli_obj_pack_schema( *a ) == bli_cntl_packm_params_pack_schema( cntl ) ) { return; } @@ -113,21 +88,17 @@ void bli_packm_int( obj_t* a, return; } - - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. 
- f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_packm_params_var_func( cntl ); // Invoke the variant with kappa_use. - f( a, - p, - cntx, - thread ); - - // Barrier so that packing is done before computation - bli_thread_obarrier( thread ); + f + ( + a, + p, + cntx, + cntl, + thread + ); } diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 89bd4f0d5..14d006d28 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -32,9 +32,11 @@ */ -void bli_packm_int( obj_t* a, - obj_t* p, - cntx_t* cntx, - packm_t* cntl, - thrinfo_t* thread ); - +void bli_packm_int + ( + obj_t* a, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c index 47f0dc362..2287a7222 100644 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ b/frame/1m/packm/bli_packm_thrinfo.c @@ -34,14 +34,14 @@ #include "blis.h" +#if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ) { thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); @@ -50,37 +50,33 @@ thrinfo_t* bli_packm_thrinfo_create ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, - NULL, - NULL, - NULL + FALSE, + sub_node ); return thread; } +#endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ) { bli_thrinfo_init ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, - NULL, - NULL, - NULL + FALSE, + sub_node ); } @@ -93,12 +89,13 @@ void bli_packm_thrinfo_init_single ( thread, &BLIS_SINGLE_COMM, 0, - &BLIS_SINGLE_COMM, 0, 1, - 0 + 0, + NULL ); } +#if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread @@ -108,4 +105,4 @@ void bli_packm_thrinfo_free 
thread != &BLIS_PACKM_SINGLE_THREADED ) bli_free_intl( thread ); } - +#endif diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/1m/packm/bli_packm_thrinfo.h index 45ab46c3c..5da496f96 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/1m/packm/bli_packm_thrinfo.h @@ -42,25 +42,25 @@ // thrinfo_t APIs specific to packm. // +#if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ); +#endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, - dim_t work_id + dim_t work_id, + thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single @@ -68,8 +68,10 @@ void bli_packm_thrinfo_init_single thrinfo_t* thread ); +#if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); +#endif diff --git a/frame/1m/packm/bli_packm_unb_var1.c b/frame/1m/packm/bli_packm_unb_var1.c index 75e999320..49b3a918a 100644 --- a/frame/1m/packm/bli_packm_unb_var1.c +++ b/frame/1m/packm/bli_packm_unb_var1.c @@ -55,10 +55,14 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1); -void bli_packm_unb_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* thread ) +void bli_packm_unb_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_cp = bli_obj_datatype( *c ); diff --git a/frame/1m/packm/bli_packm_unb_var1.h b/frame/1m/packm/bli_packm_unb_var1.h index 3d737d483..cefd4de94 100644 --- a/frame/1m/packm/bli_packm_unb_var1.h +++ b/frame/1m/packm/bli_packm_unb_var1.h @@ -32,10 +32,14 @@ */ -void bli_packm_unb_var1( obj_t* c, - obj_t* p, - cntx_t* cntx, - thrinfo_t* thread ); +void bli_packm_unb_var1 + ( + obj_t* c, + obj_t* p, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); #undef GENTPROT diff --git a/frame/3/trsm/old/bli_trsm_cntx.h b/frame/1m/scalm/bli_scalm.h similarity index 96% rename from 
frame/3/trsm/old/bli_trsm_cntx.h rename to frame/1m/scalm/bli_scalm.h index 0bdc9e7a8..303ec3860 100644 --- a/frame/3/trsm/old/bli_trsm_cntx.h +++ b/frame/1m/scalm/bli_scalm.h @@ -32,6 +32,5 @@ */ -void bli_trsm_cntx_init( void ); -void bli_trsm_cntx_finalize( void ); +#include "bli_scalm_cntl.h" diff --git a/frame/1m/scalm/bli_scalm_cntl.c b/frame/1m/scalm/bli_scalm_cntl.c index 4a965b3fa..f6008a9a3 100644 --- a/frame/1m/scalm/bli_scalm_cntl.c +++ b/frame/1m/scalm/bli_scalm_cntl.c @@ -34,38 +34,25 @@ #include "blis.h" -scalm_t* scalm_cntl = NULL; - -void bli_scalm_cntl_init() +cntl_t* bli_scalm_cntl_obj_create + ( + void* var_func, + cntl_t* sub_node + ) { - scalm_cntl = bli_scalm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1 ); -} + cntl_t* cntl; -void bli_scalm_cntl_finalize() -{ - bli_cntl_obj_free( scalm_cntl ); -} - - -scalm_t* bli_scalm_cntl_obj_create( impl_t impl_type, - varnum_t var_num ) -{ - scalm_t* cntl; - - cntl = ( scalm_t* ) bli_malloc_intl( sizeof(scalm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. 
+ cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + NULL, + sub_node + ); return cntl; } - -void bli_scalm_cntl_obj_init( scalm_t* cntl, - impl_t impl_type, - varnum_t var_num ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; -} - diff --git a/frame/1m/scalm/bli_scalm_cntl.h b/frame/1m/scalm/bli_scalm_cntl.h index ccda9217e..4029a4f10 100644 --- a/frame/1m/scalm/bli_scalm_cntl.h +++ b/frame/1m/scalm/bli_scalm_cntl.h @@ -32,20 +32,9 @@ */ -struct scalm_s -{ - impl_t impl_type; - varnum_t var_num; -}; -typedef struct scalm_s scalm_t; - -#define bli_cntl_sub_scalm( cntl ) cntl->sub_scalm - -void bli_scalm_cntl_init( void ); -void bli_scalm_cntl_finalize( void ); -scalm_t* bli_scalm_cntl_obj_create( impl_t impl_type, - varnum_t var_num ); -void bli_scalm_cntl_obj_init( scalm_t* cntl, - impl_t impl_type, - varnum_t var_num ); +cntl_t* bli_scalm_cntl_obj_create + ( + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/1m/scalm/bli_scalm_int.c b/frame/1m/scalm/other/bli_scalm_int.c similarity index 100% rename from frame/1m/scalm/bli_scalm_int.c rename to frame/1m/scalm/other/bli_scalm_int.c diff --git a/frame/1m/scalm/bli_scalm_int.h b/frame/1m/scalm/other/bli_scalm_int.h similarity index 100% rename from frame/1m/scalm/bli_scalm_int.h rename to frame/1m/scalm/other/bli_scalm_int.h diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index 8254f5043..e300cb66f 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -37,8 +37,7 @@ #include "bli_unpackm_int.h" #include "bli_unpackm_unb_var1.h" -//#include "bli_unpackm_blk_var1.h" -#include "bli_unpackm_blk_var2.h" +#include "bli_unpackm_blk_var1.h" #include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c similarity index 96% rename from frame/1m/unpackm/bli_unpackm_blk_var2.c rename to frame/1m/unpackm/bli_unpackm_blk_var1.c index ab2c2cf1c..bb9f0ee22 100644 --- 
a/frame/1m/unpackm/bli_unpackm_blk_var2.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -52,13 +52,17 @@ typedef void (*FUNCPTR_T)( cntx_t* cntx ); -static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var2); +static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); -void bli_unpackm_blk_var2( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_blk_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_cp = bli_obj_datatype( *c ); @@ -266,5 +270,5 @@ void PASTEMAC(ch,varname) \ \ } -INSERT_GENTFUNC_BASIC0( unpackm_blk_var2 ) +INSERT_GENTFUNC_BASIC0( unpackm_blk_var1 ) diff --git a/frame/3/trmm/old/bli_trmm_thread.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h similarity index 71% rename from frame/3/trmm/old/bli_trmm_thread.h rename to frame/1m/unpackm/bli_unpackm_blk_var1.h index bedc7781f..330e9b089 100644 --- a/frame/3/trmm/old/bli_trmm_thread.h +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h @@ -32,14 +32,35 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +void bli_unpackm_blk_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); -#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) -//thrinfo_t** bli_trmm_thrinfo_create_paths( bool_t jc_dependency ); +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + struc_t strucc, \ + doff_t diagoffc, \ + diag_t diagc, \ + uplo_t uploc, \ + trans_t transc, \ + dim_t m, \ + dim_t n, \ + dim_t m_panel, \ + 
dim_t n_panel, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx \ + ); + +INSERT_GENTPROT_BASIC( unpackm_blk_var1 ) diff --git a/frame/1m/unpackm/bli_unpackm_check.c b/frame/1m/unpackm/bli_unpackm_check.c index 87af08f43..0ffa984b2 100644 --- a/frame/1m/unpackm/bli_unpackm_check.c +++ b/frame/1m/unpackm/bli_unpackm_check.c @@ -34,10 +34,12 @@ #include "blis.h" -void bli_unpackm_check( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_int_check + ( + obj_t* p, + obj_t* a, + cntx_t* cntx + ) { err_t e_val; diff --git a/frame/1m/unpackm/bli_unpackm_check.h b/frame/1m/unpackm/bli_unpackm_check.h index 217b03c4a..889dd7831 100644 --- a/frame/1m/unpackm/bli_unpackm_check.h +++ b/frame/1m/unpackm/bli_unpackm_check.h @@ -32,7 +32,10 @@ */ -void bli_unpackm_check( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ); +void bli_unpackm_int_check + ( + obj_t* p, + obj_t* a, + cntx_t* cntx + ); + diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c index 0e99bb741..2900cb3b8 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ b/frame/1m/unpackm/bli_unpackm_cntl.c @@ -34,42 +34,35 @@ #include "blis.h" -unpackm_t* unpackm_cntl = NULL; - -void bli_unpackm_cntl_init() +cntl_t* bli_unpackm_cntl_obj_create + ( + void* var_func, + void* unpackm_var_func, + cntl_t* sub_node + ) { - unpackm_cntl = bli_unpackm_cntl_obj_create( BLIS_UNBLOCKED, - BLIS_VARIANT1, - NULL ); // no blocksize needed -} + cntl_t* cntl; + unpackm_params_t* params; -void bli_unpackm_cntl_finalize() -{ - bli_cntl_obj_free( unpackm_cntl ); -} + // Allocate an unpackm_params_t struct. + params = bli_malloc_intl( sizeof( unpackm_params_t ) ); -unpackm_t* bli_unpackm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - blksz_t* b ) -{ - unpackm_t* cntl; + // Initialize the unpackm_params_t struct. 
+ params->size = sizeof( unpackm_params_t ); + params->var_func = unpackm_var_func; - cntl = ( unpackm_t* ) bli_malloc_intl( sizeof(unpackm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->b = b; + // It's important that we set the bszid field to BLIS_NO_PART to indicate + // that no blocksize partitioning is performed. bli_cntl_free() will rely + // on this information to know how to step through the thrinfo_t tree in + // sync with the cntl_t tree. + cntl = bli_cntl_obj_create + ( + BLIS_NO_PART, + var_func, + params, + sub_node + ); return cntl; } -void bli_unpackm_cntl_obj_init( unpackm_t* cntl, - impl_t impl_type, - varnum_t var_num, - blksz_t* b ) -{ - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->b = b; -} - diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/1m/unpackm/bli_unpackm_cntl.h index 8a3935ba4..82d9727fc 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/1m/unpackm/bli_unpackm_cntl.h @@ -32,28 +32,23 @@ */ -struct unpackm_s +struct unpackm_params_s { - impl_t impl_type; - varnum_t var_num; - blksz_t* b; + uint64_t size; // size field must be present and come first. 
+ unpackm_voft var_func; }; -typedef struct unpackm_s unpackm_t; +typedef struct unpackm_params_s unpackm_params_t; -#define bli_cntl_sub_unpackm( cntl ) cntl->sub_unpackm -#define bli_cntl_sub_unpackm_a( cntl ) cntl->sub_unpackm_a -#define bli_cntl_sub_unpackm_a11( cntl ) cntl->sub_unpackm_a11 -#define bli_cntl_sub_unpackm_b( cntl ) cntl->sub_unpackm_b -#define bli_cntl_sub_unpackm_b11( cntl ) cntl->sub_unpackm_b11 -#define bli_cntl_sub_unpackm_c( cntl ) cntl->sub_unpackm_c -#define bli_cntl_sub_unpackm_c11( cntl ) cntl->sub_unpackm_c11 +#define bli_cntl_unpackm_params_var_func( cntl ) \ +\ + ( ( (unpackm_params_t*)(cntl)->params )->var_func ) + +// ----------------------------------------------------------------------------- + +cntl_t* bli_unpackm_cntl_obj_create + ( + void* var_func, + void* unpackm_var_func, + cntl_t* sub_node + ); -void bli_unpackm_cntl_init( void ); -void bli_unpackm_cntl_finalize( void ); -unpackm_t* bli_unpackm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - blksz_t* b ); -void bli_unpackm_cntl_obj_init( unpackm_t* cntl, - impl_t impl_type, - varnum_t var_num, - blksz_t* b ); diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/unpackm/bli_unpackm_cxk.c index a31a7f9dc..0ffaa78e5 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/unpackm/bli_unpackm_cxk.c @@ -152,15 +152,16 @@ static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ -void PASTEMAC(ch,opname)( \ - conj_t conjp, \ - dim_t m, \ - dim_t n, \ - void* beta, \ - void* p, inc_t ldp, \ - void* a, inc_t inca, inc_t lda, \ - cntx_t* cntx \ - ) \ +void PASTEMAC(ch,opname) \ + ( \ + conj_t conjp, \ + dim_t m, \ + dim_t n, \ + void* beta, \ + void* p, inc_t ldp, \ + void* a, inc_t inca, inc_t lda, \ + cntx_t* cntx \ + ) \ { \ dim_t panel_dim; \ num_t dt; \ diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 62b2b3530..b76d325b9 100644 --- 
a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -34,188 +34,43 @@ #include "blis.h" -#define FUNCPTR_T unpackm_fp - -typedef void (*FUNCPTR_T)( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl ); - -static FUNCPTR_T vars[2][3] = +void bli_unpackm_int + ( + obj_t* p, + obj_t* a, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - // unblocked optimized unblocked blocked - { bli_unpackm_unb_var1, NULL, NULL, }, - { NULL, NULL, bli_unpackm_blk_var2, }, -}; + unpackm_voft f; -void bli_unpackm_int( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl, - thrinfo_t* thread ) -{ - // The unpackm operation consists of an optional post-process: castm. - // (This post-process is analogous to the castm pre-process in packm.) - // Here are the following possible ways unpackm can execute: - // 1. unpack and cast: Unpack to a temporary matrix c and then cast - // c to a. - // 2. unpack only: Unpack directly to matrix a since typecasting is - // not needed. - // 3. cast only: Not yet supported / not used. - // 4. no-op: The control tree directs us to skip the unpack operation - // entirely. No action is taken. - - obj_t c; - - varnum_t n; - impl_t i; - FUNCPTR_T f; - - // Sanity check; A should never have a zero dimension. If we must support - // it, then we should fold it into the next alias-and-early-exit block. - //if ( bli_obj_has_zero_dim( *a ) ) bli_abort(); - - // First check if we are to skip this operation because the control tree - // is NULL, and if so, simply return. - if ( bli_cntl_is_noop( cntl ) ) - { - return; - } + // Check parameters. + if ( bli_error_checking_is_enabled() ) + bli_unpackm_int_check( p, a, cntx ); // If p was aliased to a during the pack stage (because it was already // in an acceptable packed/contiguous format), then no unpack is actually // necessary, so we return. - if ( bli_obj_is_alias_of( *p, *a ) ) - { - return; - } + if ( bli_obj_is_alias_of( *p, *a ) ) return; - // Check parameters. 
- if ( bli_error_checking_is_enabled() ) - bli_unpackm_check( p, a, cntx, cntl ); - - // Now, if we are not skipping the unpack operation, then the only - // question left is whether we are to typecast matrix a after unpacking. - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) - bli_abort(); -/* - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) - { - // Initialize an object c for the intermediate typecast matrix. - bli_unpackm_init_cast( p, - a, - &c ); - } - else -*/ - { - // If no cast is needed, then aliasing object c to the original - // matrix serves as a minor optimization. This causes the unpackm - // implementation to unpack directly into matrix a. - bli_obj_alias_to( *a, c ); - } - - // Now we are ready to proceed with the unpacking. - - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); - - // Index into the variant array to extract the correct function pointer. - f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. - if( bli_thread_am_ochief( thread ) ) { - f( p, - &c, - cntx, - cntl ); - } - bli_thread_obarrier( thread ); - - // Now, if necessary, we cast the contents of c to matrix a. If casting - // was not necessary, then we are done because the call to the unpackm - // implementation would have unpacked directly to matrix a. -/* - if ( bli_obj_datatype( *p ) != bli_obj_datatype( *a ) ) + if ( bli_thread_am_ochief( thread ) ) { - // Copy/typecast matrix c to matrix a. - // NOTE: Here, we use copynzm instead of copym because, in the cases - // where we are unpacking/typecasting a real matrix c to a complex - // matrix a, we want to touch only the real components of a, rather - // than also set the imaginary components to zero. 
This comes about - // because of the fact that, if we are unpacking real-to-complex, - // then it is because all of the computation occurred in the real - // domain, and so we would want to leave whatever imaginary values - // there are in matrix a untouched. Notice that for unpackings that - // entail complex-to-complex data movements, the copynzm operation - // behaves exactly as copym, so no use cases are lost (at least none - // that I can think of). - bli_copynzm( &c, - a ); + f + ( + p, + a, + cntx, + cntl, + thread + ); + } - // NOTE: The above code/comment is outdated. What should happen is - // as follows: - // - If dt(a) is complex and dt(p) is real, then create an alias of - // a and then tweak it so that it looks like a real domain object. - // This will involve: - // - projecting the datatype to real domain - // - scaling both the row and column strides by 2 - // ALL OF THIS should be done in the front-end, NOT here, as - // unpackm() won't even be needed in that case. - } -*/ + // Barrier so that unpacking is done before computation. + bli_thread_obarrier( thread ); } -/* -void bli_unpackm_init_cast( obj_t* p, - obj_t* a, - obj_t* c ) -{ - // The idea here is that we want to create an object c that is identical - // to object a, except that: - // (1) the storage datatype of c is equal to the target datatype of a, - // with the element size of c adjusted accordingly, - // (2) the view offset of c is reset to (0,0), - // (3) object c's main buffer is set to a new memory region acquired - // from the memory manager, or extracted from p if a mem entry is - // already available, (After acquring a mem entry from the memory - // manager, it is cached within p for quick access later on.) - // (4) object c is marked as being stored in a standard, contiguous - // format (ie: column-major order). - // Any transposition encoded within object a will also be encoded in - // object c. 
That way, unpackm handles any needed transposition during - // the unpacking, and the only thing the cast stage needs to do is cast. - - num_t dt_targ_a = bli_obj_target_datatype( *a ); - dim_t m_a = bli_obj_length( *a ); - siz_t elem_size_c = bli_datatype_size( dt_targ_a ); - - inc_t rs_c, cs_c; - - // We begin by copying the basic fields of a. - bli_obj_alias_to( *a, *c ); - - // Update datatype and element size fields. - bli_obj_set_datatype( dt_targ_a, *c ); - bli_obj_set_elem_size( elem_size_c, *c ); - - // Reset the view offsets to (0,0). - bli_obj_set_offs( 0, 0, *c ); - - // Check the mem_t entry of p associated with the cast buffer. If it is - // NULL, then acquire memory sufficient to hold the object data and cache - // it to p. (Otherwise, if it is non-NULL, then memory has already been - // acquired from the memory manager and cached.) We then set the main - // buffer of c to the cached address of the cast memory. - bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); - - // Update the strides. We set the increments to reflect column-major order - // storage. We start the leading dimension out as m(a) and increment it if - // necessary so that the beginning of each column is aligned. 
- cs_c = bli_align_dim_to_size( m_a, elem_size_c, - BLIS_HEAP_STRIDE_ALIGN_SIZE ); - rs_c = 1; - bli_obj_set_strides( rs_c, cs_c, *c ); -} -*/ diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index 6e7a26a13..26cf7877b 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -32,14 +32,12 @@ */ -void bli_unpackm_int( obj_t* p, - obj_t* a, - cntx_t* cntx, - unpackm_t* cntl, - thrinfo_t* thread ); +void bli_unpackm_int + ( + obj_t* p, + obj_t* a, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); -/* -void bli_unpackm_init_cast( obj_t* p, - obj_t* a, - obj_t* c ); -*/ diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.c b/frame/1m/unpackm/bli_unpackm_unb_var1.c index 0794f6c4f..9e86a78de 100644 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.c +++ b/frame/1m/unpackm/bli_unpackm_unb_var1.c @@ -50,10 +50,14 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1); -void bli_unpackm_unb_var1( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ) +void bli_unpackm_unb_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_pc = bli_obj_datatype( *p ); diff --git a/frame/1m/unpackm/bli_unpackm_unb_var1.h b/frame/1m/unpackm/bli_unpackm_unb_var1.h index fcb98bda5..40c921522 100644 --- a/frame/1m/unpackm/bli_unpackm_unb_var1.h +++ b/frame/1m/unpackm/bli_unpackm_unb_var1.h @@ -32,10 +32,14 @@ */ -void bli_unpackm_unb_var1( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ); +void bli_unpackm_unb_var1 + ( + obj_t* p, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ diff --git a/frame/2/gemv/bli_gemv.h b/frame/2/gemv/bli_gemv.h index b7c39613c..b4c6b4816 100644 --- a/frame/2/gemv/bli_gemv.h +++ b/frame/2/gemv/bli_gemv.h @@ -32,9 +32,10 @@ */ -#include "bli_gemv_cntl.h" -#include "bli_gemv_front.h" -#include "bli_gemv_int.h" +// NOTE: level-2 control 
tree code is temporarily disabled. +//#include "bli_gemv_cntl.h" +//#include "bli_gemv_front.h" +//#include "bli_gemv_int.h" #include "bli_gemv_var.h" diff --git a/frame/2/gemv/bli_gemv_var.h b/frame/2/gemv/bli_gemv_var.h index 9dd3f5d71..4e2a03908 100644 --- a/frame/2/gemv/bli_gemv_var.h +++ b/frame/2/gemv/bli_gemv_var.h @@ -48,7 +48,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - gemv_t* cntl \ + cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) diff --git a/frame/2/gemv/bli_gemv_var_oapi.c b/frame/2/gemv/bli_gemv_var_oapi.c index 6d27452c2..f1662c922 100644 --- a/frame/2/gemv/bli_gemv_var_oapi.c +++ b/frame/2/gemv/bli_gemv_var_oapi.c @@ -45,7 +45,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - gemv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/3/trsm/old/bli_trsm_cntx.c b/frame/2/gemv/old/bli_gemv_var_oapi.c.prev similarity index 52% rename from frame/3/trsm/old/bli_trsm_cntx.c rename to frame/2/gemv/old/bli_gemv_var_oapi.c.prev index 186c146df..771cfbf12 100644 --- a/frame/3/trsm/old/bli_trsm_cntx.c +++ b/frame/2/gemv/old/bli_gemv_var_oapi.c.prev @@ -34,43 +34,64 @@ #include "blis.h" -void bli_trsm_cntx_init( cntx_t* cntx ) -{ - // Perform basic setup on the context. 
- bli_cntx_obj_create( cntx ); +#undef GENFRONT +#define GENFRONT( ftname, opname ) \ +\ +/*static gemv_vft GENARRAY(ftypes,gemv_unb_var1);*/ \ +static GENARRAY_VFP(ftname,opname); \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* x, \ + obj_t* beta, \ + obj_t* y, \ + cntx_t* cntx, \ + gemv_t* cntl \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *a ); \ +\ + trans_t transa = bli_obj_conjtrans_status( *a ); \ + conj_t conjx = bli_obj_conj_status( *x ); \ +\ + dim_t m = bli_obj_length( *a ); \ + dim_t n = bli_obj_width( *a ); \ +\ + void* buf_a = bli_obj_buffer_at_off( *a ); \ + inc_t rs_a = bli_obj_row_stride( *a ); \ + inc_t cs_a = bli_obj_col_stride( *a ); \ +\ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_y = bli_obj_buffer_at_off( *y ); \ + inc_t incy = bli_obj_vector_inc( *y ); \ +\ + void* buf_alpha = bli_obj_buffer_for_1x1( dt, *alpha ); \ + void* buf_beta = bli_obj_buffer_for_1x1( dt, *beta ); \ +\ + PASTECH(ftname,_vft) f = PASTECH(opname,_vfp)[dt]; \ +\ + /* Invoke the void pointer-based function for the given datatype. */ \ + f( \ + transa, \ + conjx, \ + m, \ + n, \ + buf_alpha, \ + buf_a, rs_a, cs_a, \ + buf_x, incx, \ + buf_beta, \ + buf_y, incy, \ + cntx \ + ); \ +} \ - // Initialize the context with the current architecture's native - // level-3 gemm micro-kernel, and its output preferences. - bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMM_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr_prefs( BLIS_GEMM_UKR, cntx ); +GENFRONT( gemv, gemv_unb_var1 ) +GENFRONT( gemv, gemv_unb_var2 ) - // Initialize the context with the current architecture's native - // level-3 trsm micro-kernels. 
- bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMMTRSM_L_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_GEMMTRSM_U_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_L_UKR, cntx ); - bli_gks_cntx_set_l3_nat_ukr( BLIS_TRSM_U_UKR, cntx ); - - // Initialize the context with the current architecture's register - // and cache blocksizes (and multiples), given the execution method. - bli_gks_cntx_set_blkszs( BLIS_NAT, 6, - BLIS_NC, BLIS_NR, - BLIS_KC, BLIS_KR, - BLIS_MC, BLIS_MR, - BLIS_NR, BLIS_NR, - BLIS_MR, BLIS_MR, - BLIS_KR, BLIS_KR, - cntx ); - - // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); -} - -void bli_trsm_cntx_finalize( cntx_t* cntx ) -{ - // Free the context and all memory allocated to it. - bli_cntx_obj_free( cntx ); -} +GENFRONT( gemv, gemv_unf_var1 ) +GENFRONT( gemv, gemv_unf_var2 ) diff --git a/frame/2/gemv/bli_gemv_blk_var1.c b/frame/2/gemv/other/bli_gemv_blk_var1.c similarity index 100% rename from frame/2/gemv/bli_gemv_blk_var1.c rename to frame/2/gemv/other/bli_gemv_blk_var1.c diff --git a/frame/2/gemv/bli_gemv_blk_var2.c b/frame/2/gemv/other/bli_gemv_blk_var2.c similarity index 100% rename from frame/2/gemv/bli_gemv_blk_var2.c rename to frame/2/gemv/other/bli_gemv_blk_var2.c diff --git a/frame/2/gemv/bli_gemv_cntl.c b/frame/2/gemv/other/bli_gemv_cntl.c similarity index 100% rename from frame/2/gemv/bli_gemv_cntl.c rename to frame/2/gemv/other/bli_gemv_cntl.c diff --git a/frame/2/gemv/bli_gemv_cntl.h b/frame/2/gemv/other/bli_gemv_cntl.h similarity index 100% rename from frame/2/gemv/bli_gemv_cntl.h rename to frame/2/gemv/other/bli_gemv_cntl.h diff --git a/frame/2/gemv/bli_gemv_front.c b/frame/2/gemv/other/bli_gemv_front.c similarity index 100% rename from frame/2/gemv/bli_gemv_front.c rename to frame/2/gemv/other/bli_gemv_front.c diff --git a/frame/2/gemv/bli_gemv_front.h b/frame/2/gemv/other/bli_gemv_front.h similarity index 100% rename from 
frame/2/gemv/bli_gemv_front.h rename to frame/2/gemv/other/bli_gemv_front.h diff --git a/frame/2/gemv/bli_gemv_int.c b/frame/2/gemv/other/bli_gemv_int.c similarity index 100% rename from frame/2/gemv/bli_gemv_int.c rename to frame/2/gemv/other/bli_gemv_int.c diff --git a/frame/2/gemv/bli_gemv_int.h b/frame/2/gemv/other/bli_gemv_int.h similarity index 100% rename from frame/2/gemv/bli_gemv_int.h rename to frame/2/gemv/other/bli_gemv_int.h diff --git a/frame/2/ger/bli_ger.h b/frame/2/ger/bli_ger.h index dc6f9e3f9..1d92502a3 100644 --- a/frame/2/ger/bli_ger.h +++ b/frame/2/ger/bli_ger.h @@ -32,8 +32,9 @@ */ -#include "bli_ger_cntl.h" -#include "bli_ger_front.h" -#include "bli_ger_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_ger_cntl.h" +//#include "bli_ger_front.h" +//#include "bli_ger_int.h" #include "bli_ger_var.h" diff --git a/frame/2/ger/bli_ger_var.h b/frame/2/ger/bli_ger_var.h index 5833ec3f4..98451dcae 100644 --- a/frame/2/ger/bli_ger_var.h +++ b/frame/2/ger/bli_ger_var.h @@ -47,7 +47,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ - ger_t* cntl \ + cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) diff --git a/frame/2/ger/bli_ger_var_oapi.c b/frame/2/ger/bli_ger_var_oapi.c index f03452dce..5c4aa113f 100644 --- a/frame/2/ger/bli_ger_var_oapi.c +++ b/frame/2/ger/bli_ger_var_oapi.c @@ -44,7 +44,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ - ger_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/ger/bli_ger_blk_var1.c b/frame/2/ger/other/bli_ger_blk_var1.c similarity index 100% rename from frame/2/ger/bli_ger_blk_var1.c rename to frame/2/ger/other/bli_ger_blk_var1.c diff --git a/frame/2/ger/bli_ger_blk_var2.c b/frame/2/ger/other/bli_ger_blk_var2.c similarity index 100% rename from frame/2/ger/bli_ger_blk_var2.c rename to frame/2/ger/other/bli_ger_blk_var2.c diff --git a/frame/2/ger/bli_ger_cntl.c b/frame/2/ger/other/bli_ger_cntl.c similarity 
index 100% rename from frame/2/ger/bli_ger_cntl.c rename to frame/2/ger/other/bli_ger_cntl.c diff --git a/frame/2/ger/bli_ger_cntl.h b/frame/2/ger/other/bli_ger_cntl.h similarity index 100% rename from frame/2/ger/bli_ger_cntl.h rename to frame/2/ger/other/bli_ger_cntl.h diff --git a/frame/2/ger/bli_ger_front.c b/frame/2/ger/other/bli_ger_front.c similarity index 100% rename from frame/2/ger/bli_ger_front.c rename to frame/2/ger/other/bli_ger_front.c diff --git a/frame/2/ger/bli_ger_front.h b/frame/2/ger/other/bli_ger_front.h similarity index 100% rename from frame/2/ger/bli_ger_front.h rename to frame/2/ger/other/bli_ger_front.h diff --git a/frame/2/ger/bli_ger_int.c b/frame/2/ger/other/bli_ger_int.c similarity index 100% rename from frame/2/ger/bli_ger_int.c rename to frame/2/ger/other/bli_ger_int.c diff --git a/frame/2/ger/bli_ger_int.h b/frame/2/ger/other/bli_ger_int.h similarity index 100% rename from frame/2/ger/bli_ger_int.h rename to frame/2/ger/other/bli_ger_int.h diff --git a/frame/2/hemv/bli_hemv.h b/frame/2/hemv/bli_hemv.h index 07b5ff0c0..7ac4b0b13 100644 --- a/frame/2/hemv/bli_hemv.h +++ b/frame/2/hemv/bli_hemv.h @@ -32,9 +32,10 @@ */ -#include "bli_hemv_cntl.h" -#include "bli_hemv_front.h" -#include "bli_hemv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_hemv_cntl.h" +//#include "bli_hemv_front.h" +//#include "bli_hemv_int.h" #include "bli_hemv_var.h" diff --git a/frame/2/hemv/bli_hemv_var.h b/frame/2/hemv/bli_hemv_var.h index cf0e25bd4..db00df441 100644 --- a/frame/2/hemv/bli_hemv_var.h +++ b/frame/2/hemv/bli_hemv_var.h @@ -49,7 +49,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - hemv_t* cntl \ + cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) diff --git a/frame/2/hemv/bli_hemv_var_oapi.c b/frame/2/hemv/bli_hemv_var_oapi.c index c0fc00ad4..a73dbe9b3 100644 --- a/frame/2/hemv/bli_hemv_var_oapi.c +++ b/frame/2/hemv/bli_hemv_var_oapi.c @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ - hemv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/hemv/bli_hemv_blk_var1.c b/frame/2/hemv/other/bli_hemv_blk_var1.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var1.c rename to frame/2/hemv/other/bli_hemv_blk_var1.c diff --git a/frame/2/hemv/bli_hemv_blk_var2.c b/frame/2/hemv/other/bli_hemv_blk_var2.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var2.c rename to frame/2/hemv/other/bli_hemv_blk_var2.c diff --git a/frame/2/hemv/bli_hemv_blk_var3.c b/frame/2/hemv/other/bli_hemv_blk_var3.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var3.c rename to frame/2/hemv/other/bli_hemv_blk_var3.c diff --git a/frame/2/hemv/bli_hemv_blk_var4.c b/frame/2/hemv/other/bli_hemv_blk_var4.c similarity index 100% rename from frame/2/hemv/bli_hemv_blk_var4.c rename to frame/2/hemv/other/bli_hemv_blk_var4.c diff --git a/frame/2/hemv/bli_hemv_cntl.c b/frame/2/hemv/other/bli_hemv_cntl.c similarity index 100% rename from frame/2/hemv/bli_hemv_cntl.c rename to frame/2/hemv/other/bli_hemv_cntl.c diff --git a/frame/2/hemv/bli_hemv_cntl.h b/frame/2/hemv/other/bli_hemv_cntl.h similarity index 100% rename from frame/2/hemv/bli_hemv_cntl.h rename to frame/2/hemv/other/bli_hemv_cntl.h diff --git 
a/frame/2/hemv/bli_hemv_front.c b/frame/2/hemv/other/bli_hemv_front.c similarity index 100% rename from frame/2/hemv/bli_hemv_front.c rename to frame/2/hemv/other/bli_hemv_front.c diff --git a/frame/2/hemv/bli_hemv_front.h b/frame/2/hemv/other/bli_hemv_front.h similarity index 100% rename from frame/2/hemv/bli_hemv_front.h rename to frame/2/hemv/other/bli_hemv_front.h diff --git a/frame/2/hemv/bli_hemv_int.c b/frame/2/hemv/other/bli_hemv_int.c similarity index 100% rename from frame/2/hemv/bli_hemv_int.c rename to frame/2/hemv/other/bli_hemv_int.c diff --git a/frame/2/hemv/bli_hemv_int.h b/frame/2/hemv/other/bli_hemv_int.h similarity index 100% rename from frame/2/hemv/bli_hemv_int.h rename to frame/2/hemv/other/bli_hemv_int.h diff --git a/frame/2/her/bli_her.h b/frame/2/her/bli_her.h index fe9d2d84e..a9a53d569 100644 --- a/frame/2/her/bli_her.h +++ b/frame/2/her/bli_her.h @@ -32,8 +32,9 @@ */ -#include "bli_her_cntl.h" -#include "bli_her_front.h" -#include "bli_her_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_her_cntl.h" +//#include "bli_her_front.h" +//#include "bli_her_int.h" #include "bli_her_var.h" diff --git a/frame/2/her/bli_her_var.h b/frame/2/her/bli_her_var.h index 3e65e2bc4..d4c11a0b5 100644 --- a/frame/2/her/bli_her_var.h +++ b/frame/2/her/bli_her_var.h @@ -47,7 +47,7 @@ void PASTEMAC0(opname) \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ - her_t* cntl \ + cntl_t* cntl \ ); GENPROT( her_blk_var1 ) diff --git a/frame/2/her/bli_her_var_oapi.c b/frame/2/her/bli_her_var_oapi.c index a49cf62e0..3567de196 100644 --- a/frame/2/her/bli_her_var_oapi.c +++ b/frame/2/her/bli_her_var_oapi.c @@ -44,7 +44,7 @@ void PASTEMAC0(opname) \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ - her_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *c ); \ diff --git a/frame/2/her/bli_her_blk_var1.c b/frame/2/her/other/bli_her_blk_var1.c similarity index 100% rename from frame/2/her/bli_her_blk_var1.c rename to frame/2/her/other/bli_her_blk_var1.c diff --git a/frame/2/her/bli_her_blk_var2.c b/frame/2/her/other/bli_her_blk_var2.c similarity index 100% rename from frame/2/her/bli_her_blk_var2.c rename to frame/2/her/other/bli_her_blk_var2.c diff --git a/frame/2/her/bli_her_cntl.c b/frame/2/her/other/bli_her_cntl.c similarity index 100% rename from frame/2/her/bli_her_cntl.c rename to frame/2/her/other/bli_her_cntl.c diff --git a/frame/2/her/bli_her_cntl.h b/frame/2/her/other/bli_her_cntl.h similarity index 100% rename from frame/2/her/bli_her_cntl.h rename to frame/2/her/other/bli_her_cntl.h diff --git a/frame/2/her/bli_her_front.c b/frame/2/her/other/bli_her_front.c similarity index 100% rename from frame/2/her/bli_her_front.c rename to frame/2/her/other/bli_her_front.c diff --git a/frame/2/her/bli_her_front.h b/frame/2/her/other/bli_her_front.h similarity index 100% rename from frame/2/her/bli_her_front.h rename to frame/2/her/other/bli_her_front.h diff --git a/frame/2/her/bli_her_int.c b/frame/2/her/other/bli_her_int.c similarity index 100% rename from 
frame/2/her/bli_her_int.c rename to frame/2/her/other/bli_her_int.c diff --git a/frame/2/her/bli_her_int.h b/frame/2/her/other/bli_her_int.h similarity index 100% rename from frame/2/her/bli_her_int.h rename to frame/2/her/other/bli_her_int.h diff --git a/frame/2/her2/bli_her2.h b/frame/2/her2/bli_her2.h index 273b6841e..acf55b7e2 100644 --- a/frame/2/her2/bli_her2.h +++ b/frame/2/her2/bli_her2.h @@ -32,8 +32,9 @@ */ -#include "bli_her2_cntl.h" -#include "bli_her2_front.h" -#include "bli_her2_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_her2_cntl.h" +//#include "bli_her2_front.h" +//#include "bli_her2_int.h" #include "bli_her2_var.h" diff --git a/frame/2/her2/bli_her2_var.h b/frame/2/her2/bli_her2_var.h index 301b6931e..5df14c9d1 100644 --- a/frame/2/her2/bli_her2_var.h +++ b/frame/2/her2/bli_her2_var.h @@ -49,7 +49,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ - her2_t* cntl \ + cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) diff --git a/frame/2/her2/bli_her2_var_oapi.c b/frame/2/her2/bli_her2_var_oapi.c index 6c87496d6..ff345555e 100644 --- a/frame/2/her2/bli_her2_var_oapi.c +++ b/frame/2/her2/bli_her2_var_oapi.c @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ - her2_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *c ); \ diff --git a/frame/2/her2/bli_her2_blk_var1.c b/frame/2/her2/other/bli_her2_blk_var1.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var1.c rename to frame/2/her2/other/bli_her2_blk_var1.c diff --git a/frame/2/her2/bli_her2_blk_var2.c b/frame/2/her2/other/bli_her2_blk_var2.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var2.c rename to frame/2/her2/other/bli_her2_blk_var2.c diff --git a/frame/2/her2/bli_her2_blk_var3.c b/frame/2/her2/other/bli_her2_blk_var3.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var3.c rename to frame/2/her2/other/bli_her2_blk_var3.c diff --git 
a/frame/2/her2/bli_her2_blk_var4.c b/frame/2/her2/other/bli_her2_blk_var4.c similarity index 100% rename from frame/2/her2/bli_her2_blk_var4.c rename to frame/2/her2/other/bli_her2_blk_var4.c diff --git a/frame/2/her2/bli_her2_cntl.c b/frame/2/her2/other/bli_her2_cntl.c similarity index 100% rename from frame/2/her2/bli_her2_cntl.c rename to frame/2/her2/other/bli_her2_cntl.c diff --git a/frame/2/her2/bli_her2_cntl.h b/frame/2/her2/other/bli_her2_cntl.h similarity index 100% rename from frame/2/her2/bli_her2_cntl.h rename to frame/2/her2/other/bli_her2_cntl.h diff --git a/frame/2/her2/bli_her2_front.c b/frame/2/her2/other/bli_her2_front.c similarity index 100% rename from frame/2/her2/bli_her2_front.c rename to frame/2/her2/other/bli_her2_front.c diff --git a/frame/2/her2/bli_her2_front.h b/frame/2/her2/other/bli_her2_front.h similarity index 100% rename from frame/2/her2/bli_her2_front.h rename to frame/2/her2/other/bli_her2_front.h diff --git a/frame/2/her2/bli_her2_int.c b/frame/2/her2/other/bli_her2_int.c similarity index 100% rename from frame/2/her2/bli_her2_int.c rename to frame/2/her2/other/bli_her2_int.c diff --git a/frame/2/her2/bli_her2_int.h b/frame/2/her2/other/bli_her2_int.h similarity index 100% rename from frame/2/her2/bli_her2_int.h rename to frame/2/her2/other/bli_her2_int.h diff --git a/frame/2/symv/bli_symv.h b/frame/2/symv/bli_symv.h index 5195a4c50..8bb1675dc 100644 --- a/frame/2/symv/bli_symv.h +++ b/frame/2/symv/bli_symv.h @@ -32,5 +32,6 @@ */ -#include "bli_symv_front.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_symv_front.h" diff --git a/frame/2/symv/bli_symv_front.c b/frame/2/symv/other/bli_symv_front.c similarity index 100% rename from frame/2/symv/bli_symv_front.c rename to frame/2/symv/other/bli_symv_front.c diff --git a/frame/2/symv/bli_symv_front.h b/frame/2/symv/other/bli_symv_front.h similarity index 100% rename from frame/2/symv/bli_symv_front.h rename to frame/2/symv/other/bli_symv_front.h diff --git a/frame/2/syr/bli_syr.h b/frame/2/syr/bli_syr.h index 25a5e0a63..897ebe2c5 100644 --- a/frame/2/syr/bli_syr.h +++ b/frame/2/syr/bli_syr.h @@ -32,5 +32,6 @@ */ -#include "bli_syr_front.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_syr_front.h" diff --git a/frame/2/syr/bli_syr_front.c b/frame/2/syr/other/bli_syr_front.c similarity index 100% rename from frame/2/syr/bli_syr_front.c rename to frame/2/syr/other/bli_syr_front.c diff --git a/frame/2/syr/bli_syr_front.h b/frame/2/syr/other/bli_syr_front.h similarity index 100% rename from frame/2/syr/bli_syr_front.h rename to frame/2/syr/other/bli_syr_front.h diff --git a/frame/2/syr2/bli_syr2.h b/frame/2/syr2/bli_syr2.h index 39d45c6c5..22a9813ea 100644 --- a/frame/2/syr2/bli_syr2.h +++ b/frame/2/syr2/bli_syr2.h @@ -32,5 +32,6 @@ */ -#include "bli_syr2_front.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_syr2_front.h" diff --git a/frame/2/syr2/bli_syr2_front.c b/frame/2/syr2/other/bli_syr2_front.c similarity index 100% rename from frame/2/syr2/bli_syr2_front.c rename to frame/2/syr2/other/bli_syr2_front.c diff --git a/frame/2/syr2/bli_syr2_front.h b/frame/2/syr2/other/bli_syr2_front.h similarity index 100% rename from frame/2/syr2/bli_syr2_front.h rename to frame/2/syr2/other/bli_syr2_front.h diff --git a/frame/2/trmv/bli_trmv.h b/frame/2/trmv/bli_trmv.h index 242642a91..8410af719 100644 --- a/frame/2/trmv/bli_trmv.h +++ b/frame/2/trmv/bli_trmv.h @@ -32,9 +32,10 @@ */ -#include "bli_trmv_cntl.h" -#include "bli_trmv_front.h" -#include "bli_trmv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. +//#include "bli_trmv_cntl.h" +//#include "bli_trmv_front.h" +//#include "bli_trmv_int.h" #include "bli_trmv_var.h" diff --git a/frame/2/trmv/bli_trmv_var.h b/frame/2/trmv/bli_trmv_var.h index cca3be140..23680469e 100644 --- a/frame/2/trmv/bli_trmv_var.h +++ b/frame/2/trmv/bli_trmv_var.h @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trmv_t* cntl \ + cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) diff --git a/frame/2/trmv/bli_trmv_var_oapi.c b/frame/2/trmv/bli_trmv_var_oapi.c index 75926054b..b3c0bc147 100644 --- a/frame/2/trmv/bli_trmv_var_oapi.c +++ b/frame/2/trmv/bli_trmv_var_oapi.c @@ -43,7 +43,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trmv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/trmv/bli_trmv_cntl.c b/frame/2/trmv/other/bli_trmv_cntl.c similarity index 100% rename from frame/2/trmv/bli_trmv_cntl.c rename to frame/2/trmv/other/bli_trmv_cntl.c diff --git a/frame/2/trmv/bli_trmv_cntl.h b/frame/2/trmv/other/bli_trmv_cntl.h similarity index 100% rename from frame/2/trmv/bli_trmv_cntl.h rename to frame/2/trmv/other/bli_trmv_cntl.h diff --git a/frame/2/trmv/bli_trmv_front.c b/frame/2/trmv/other/bli_trmv_front.c similarity index 
100% rename from frame/2/trmv/bli_trmv_front.c rename to frame/2/trmv/other/bli_trmv_front.c diff --git a/frame/2/trmv/bli_trmv_front.h b/frame/2/trmv/other/bli_trmv_front.h similarity index 100% rename from frame/2/trmv/bli_trmv_front.h rename to frame/2/trmv/other/bli_trmv_front.h diff --git a/frame/2/trmv/bli_trmv_int.c b/frame/2/trmv/other/bli_trmv_int.c similarity index 100% rename from frame/2/trmv/bli_trmv_int.c rename to frame/2/trmv/other/bli_trmv_int.c diff --git a/frame/2/trmv/bli_trmv_int.h b/frame/2/trmv/other/bli_trmv_int.h similarity index 100% rename from frame/2/trmv/bli_trmv_int.h rename to frame/2/trmv/other/bli_trmv_int.h diff --git a/frame/2/trmv/bli_trmv_l_blk_var1.c b/frame/2/trmv/other/bli_trmv_l_blk_var1.c similarity index 100% rename from frame/2/trmv/bli_trmv_l_blk_var1.c rename to frame/2/trmv/other/bli_trmv_l_blk_var1.c diff --git a/frame/2/trmv/bli_trmv_l_blk_var2.c b/frame/2/trmv/other/bli_trmv_l_blk_var2.c similarity index 100% rename from frame/2/trmv/bli_trmv_l_blk_var2.c rename to frame/2/trmv/other/bli_trmv_l_blk_var2.c diff --git a/frame/2/trmv/bli_trmv_u_blk_var1.c b/frame/2/trmv/other/bli_trmv_u_blk_var1.c similarity index 100% rename from frame/2/trmv/bli_trmv_u_blk_var1.c rename to frame/2/trmv/other/bli_trmv_u_blk_var1.c diff --git a/frame/2/trmv/bli_trmv_u_blk_var2.c b/frame/2/trmv/other/bli_trmv_u_blk_var2.c similarity index 100% rename from frame/2/trmv/bli_trmv_u_blk_var2.c rename to frame/2/trmv/other/bli_trmv_u_blk_var2.c diff --git a/frame/2/trsv/bli_trsv.h b/frame/2/trsv/bli_trsv.h index 7b51ed69a..9d9384422 100644 --- a/frame/2/trsv/bli_trsv.h +++ b/frame/2/trsv/bli_trsv.h @@ -32,9 +32,10 @@ */ -#include "bli_trsv_cntl.h" -#include "bli_trsv_front.h" -#include "bli_trsv_int.h" +// NOTE: level-2 control tree code is temporarily disabled. 
+//#include "bli_trsv_cntl.h" +//#include "bli_trsv_front.h" +//#include "bli_trsv_int.h" #include "bli_trsv_var.h" diff --git a/frame/2/trsv/bli_trsv_var.h b/frame/2/trsv/bli_trsv_var.h index bc66f49ff..395d89d5d 100644 --- a/frame/2/trsv/bli_trsv_var.h +++ b/frame/2/trsv/bli_trsv_var.h @@ -46,7 +46,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trsv_t* cntl \ + cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) diff --git a/frame/2/trsv/bli_trsv_var_oapi.c b/frame/2/trsv/bli_trsv_var_oapi.c index f38a5123f..e26bb3abd 100644 --- a/frame/2/trsv/bli_trsv_var_oapi.c +++ b/frame/2/trsv/bli_trsv_var_oapi.c @@ -43,7 +43,7 @@ void PASTEMAC0(opname) \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ - trsv_t* cntl \ + cntl_t* cntl \ ) \ { \ num_t dt = bli_obj_datatype( *a ); \ diff --git a/frame/2/trsv/bli_trsv_cntl.c b/frame/2/trsv/other/bli_trsv_cntl.c similarity index 100% rename from frame/2/trsv/bli_trsv_cntl.c rename to frame/2/trsv/other/bli_trsv_cntl.c diff --git a/frame/2/trsv/bli_trsv_cntl.h b/frame/2/trsv/other/bli_trsv_cntl.h similarity index 100% rename from frame/2/trsv/bli_trsv_cntl.h rename to frame/2/trsv/other/bli_trsv_cntl.h diff --git a/frame/2/trsv/bli_trsv_front.c b/frame/2/trsv/other/bli_trsv_front.c similarity index 100% rename from frame/2/trsv/bli_trsv_front.c rename to frame/2/trsv/other/bli_trsv_front.c diff --git a/frame/2/trsv/bli_trsv_front.h b/frame/2/trsv/other/bli_trsv_front.h similarity index 100% rename from frame/2/trsv/bli_trsv_front.h rename to frame/2/trsv/other/bli_trsv_front.h diff --git a/frame/2/trsv/bli_trsv_int.c b/frame/2/trsv/other/bli_trsv_int.c similarity index 100% rename from frame/2/trsv/bli_trsv_int.c rename to frame/2/trsv/other/bli_trsv_int.c diff --git a/frame/2/trsv/bli_trsv_int.h b/frame/2/trsv/other/bli_trsv_int.h similarity index 100% rename from frame/2/trsv/bli_trsv_int.h rename to frame/2/trsv/other/bli_trsv_int.h diff --git a/frame/2/trsv/bli_trsv_l_blk_var1.c 
b/frame/2/trsv/other/bli_trsv_l_blk_var1.c similarity index 100% rename from frame/2/trsv/bli_trsv_l_blk_var1.c rename to frame/2/trsv/other/bli_trsv_l_blk_var1.c diff --git a/frame/2/trsv/bli_trsv_l_blk_var2.c b/frame/2/trsv/other/bli_trsv_l_blk_var2.c similarity index 100% rename from frame/2/trsv/bli_trsv_l_blk_var2.c rename to frame/2/trsv/other/bli_trsv_l_blk_var2.c diff --git a/frame/2/trsv/bli_trsv_u_blk_var1.c b/frame/2/trsv/other/bli_trsv_u_blk_var1.c similarity index 100% rename from frame/2/trsv/bli_trsv_u_blk_var1.c rename to frame/2/trsv/other/bli_trsv_u_blk_var1.c diff --git a/frame/2/trsv/bli_trsv_u_blk_var2.c b/frame/2/trsv/other/bli_trsv_u_blk_var2.c similarity index 100% rename from frame/2/trsv/bli_trsv_u_blk_var2.c rename to frame/2/trsv/other/bli_trsv_u_blk_var2.c diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 9f17349af..ea7926d32 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -33,13 +33,17 @@ */ #include "bli_l3_cntx.h" +#include "bli_l3_cntl.h" #include "bli_l3_check.h" #include "bli_l3_ft.h" #include "bli_l3_oft.h" +#include "bli_l3_voft.h" #include "bli_l3_blocksize.h" +#include "bli_l3_direct.h" #include "bli_l3_prune.h" +#include "bli_l3_packm.h" // Prototype object APIs with and without contexts. 
#include "bli_oapi_w_cntx.h" diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 97556dedd..630cf03a5 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -35,17 +35,78 @@ #include "blis.h" +dim_t bli_l3_determine_kc + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* a, + obj_t* b, + bszid_t bszid, + cntx_t* cntx + ) +{ + opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) + return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_HERK ) + return bli_herk_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_TRMM ) + return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + else if ( family == BLIS_TRSM ) + return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + + // This should never execute. + return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); +} + +// ----------------------------------------------------------------------------- + +// +// NOTE: We call a gemm/hemm/symm, trmm, or trsm-specific blocksize +// function to determine the kc blocksize so that we can implement the +// "nudging" of kc to be a multiple of mr or nr, as needed. 
+// + +#undef GENFRONT +#define GENFRONT( opname, l3op ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ) \ +{ \ + if ( direct == BLIS_FWD ) \ + return PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ); \ + else \ + return PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ); \ +} + +GENFRONT( gemm_determine_kc, gemm ) +GENFRONT( herk_determine_kc, trmm ) +GENFRONT( trmm_determine_kc, trmm ) +GENFRONT( trsm_determine_kc, trsm ) + +// ----------------------------------------------------------------------------- + #undef GENFRONT #define GENFRONT( opname, chdir ) \ \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -90,6 +151,8 @@ dim_t PASTEMAC0(opname) \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ } \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ -105,12 +168,64 @@ GENFRONT( gemm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ) \ +{ \ + num_t dt; \ + blksz_t* bsize; \ + dim_t b_alg, b_max; \ + dim_t b_use; \ + \ + /* bli_*_determine_kc_f(): + + We assume that this function is being called from an algorithm that + is moving "forward" (ie: top to bottom, left to right, top-left + to bottom-right). */ \ +\ + /* bli_*_determine_kc_b(): + + We assume that this function is being called from an algorithm that + is moving "backward" (ie: bottom to top, right to left, bottom-right + to top-left). 
*/ \ +\ + /* Extract the execution datatype and use it to query the corresponding + blocksize and blocksize maximum values from the blksz_t object. */ \ + dt = bli_obj_execution_datatype( *a ); \ + bsize = bli_cntx_get_blksz( bszid, cntx ); \ + b_alg = bli_blksz_get_def( dt, bsize ); \ + b_max = bli_blksz_get_max( dt, bsize ); \ +\ + /* Notice that for herk, we do not need to perform any special handling + for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \ +\ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ + b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ +\ + return b_use; \ +} + +GENFRONT( herk_determine_kc_f, f ) +GENFRONT( herk_determine_kc_b, b ) + +// ----------------------------------------------------------------------------- + +#undef GENFRONT +#define GENFRONT( opname, chdir ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -149,6 +264,8 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ -164,12 +281,12 @@ GENFRONT( trmm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ) \ { \ num_t dt; \ @@ -206,6 +323,8 @@ dim_t PASTEMAC0(opname) \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ + /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined + in bli_blksz.c */ \ b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ \ return b_use; \ @@ 
-214,282 +333,3 @@ dim_t PASTEMAC0(opname) \ GENFRONT( trsm_determine_kc_f, f ) GENFRONT( trsm_determine_kc_b, b ) - - - - - - - - - -#if 0 -dim_t bli_gemm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if A is Hermitian or symmetric, or NR if B is - // Hermitian or symmetric. If neither case applies, then we leave - // the blocksizes unchanged. - if ( bli_obj_root_is_herm_or_symm( *a ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - else if ( bli_obj_root_is_herm_or_symm( *b ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_gemm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). 
- - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if A is Hermitian or symmetric, or NR if B is - // Hermitian or symmetric. If neither case applies, then we leave - // the blocksizes unchanged. - if ( bli_obj_root_is_herm_or_symm( *a ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - else if ( bli_obj_root_is_herm_or_symm( *b ) ) - { - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - } - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -// ----------------------------------------------------------------------------- - -dim_t bli_trmm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if the triangular matrix is on the left, or NR - // if the triangular matrix is one the right. 
- if ( bli_obj_root_is_triangular( *a ) ) - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - else - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_trmm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* a, - obj_t* b, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *a ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR if the triangular matrix is on the left, or NR - // if the triangular matrix is one the right. 
- if ( bli_obj_root_is_triangular( *a ) ) - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - else - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); - - b_alg = bli_align_dim_to_mult( b_alg, mnr ); - b_max = bli_align_dim_to_mult( b_max, mnr ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -// ----------------------------------------------------------------------------- - -dim_t bli_trsm_determine_kc_f - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "forward" (ie: top to bottom, left to right, top-left - // to bottom-right). - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR. We always use MR (rather than sometimes using NR) - // because even when the triangle is on the right, packing of that - // matrix uses MR, since only left-side trsm micro-kernels are - // supported. - mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mr ); - b_max = bli_align_dim_to_mult( b_max, mr ); - - b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_trsm_determine_kc_b - ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx - ) -{ - num_t dt; - blksz_t* bsize; - dim_t mnr; - dim_t b_alg, b_max; - dim_t b_use; - - // We assume that this function is being called from an algorithm that - // is moving "backward" (ie: bottom to top, right to left, bottom-right - // to top-left). 
- - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. - dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - // Nudge the default and maximum kc blocksizes up to the nearest - // multiple of MR. We always use MR (rather than sometimes using NR) - // because even when the triangle is on the right, packing of that - // matrix uses MR, since only left-side trsm micro-kernels are - // supported. - mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); - b_alg = bli_align_dim_to_mult( b_alg, mr ); - b_max = bli_align_dim_to_mult( b_max, mr ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -#endif diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 01e10c3fe..8f9f7ad80 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -38,17 +38,42 @@ \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ - obj_t* a, \ - obj_t* b, \ - bszid_t bszid, \ - cntx_t* cntx \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ + ); + +GENPROT( l3_determine_kc ) + +GENPROT( gemm_determine_kc ) +GENPROT( herk_determine_kc ) +GENPROT( trmm_determine_kc ) +GENPROT( trsm_determine_kc ) + + +#undef GENPROT +#define GENPROT( opname ) \ +\ +dim_t PASTEMAC0(opname) \ + ( \ + dim_t i, \ + dim_t dim, \ + obj_t* a, \ + obj_t* b, \ + bszid_t bszid, \ + cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) +GENPROT( herk_determine_kc_f ) +GENPROT( herk_determine_kc_b ) + GENPROT( trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 48249a9b3..e901f2766 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -226,29 +226,6 @@ void bli_syr2k_check 
bli_check_error_code( e_val ); } -#if 0 -void bli_trmm_check - ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx - ) -{ - err_t e_val; - - // Perform checks common to hemm/symm. - - bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx ); - - // Check object structure. - - e_val = bli_check_triangular_object( a ); - bli_check_error_code( e_val ); -} -#endif - void bli_trmm_check ( side_t side, diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c new file mode 100644 index 000000000..a8dfee1ba --- /dev/null +++ b/frame/3/bli_l3_cntl.c @@ -0,0 +1,114 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + + +void bli_l3_cntl_create_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t** cntl_use + ) +{ + // If the control tree pointer is NULL, we construct a default + // tree as a function of the operation family. + if ( cntl_orig == NULL ) + { + opid_t family = bli_cntx_get_family( cntx ); + + if ( family == BLIS_GEMM || + family == BLIS_HERK || + family == BLIS_TRMM ) + { + *cntl_use = bli_gemm_cntl_create( family ); + } + else // if ( family == BLIS_TRSM ) + { + side_t side; + + if ( bli_obj_is_triangular( *a ) ) side = BLIS_LEFT; + else side = BLIS_RIGHT; + + *cntl_use = bli_trsm_cntl_create( side ); + } + } + else + { + // If the user provided a control tree, create a copy and use it + // instead (so that it can be used to cache things like pack mem_t + // entries). + *cntl_use = bli_cntl_copy( cntl_orig ); + } +} + +void bli_l3_cntl_free_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t* cntl_use, + thrinfo_t* thread + ) +{ + // If the control tree pointer is NULL, a default tree would have + // been created, so we now must free it. 
+ if ( cntl_orig == NULL ) + { + opid_t family = bli_cntx_get_family( cntx ); + + if ( family == BLIS_GEMM || + family == BLIS_HERK || + family == BLIS_TRMM ) + { + bli_gemm_cntl_free( cntl_use, thread ); + } + else // if ( family == BLIS_TRSM ) + { + bli_trsm_cntl_free( cntl_use, thread ); + } + } + else + { + // If the user provided a control tree, free the copy of it that + // was created. + bli_cntl_free( cntl_use, thread ); + } +} + diff --git a/frame/include/bli_malloc_prototypes.h b/frame/3/bli_l3_cntl.h similarity index 79% rename from frame/include/bli_malloc_prototypes.h rename to frame/3/bli_l3_cntl.h index e828f99aa..dc0aeb869 100644 --- a/frame/include/bli_malloc_prototypes.h +++ b/frame/3/bli_l3_cntl.h @@ -32,19 +32,29 @@ */ -#ifndef BLIS_MALLOC_PROTOTYPES_H -#define BLIS_MALLOC_PROTOTYPES_H -// Generate prototypes for each of the malloc() and free() functions -// defined in BLIS +// +// Prototype conditional control tree creation functions. +// -void* BLIS_MALLOC_POOL( size_t size ); -void BLIS_FREE_POOL( void* p ); +void bli_l3_cntl_create_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t** cntl_use + ); -void* BLIS_MALLOC_INTL( size_t size ); -void BLIS_FREE_INTL( void* p ); +void bli_l3_cntl_free_if + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl_orig, + cntl_t* cntl_use, + thrinfo_t* thread + ); -void* BLIS_MALLOC_USER( size_t size ); -void BLIS_FREE_USER( void* p ); - -#endif diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c new file mode 100644 index 000000000..993501541 --- /dev/null +++ b/frame/3/bli_l3_direct.c @@ -0,0 +1,140 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +dir_t bli_l3_direct + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ) +{ + // Query the operation family. 
+ opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); + else if ( family == BLIS_HERK ) return bli_herk_direct( a, b, c ); + else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c ); + else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c ); + + // This should never execute. + return BLIS_FWD; +} + +// ----------------------------------------------------------------------------- + +dir_t bli_gemm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + // For gemm, movement may be forwards (or backwards). + + return BLIS_FWD; +} + +dir_t bli_herk_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + // For herk, movement may be forwards (or backwards). + + return BLIS_FWD; +} + +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; + + // For trmm, movement for the parameter cases is as follows: + // - left,lower: backwards + // - left,upper: forwards + // - right,lower: forwards + // - right,upper: backwards + + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + + return direct; +} + +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; + + // For trsm, movement for the parameter cases is as follows: + // - left,lower: forwards + // - left,upper: backwards + // - right,lower: backwards + // - right,upper: forwards + + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + + return direct; +} + diff --git a/frame/3/herk/old/bli_herk_thread.h b/frame/3/bli_l3_direct.h similarity index 79% rename from 
frame/3/herk/old/bli_herk_thread.h rename to frame/3/bli_l3_direct.h index 1feafd113..7b88ba51f 100644 --- a/frame/3/herk/old/bli_herk_thread.h +++ b/frame/3/bli_l3_direct.h @@ -32,13 +32,28 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +dir_t bli_l3_direct + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ); -// For use in herk micro-kernel -#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way ) -#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way ) +// ----------------------------------------------------------------------------- -//thrinfo_t** bli_herk_thrinfo_create_paths( void ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +dir_t PASTEMAC0(opname) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ); + +GENPROT( gemm_direct ) +GENPROT( herk_direct ) +GENPROT( trmm_direct ) +GENPROT( trsm_direct ) diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c new file mode 100644 index 000000000..28fb1f857 --- /dev/null +++ b/frame/3/bli_l3_packm.c @@ -0,0 +1,179 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_l3_packm + ( + obj_t* x, + obj_t* x_pack, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + membrk_t* membrk; + packbuf_t pack_buf_type; + mem_t* cntl_mem_p; + siz_t size_needed; + + // FGVZ: Not sure why we need this barrier, but we do. + bli_thread_obarrier( thread ); + + // Every thread initializes x_pack and determines the size of memory + // block needed (which gets embedded into the otherwise "blank" mem_t + // entry in the control tree node). + size_needed + = + bli_packm_init + ( + x, + x_pack, + cntx, + cntl + ); + + // If zero was returned, no memory needs to be allocated and so we can + // return early. + if ( size_needed == 0 ) return; + + // Query the memory broker from the context. + membrk = bli_cntx_get_membrk( cntx ); + + // Query the pack buffer type from the control tree node. + pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + + // Query the address of the mem_t entry within the control tree node. 
+ cntl_mem_p = bli_cntl_pack_mem( cntl ); + + // Check the mem_t field in the control tree. If it is unallocated, then + // we need to acquire a block from the memory broker and broadcast it to + // all threads in the chief's thread group. + if ( bli_mem_is_unalloc( cntl_mem_p ) ) + { + mem_t* local_mem_p; + mem_t local_mem_s; + + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread acquires a block from the memory broker + // and saves the associated mem_t entry to local_mem_s. + bli_membrk_acquire_m + ( + membrk, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads. + local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + + // Save the contents of the chief thread's local mem_t entry to the + // mem_t field in this thread's control tree node. + *cntl_mem_p = *local_mem_p; + } + else // ( bli_mem_is_alloc( cntl_mem_p ) ) + { + mem_t* local_mem_p; + mem_t local_mem_s; + + // If the mem_t entry in the control tree does NOT contain a NULL + // buffer, then a block has already been acquired from the memory + // broker and cached in the control tree. + + // BUT, we need to make sure that the mem_t object is not associated + // with a block that is too small given the size of the packed matrix + // that we need, according to the return value from packm_init(). + siz_t cntl_mem_size = bli_mem_size( cntl_mem_p ); + + if ( cntl_mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + // The chief thread releases the existing block associated with + // the mem_t entry in the control tree, and then re-acquires a + // new block, saving the associated mem_t entry to local_mem_s. + bli_membrk_release( cntl_mem_p ); + bli_membrk_acquire_m + ( + membrk, + size_needed, + pack_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads.
+ local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + + // Save the chief thread's local mem_t entry to the mem_t field in + // this thread's control tree node. + *cntl_mem_p = *local_mem_p; + } + else + { + // If the mem_t entry is already allocated and sufficiently large, + // then we use it as-is. No action is needed, because all threads + // will already have the cached values in their local control + // trees' mem_t entries, currently pointed to by cntl_mem_p. + + bli_thread_obarrier( thread ); + } + } + + + // Update the buffer address in x_pack to point to the buffer associated + // with the mem_t entry acquired from the memory broker (now cached in + // the control tree node). + bli_obj_set_buffer_to_mem( cntl_mem_p, *x_pack ); + + + // Pack the contents of object x to object x_pack. + bli_packm_int + ( + x, + x_pack, + cntx, + cntl, + thread + ); + + // Barrier so that packing is done before computation. + bli_thread_obarrier( thread ); +} + diff --git a/frame/3/trsm/old/bli_trsm_blk_var2f.h b/frame/3/bli_l3_packm.h similarity index 89% rename from frame/3/trsm/old/bli_trsm_blk_var2f.h rename to frame/3/bli_l3_packm.h index 8b5d2dd7e..7dc5dfb46 100644 --- a/frame/3/trsm/old/bli_trsm_blk_var2f.h +++ b/frame/3/bli_l3_packm.h @@ -32,9 +32,14 @@ */ -void bli_trsm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); +#include "blis.h" + +void bli_l3_packm + ( + obj_t* x, + obj_t* x_pack, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index a8c853c56..f908bbb64 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -34,6 +34,86 @@ #include "blis.h" +/* +void bli_l3_prune_unref_mparts_m + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx + ) +{ + // Query the operation family. + opid_t family = bli_cntx_family( cntx ); + + if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. 
+ else if ( family == BLIS_HERK ) bli_herk_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); + else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); +} +*/ + +#undef GENFRONT +#define GENFRONT( dim ) \ +\ +void PASTEMAC(l3_prune_unref_mparts_,dim) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx \ + ) \ +{ \ + /* Query the operation family. */ \ + opid_t family = bli_cntx_family( cntx ); \ +\ + if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \ + else if ( family == BLIS_HERK ) PASTEMAC(herk_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ + else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ +} + +GENFRONT( m ) +GENFRONT( n ) +GENFRONT( k ) + +// ----------------------------------------------------------------------------- + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC(opname,_prune_unref_mparts_m) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. */ \ +} \ +void PASTEMAC(opname,_prune_unref_mparts_n) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. */ \ +} \ +void PASTEMAC(opname,_prune_unref_mparts_k) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c \ + ) \ +{ \ + /* No pruning is necessary for gemm. 
*/ \ +} + +GENFRONT( gemm ) + +// ----------------------------------------------------------------------------- + #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index b4870407d..13d661ff1 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -33,6 +33,23 @@ */ +#undef GENPROT +#define GENPROT( dim ) \ +\ +void PASTEMAC(l3_prune_unref_mparts_,dim) \ + ( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx \ + ); + +GENPROT( m ) +GENPROT( n ) +GENPROT( k ) + +// ----------------------------------------------------------------------------- + #undef GENPROT #define GENPROT( opname, dim ) \ \ @@ -43,6 +60,10 @@ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ obj_t* c \ ); +GENPROT( gemm, m ) +GENPROT( gemm, n ) +GENPROT( gemm, k ) + GENPROT( herk, m ) GENPROT( herk, n ) GENPROT( herk, k ) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 0bea43e9d..33027a1e8 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -35,55 +35,45 @@ #include "blis.h" #include "assert.h" +#if 0 thrinfo_t* bli_l3_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ) { return bli_thrinfo_create ( ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + TRUE, + sub_node ); } +#endif void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ) { bli_thrinfo_init ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + TRUE, + sub_node ); } @@ -101,124 +91,235 @@ void bli_l3_thrinfo_free ) { if ( thread == NULL || - thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_HERK_SINGLE_THREADED + 
thread == &BLIS_PACKM_SINGLE_THREADED || + thread == &BLIS_GEMM_SINGLE_THREADED ) return; - // Free Communicators - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( thread->ocomm ); - if ( bli_thrinfo_sub_self( thread ) == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( thread->icomm ); + thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); - // Free thrinfo chidren - bli_packm_thrinfo_free( thread->opackm ); - bli_packm_thrinfo_free( thread->ipackm ); - bli_l3_thrinfo_free( thread->sub_self ); + // Free the communicators, but only if the current thrinfo_t struct + // is marked as needing them to be freed. The most common example of + // thrinfo_t nodes NOT marked as needing their comms freed are those + // associated with packm thrinfo_t nodes. + if ( bli_thrinfo_needs_free_comm( thread ) ) + { + // The ochief always frees his communicator, and the ichief free its + // communicator if we are at the leaf node. + if ( bli_thread_am_ochief( thread ) ) + bli_thrcomm_free( bli_thrinfo_ocomm( thread ) ); + } + + // Free all children of the current thrinfo_t. + bli_l3_thrinfo_free( thrinfo_sub_node ); + + // Free the thrinfo_t struct. bli_free_intl( thread ); } // ----------------------------------------------------------------------------- -thrinfo_t** bli_l3_thrinfo_create_paths +void bli_l3_thrinfo_create_root ( - opid_t l3_op, - side_t side + dim_t id, + thrcomm_t* gl_comm, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ) { - dim_t jc_in, jc_way; - dim_t kc_in, kc_way; - dim_t ic_in, ic_way; - dim_t jr_in, jr_way; - dim_t ir_in, ir_way; + // Query the global communicator for the total number of threads to use. 
+ dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); -#ifdef BLIS_ENABLE_MULTITHREADING - jc_in = bli_env_read_nway( "BLIS_JC_NT" ); - //kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - kc_in = 1; - ic_in = bli_env_read_nway( "BLIS_IC_NT" ); - jr_in = bli_env_read_nway( "BLIS_JR_NT" ); - ir_in = bli_env_read_nway( "BLIS_IR_NT" ); -#else - jc_in = 1; - kc_in = 1; - ic_in = 1; - jr_in = 1; - ir_in = 1; -#endif + // Use the thread id passed in as the global communicator id. + dim_t gl_comm_id = id; - if ( l3_op == BLIS_TRMM ) - { - // We reconfigure the parallelism for trmm_r due to a dependency in - // the jc loop. (NOTE: This dependency does not exist for trmm3.) - if ( bli_is_right( side ) ) - { - jc_way = 1; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in * jc_in; - ir_way = ir_in; - } - else // if ( bli_is_left( side ) ) - { - jc_way = jc_in; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in; - ir_way = ir_in; - } - } - else if ( l3_op == BLIS_TRSM ) - { - if ( bli_is_right( side ) ) - { + // Use the blocksize id of the current (root) control tree node to + // query the top-most ways of parallelism to obtain. + bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx ); - jc_way = 1; - kc_way = 1; - ic_way = jc_in * ic_in * jr_in; - jr_way = 1; - ir_way = 1; - } - else // if ( bli_is_left( side ) ) - { - jc_way = 1; - kc_way = 1; - ic_way = 1; - jr_way = ic_in * jr_in * ir_in; - ir_way = 1; - } - } - else // all other level-3 operations + // Determine the work id for this thrinfo_t node. + dim_t work_id = gl_comm_id / ( n_threads / xx_way ); + + // Create the root thrinfo_t node. 
+ *thread = bli_thrinfo_create + ( + gl_comm, + gl_comm_id, + xx_way, + work_id, + TRUE, + NULL + ); +} + +// ----------------------------------------------------------------------------- + +void bli_l3_thrinfo_print_paths + ( + thrinfo_t** threads + ) +{ + dim_t n_threads = bli_thread_num_threads( threads[0] ); + dim_t gl_comm_id; + + thrinfo_t* jc_info = threads[0]; + thrinfo_t* pc_info = bli_thrinfo_sub_node( jc_info ); + thrinfo_t* pb_info = bli_thrinfo_sub_node( pc_info ); + thrinfo_t* ic_info = bli_thrinfo_sub_node( pb_info ); + thrinfo_t* pa_info = bli_thrinfo_sub_node( ic_info ); + thrinfo_t* jr_info = bli_thrinfo_sub_node( pa_info ); + thrinfo_t* ir_info = bli_thrinfo_sub_node( jr_info ); + + dim_t jc_way = bli_thread_n_way( jc_info ); + dim_t pc_way = bli_thread_n_way( pc_info ); + dim_t pb_way = bli_thread_n_way( pb_info ); + dim_t ic_way = bli_thread_n_way( ic_info ); + dim_t pa_way = bli_thread_n_way( pa_info ); + dim_t jr_way = bli_thread_n_way( jr_info ); + dim_t ir_way = bli_thread_n_way( ir_info ); + + dim_t gl_nt = bli_thread_num_threads( jc_info ); + dim_t jc_nt = bli_thread_num_threads( pc_info ); + dim_t pc_nt = bli_thread_num_threads( pb_info ); + dim_t pb_nt = bli_thread_num_threads( ic_info ); + dim_t ic_nt = bli_thread_num_threads( pa_info ); + dim_t pa_nt = bli_thread_num_threads( jr_info ); + dim_t jr_nt = bli_thread_num_threads( ir_info ); + + printf( " gl jc kc pb ic pa jr ir\n" ); + printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + gl_nt, jc_nt, pc_nt, pb_nt, ic_nt, pa_nt, jr_nt, (dim_t)1 ); + printf( "\n" ); + printf( " jc kc pb ic pa jr ir\n" ); + printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", + jc_way, pc_way, pb_way, ic_way, pa_way, jr_way, ir_way ); + printf( "=================================================\n" ); + + for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id ) { - jc_way = jc_in; - kc_way = kc_in; - ic_way = ic_in; - jr_way = jr_in; - ir_way = ir_in; + jc_info = threads[gl_comm_id]; + 
pc_info = bli_thrinfo_sub_node( jc_info ); + pb_info = bli_thrinfo_sub_node( pc_info ); + ic_info = bli_thrinfo_sub_node( pb_info ); + pa_info = bli_thrinfo_sub_node( ic_info ); + jr_info = bli_thrinfo_sub_node( pa_info ); + ir_info = bli_thrinfo_sub_node( jr_info ); + + dim_t gl_comm_id = bli_thread_ocomm_id( jc_info ); + dim_t jc_comm_id = bli_thread_ocomm_id( pc_info ); + dim_t pc_comm_id = bli_thread_ocomm_id( pb_info ); + dim_t pb_comm_id = bli_thread_ocomm_id( ic_info ); + dim_t ic_comm_id = bli_thread_ocomm_id( pa_info ); + dim_t pa_comm_id = bli_thread_ocomm_id( jr_info ); + dim_t jr_comm_id = bli_thread_ocomm_id( ir_info ); + + dim_t jc_work_id = bli_thread_work_id( jc_info ); + dim_t pc_work_id = bli_thread_work_id( pc_info ); + dim_t pb_work_id = bli_thread_work_id( pb_info ); + dim_t ic_work_id = bli_thread_work_id( ic_info ); + dim_t pa_work_id = bli_thread_work_id( pa_info ); + dim_t jr_work_id = bli_thread_work_id( jr_info ); + dim_t ir_work_id = bli_thread_work_id( ir_info ); + +printf( " gl jc pb kc pa ic jr \n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id ); +printf( "work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n", +jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id ); +printf( "---------------------------------------\n" ); } +} - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); +// ----------------------------------------------------------------------------- - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; +#if 0 +thrinfo_t** bli_l3_thrinfo_create_roots + ( + cntx_t* cntx, + cntl_t* cntl + ) +{ + // Query the context for the total number of threads to use. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); + + // Create a global thread communicator for all the threads. 
+ thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + + // Allocate an array of thrinfo_t pointers, one for each thread. + thrinfo_t** paths = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) ); + + // Use the blocksize id of the current (root) control tree node to + // query the top-most ways of parallelism to obtain. + bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t xx_way = bli_cntx_way_for_bszid( bszid, cntx ); + + dim_t gl_comm_id; + + // Create one thrinfo_t node for each thread in the (global) communicator. + for ( gl_comm_id = 0; gl_comm_id < n_threads; ++gl_comm_id ) + { + dim_t work_id = gl_comm_id / ( n_threads / xx_way ); + + paths[ gl_comm_id ] = bli_thrinfo_create + ( + gl_comm, + gl_comm_id, + xx_way, + work_id, + TRUE, + NULL + ); + } + + return paths; +} + +//#define PRINT_THRINFO + +thrinfo_t** bli_l3_thrinfo_create_full_paths + ( + cntx_t* cntx + ) +{ + dim_t jc_way = bli_cntx_jc_way( cntx ); + dim_t pc_way = bli_cntx_pc_way( cntx ); + dim_t ic_way = bli_cntx_ic_way( cntx ); + dim_t jr_way = bli_cntx_jr_way( cntx ); + dim_t ir_way = bli_cntx_ir_way( cntx ); + + dim_t gl_nt = jc_way * pc_way * ic_way * jr_way * ir_way; + dim_t jc_nt = pc_way * ic_way * jr_way * ir_way; + dim_t pc_nt = ic_way * jr_way * ir_way; dim_t ic_nt = jr_way * ir_way; dim_t jr_nt = ir_way; dim_t ir_nt = 1; + assert( gl_nt != 0 ); - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); +#ifdef PRINT_THRINFO +printf( " gl jc kc pb ic pa jr ir\n" ); +printf( "xx_nt: %4lu %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_nt, jc_nt, pc_nt, pc_nt, ic_nt, ic_nt, jr_nt, ir_nt ); +printf( "\n" ); +printf( " jc kc pb ic pa jr ir\n" ); +printf( "xx_way: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +jc_way, pc_way, (dim_t)0, ic_way, (dim_t)0, jr_way, ir_way ); +printf( "=================================================\n" ); +#endif - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); + thrinfo_t** paths = bli_malloc_intl( gl_nt * sizeof( thrinfo_t* ) 
); + + thrcomm_t* gl_comm = bli_thrcomm_create( gl_nt ); for( int a = 0; a < jc_way; a++ ) { thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) + for( int b = 0; b < pc_way; b++ ) { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); + thrcomm_t* pc_comm = bli_thrcomm_create( pc_nt ); for( int c = 0; c < ic_way; c++ ) { @@ -230,105 +331,108 @@ thrinfo_t** bli_l3_thrinfo_create_paths for( int e = 0; e < ir_way; e++ ) { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; + //thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); + dim_t ir_comm_id = 0; + dim_t jr_comm_id = e*ir_nt + ir_comm_id; + dim_t ic_comm_id = d*jr_nt + jr_comm_id; + dim_t pc_comm_id = c*ic_nt + ic_comm_id; + dim_t jc_comm_id = b*pc_nt + pc_comm_id; + dim_t gl_comm_id = a*jc_nt + jc_comm_id; - // Macrokernel loops + // macro-kernel loops thrinfo_t* ir_info = bli_l3_thrinfo_create( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, ir_way, e, - NULL, NULL, NULL ); - + NULL ); thrinfo_t* jr_info = bli_l3_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, jr_way, d, - NULL, NULL, ir_info ); - //blk_var_1 - thrinfo_t* pack_ic_in + ir_info ); + // packa + thrinfo_t* pa_info = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - thrinfo_t* pack_ic_out - = - bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - + ic_nt, ic_comm_id, + jr_info ); + // blk_var1 thrinfo_t* ic_info = - bli_l3_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, + bli_l3_thrinfo_create( pc_comm, pc_comm_id, ic_way, c, - pack_ic_out, pack_ic_in, jr_info ); - //blk_var_3 - thrinfo_t* pack_kc_in + pa_info ); + // packb + thrinfo_t* pb_info = - 
bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* pack_kc_out - = - bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info + bli_packm_thrinfo_create( pc_comm, pc_comm_id, + pc_nt, pc_comm_id, + ic_info ); + // blk_var3 + thrinfo_t* pc_info = bli_l3_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info ); - //blk_var_2 - thrinfo_t* pack_jc_in - = - bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* pack_jc_out - = - bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - + pc_way, b, + pb_info ); + // blk_var2 thrinfo_t* jc_info = - bli_l3_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, + bli_l3_thrinfo_create( gl_comm, gl_comm_id, jc_way, a, - pack_jc_out, pack_jc_in, kc_info ); + pc_info ); + + paths[gl_comm_id] = jc_info; + +#ifdef PRINT_THRINFO +{ +dim_t gl_comm_id = bli_thread_ocomm_id( jc_info ); +dim_t jc_comm_id = bli_thread_ocomm_id( pc_info ); +dim_t pc_comm_id = bli_thread_ocomm_id( pb_info ); +dim_t pb_comm_id = bli_thread_ocomm_id( ic_info ); +dim_t ic_comm_id = bli_thread_ocomm_id( pa_info ); +dim_t pa_comm_id = bli_thread_ocomm_id( jr_info ); +dim_t jr_comm_id = bli_thread_ocomm_id( ir_info ); + +dim_t jc_work_id = bli_thread_work_id( jc_info ); +dim_t pc_work_id = bli_thread_work_id( pc_info ); +dim_t pb_work_id = bli_thread_work_id( pb_info ); +dim_t ic_work_id = bli_thread_work_id( ic_info ); +dim_t pa_work_id = bli_thread_work_id( pa_info ); +dim_t jr_work_id = bli_thread_work_id( jr_info ); +dim_t ir_work_id = bli_thread_work_id( ir_info ); + +printf( " gl jc pb kc pa ic jr \n" ); +printf( "comm ids: %4lu %4lu %4lu %4lu %4lu %4lu %4lu\n", +gl_comm_id, jc_comm_id, pc_comm_id, pb_comm_id, ic_comm_id, pa_comm_id, jr_comm_id ); +printf( 
"work ids: %4ld %4ld %4lu %4lu %4ld %4ld %4ld\n", +jc_work_id, pc_work_id, pb_work_id, ic_work_id, pa_work_id, jr_work_id, ir_work_id ); +printf( "-------------------------------------------------\n" ); +} +#endif - paths[global_comm_id] = jc_info; } } } } } +#ifdef PRINT_THRINFO +exit(1); +#endif + return paths; } +#endif void bli_l3_thrinfo_free_paths ( - thrinfo_t** threads, - dim_t num + thrinfo_t** threads ) { + dim_t n_threads = bli_thread_num_threads( threads[0] ); dim_t i; - for ( i = 0; i < num; ++i ) + for ( i = 0; i < n_threads; ++i ) bli_l3_thrinfo_free( threads[i] ); bli_free_intl( threads ); diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 887fc9900..fcf1f507d 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -61,31 +61,25 @@ // thrinfo_t APIs specific to level-3 operations. // +#if 0 thrinfo_t* bli_l3_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ); +#endif void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single @@ -100,15 +94,37 @@ void bli_l3_thrinfo_free // ----------------------------------------------------------------------------- -thrinfo_t** bli_l3_thrinfo_create_paths +void bli_l3_thrinfo_create_root ( - opid_t l3_op, - side_t side + dim_t id, + thrcomm_t* gl_comm, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t** thread ); +void bli_l3_thrinfo_print_paths + ( + thrinfo_t** threads + ); + +// ----------------------------------------------------------------------------- + +#if 0 +thrinfo_t** bli_l3_thrinfo_create_roots + ( + cntx_t* cntx, + cntl_t* cntl + ); + +thrinfo_t** bli_l3_thrinfo_create_full_paths + ( + cntx_t* cntx + ); +#endif + 
void bli_l3_thrinfo_free_paths ( - thrinfo_t** threads, - dim_t num + thrinfo_t** threads ); diff --git a/frame/3/bli_l3_var_oft.h b/frame/3/bli_l3_var_oft.h new file mode 100644 index 000000000..ef48d5e85 --- /dev/null +++ b/frame/3/bli_l3_var_oft.h @@ -0,0 +1,77 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_L3_VAR_OFT_H +#define BLIS_L3_VAR_OFT_H + + +// +// -- Level-3 variant function types ------------------------------------------- +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + gemm_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( gemm ) + + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + trsm_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( trsm ) + + + +#endif + diff --git a/frame/3/bli_l3_voft.h b/frame/3/bli_l3_voft.h new file mode 100644 index 000000000..52210f172 --- /dev/null +++ b/frame/3/bli_l3_voft.h @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_L3_VAR_OFT_H +#define BLIS_L3_VAR_OFT_H + + +// +// -- Level-3 variant function types ------------------------------------------- +// + +#undef GENTDEF +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( gemm ) + + +#define GENTDEF( opname ) \ +\ +typedef void (*PASTECH(opname,_voft)) \ +( \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntx_t* cntx, \ + cntl_t* cntl, \ + thrinfo_t* thread \ +); + +GENTDEF( trsm ) + + + +#endif + diff --git a/frame/cntl/bli_cntl_init.c b/frame/3/gemm/bli_gemm_blk_var1.c similarity index 56% rename from frame/cntl/bli_cntl_init.c rename to frame/3/gemm/bli_gemm_blk_var1.c index b7c53ec65..1a5693d8c 100644 --- a/frame/cntl/bli_cntl_init.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -34,71 +34,62 @@ #include "blis.h" -static bool_t bli_cntl_is_init = FALSE; - -void bli_cntl_init( void ) +void bli_gemm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - // If the API is already initialized, return early. 
- if ( bli_cntl_is_initialized() ) return; + obj_t a1, c1; - // Level-1 - bli_scalv_cntl_init(); - bli_packv_cntl_init(); - bli_unpackv_cntl_init(); + dir_t direct; - // Level-1m - bli_scalm_cntl_init(); - bli_packm_cntl_init(); - bli_unpackm_cntl_init(); + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; - // Level-2 - bli_gemv_cntl_init(); - bli_ger_cntl_init(); - bli_hemv_cntl_init(); - bli_her_cntl_init(); - bli_her2_cntl_init(); - bli_trmv_cntl_init(); - bli_trsv_cntl_init(); + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); - // Level-3 - bli_gemm_cntl_init(); - bli_trsm_cntl_init(); + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_m( a, b, c, cntx ); - // Mark API as initialized. - bli_cntl_is_init = TRUE; -} - -void bli_cntl_finalize( void ) -{ - // Level-1 - bli_scalv_cntl_finalize(); - bli_packv_cntl_finalize(); - bli_unpackv_cntl_finalize(); - - // Level-1m - bli_scalm_cntl_finalize(); - bli_packm_cntl_finalize(); - bli_unpackm_cntl_finalize(); - - // Level-2 - bli_gemv_cntl_finalize(); - bli_ger_cntl_finalize(); - bli_hemv_cntl_finalize(); - bli_her_cntl_finalize(); - bli_her2_cntl_finalize(); - bli_trmv_cntl_finalize(); - bli_trsv_cntl_finalize(); - - // Level-3 - bli_gemm_cntl_finalize(); - bli_trsm_cntl_finalize(); - - // Mark API as uninitialized. - bli_cntl_is_init = FALSE; -} - -bool_t bli_cntl_is_initialized( void ) -{ - return bli_cntl_is_init; + // Determine the current thread's subpartition range. + bli_thread_get_range_mdim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. 
+ bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform gemm subproblem. + bli_gemm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } } diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c deleted file mode 100644 index ee4a6a763..000000000 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_gemm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - //The s is for "lives on the stack" - obj_t b_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack = NULL; - obj_t* b_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing B. - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize objects passed into bli_packm_init for A and C - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack B (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. 
- for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: Use of a (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c new file mode 100644 index 000000000..a65f8a20a --- /dev/null +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_ndim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform gemm subproblem. 
+ bli_gemm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } +} + diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c deleted file mode 100644 index f44951a20..000000000 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_gemm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t a_pack_s; - obj_t b1_pack_s, c1_pack_s; - - obj_t b1, c1; - obj_t* a_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A - bli_obj_init_pack( &a_pack_s ); - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for B and C that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &b1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). - bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_l2r( thread, b, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the n dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: Use of b (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for B1 and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and B1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a_pack, - b1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c new file mode 100644 index 000000000..0148428df --- /dev/null +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. 
+ b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform gemm subproblem. + bli_gemm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); + + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c is a locally aliased obj_t (see _int() function), we + // can simply overwrite the internal beta scalar with BLIS_ONE once + // it has been used in the first iteration. However... + + // Unlike variant 3 of gemm and herk, which reset the internal scalar + // on C at the end of the first iteration so that subsequent iterations + // do not erroneously apply beta more than once, it is important that + // this behavior not be applied to trmm. That is because the order of + // computation is always such that the beta that is passed into the + // macro-kernel must be zero, since the macro-kernel only applies that + // beta to (and thus overwrites) the row-panel of C that corresponds to + // the current block intersecting the diagonal. It turns out that this + // same pattern holds for trmm3 as well--except there, the beta scalar + // is potentially non-zero, but is still applied only to the current + // row-panel of C, and thus beta is applied to all of C exactly once. + // Thus, for neither trmm nor trmm3 should we reset the scalar on C + // after the first iteration. 
+ if ( bli_cntx_get_family( cntx ) != BLIS_TRMM ) + if ( i == 0 ) bli_obj_scalar_reset( c ); + } +} + diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c deleted file mode 100644 index 073760900..000000000 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_gemm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, b1_pack_s; - - obj_t a1, b1; - obj_t* a1_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c_pack = NULL; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - if( bli_thread_am_ochief( thread ) ){ - // Initialize object for packing C - bli_obj_init_pack( &c_pack_s ); - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - // Initialize pack objects for A and B that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ){ - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &b1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. - k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: We call a gemm/hemm/symm-specific function to determine - // the kc blocksize so that we can implement the "nudging" of kc - // to be a multiple of mr or nr, as needed. - b_alg = bli_gemm_determine_kc_f( i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and B1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - - // Initialize objects for packing A1 and B1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem. - bli_gemm_int( &BLIS_ONE, - a1_pack, - b1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); - - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- if( bli_thread_am_ochief( thread ) ) - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 382b82bbd..b3494b174 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -34,140 +34,108 @@ #include "blis.h" -extern scalm_t* scalm_cntl; - -packm_t* gemm_packa_cntl = NULL; -packm_t* gemm_packb_cntl = NULL; - -gemm_t* gemm_cntl_bp_ke = NULL; -gemm_t* gemm_cntl_op_bp = NULL; -gemm_t* gemm_cntl_mm_op = NULL; -gemm_t* gemm_cntl_vl_mm = NULL; - -gemm_t* gemm_cntl = NULL; - -void bli_gemm_cntl_init() +cntl_t* bli_gemm_cntl_create + ( + opid_t family + ) { - // Create control tree objects for packm operations. - gemm_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MR, - BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - gemm_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_KR, - BLIS_NR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); + void* macro_kernel_p = bli_gemm_ker_var2; - // - // Create a control tree for packing A and B, and streaming C. - // + // Change the macro-kernel if the operation family is herk or trmm. + if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; - // Create control tree object for lowest-level block-panel kernel. 
- gemm_cntl_bp_ke - = - bli_gemm_cntl_obj_create( BLIS_UNB_OPT, - BLIS_VARIANT2, - 0, // bszid_t not used by macro-kernel - NULL, NULL, NULL, - NULL, NULL, NULL ); + // Create two nodes for the macro-kernel. + cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_obj_create + ( + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. + ); - // Create control tree object for outer panel (to block-panel) - // problem. - gemm_cntl_op_bp - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - gemm_packa_cntl, - gemm_packb_cntl, - NULL, - gemm_cntl_bp_ke, - NULL ); + cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + gemm_cntl_bu_ke + ); - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates. - gemm_cntl_mm_op - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - gemm_cntl_op_bp, - NULL ); + // Create a node for packing matrix A. + cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_gemm_packa, + bli_packm_blk_var1, + BLIS_MR, + BLIS_KR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + gemm_cntl_bp_bu + ); - // Create control tree object for very large problem via multiple - // general problems. - gemm_cntl_vl_mm - = - bli_gemm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - gemm_cntl_mm_op, - NULL ); + // Create a node for partitioning the m dimension by MC. + cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_obj_create + ( + BLIS_MC, + bli_gemm_blk_var1, + gemm_cntl_packa + ); - // Alias the "master" gemm control tree to a shorter name. - gemm_cntl = gemm_cntl_vl_mm; + // Create a node for packing matrix B. 
+ cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_gemm_packb, + bli_packm_blk_var1, + BLIS_KR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + gemm_cntl_op_bp + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + ( + BLIS_KC, + bli_gemm_blk_var3, + gemm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. + cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + ( + BLIS_NC, + bli_gemm_blk_var2, + gemm_cntl_mm_op + ); + + return gemm_cntl_vl_mm; } -void bli_gemm_cntl_finalize() +void bli_gemm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) { - bli_cntl_obj_free( gemm_packa_cntl ); - bli_cntl_obj_free( gemm_packb_cntl ); - - bli_cntl_obj_free( gemm_cntl_bp_ke ); - bli_cntl_obj_free( gemm_cntl_op_bp ); - bli_cntl_obj_free( gemm_cntl_mm_op ); - bli_cntl_obj_free( gemm_cntl_vl_mm ); + bli_cntl_free( cntl, thread ); } -gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_packm_a, - packm_t* sub_packm_b, - packm_t* sub_packm_c, - gemm_t* sub_gemm, - unpackm_t* sub_unpackm_c ) +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ) { - gemm_t* cntl; - - cntl = ( gemm_t* ) bli_malloc_intl( sizeof(gemm_t) ); - - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bszid = bszid; - cntl->sub_scalm = sub_scalm; - cntl->sub_packm_a = sub_packm_a; - cntl->sub_packm_b = sub_packm_b; - cntl->sub_packm_c = sub_packm_c; - cntl->sub_gemm = sub_gemm; - cntl->sub_unpackm_c = sub_unpackm_c; - - return cntl; + return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h 
index 507a1dd14..5b985327c 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -32,31 +32,23 @@ */ -struct gemm_s -{ - impl_t impl_type; - varnum_t var_num; - bszid_t bszid; - struct scalm_s* sub_scalm; - struct packm_s* sub_packm_a; - struct packm_s* sub_packm_b; - struct packm_s* sub_packm_c; - struct gemm_s* sub_gemm; - struct unpackm_s* sub_unpackm_c; -}; -typedef struct gemm_s gemm_t; +cntl_t* bli_gemm_cntl_create + ( + opid_t family + ); -#define bli_cntl_sub_gemm( cntl ) cntl->sub_gemm +void bli_gemm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); -void bli_gemm_cntl_init( void ); -void bli_gemm_cntl_finalize( void ); -gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_pack_a, - packm_t* sub_pack_b, - packm_t* sub_pack_c, - gemm_t* sub_gemm, - unpackm_t* sub_unpack_c ); +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 47b5573c4..533a6dcaf 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_gemm_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_gemm_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -59,7 +62,7 @@ void bli_gemm_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -70,7 +73,7 @@ void bli_gemm_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, b_local ); @@ -79,22 +82,23 @@ void bli_gemm_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx ); + // Invoke the internal back-end via the thread handler. 
+ bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 0176eef37..9f11f61d4 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -32,11 +32,13 @@ */ -void bli_gemm_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_gemm_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 171f2d6f1..b24f2a25d 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -34,42 +34,22 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[6][3] = -{ - // unblocked optimized unblocked blocked - { NULL, NULL, bli_gemm_blk_var1f }, - { NULL, bli_gemm_ker_var2, bli_gemm_blk_var2f }, - { NULL, NULL, bli_gemm_blk_var3f }, - { NULL, NULL, NULL, }, - { NULL, NULL, NULL }, - { NULL, NULL, NULL } -}; - -void bli_gemm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; - varnum_t n; - impl_t i; - FUNCPTR_T f; - ind_t im; + gemm_voft f; // Check parameters. 
if ( bli_error_checking_is_enabled() ) @@ -82,7 +62,7 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - if( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; @@ -93,32 +73,20 @@ void bli_gemm_int( obj_t* alpha, if ( bli_obj_is_zeros( *a ) || bli_obj_is_zeros( *b ) ) { - if( bli_thread_am_ochief( thread ) ) + // This should never execute. + bli_abort(); + + if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); bli_thread_obarrier( thread ); return; } - // Alias A and B in case we need to update attached scalars. + // Alias A, B, and C in case we need to update attached scalars. bli_obj_alias_to( *a, a_local ); bli_obj_alias_to( *b, b_local ); - - // Alias C in case we need to induce a transposition. bli_obj_alias_to( *c, c_local ); - // If we are about to call a leaf-level implementation, and matrix C - // still needs a transposition, then we must induce one by swapping the - // strides and dimensions. Note that this transposition would normally - // be handled explicitly in the packing of C, but if C is not being - // packed, this is our last chance to handle the transposition. - if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) - { - //if( bli_thread_am_ochief( thread ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); - // } - } - // If alpha is non-unit, typecast and apply it to the scalar attached // to B. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) @@ -133,29 +101,33 @@ void bli_gemm_int( obj_t* alpha, bli_obj_scalar_apply_scalar( beta, &c_local ); } - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); + // Create the next node in the thrinfo_t structure. + bli_thrinfo_grow( cntx, cntl, thread ); - // Index into the variant array to extract the correct function pointer. 
- f = vars[n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Somewhat hackish support for 3m3, 3m2, and 4m1b method implementations. - im = bli_cntx_get_ind_method( cntx ); - - if ( im != BLIS_NAT ) { - if ( im == BLIS_3M3 && f == bli_gemm_blk_var1f ) f = bli_gemm_blk_var4f; - else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var4; - else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm_ker_var3; + ind_t im = bli_cntx_get_ind_method( cntx ); + + if ( im != BLIS_NAT ) + { + if ( im == BLIS_3M3 && f == bli_gemm_packa ) f = bli_gemm3m3_packa; + else if ( im == BLIS_3M2 && f == bli_gemm_ker_var2 ) f = bli_gemm3m2_ker_var2; + else if ( im == BLIS_4M1B && f == bli_gemm_ker_var2 ) f = bli_gemm4mb_ker_var2; + } } // Invoke the variant. - f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/gemm/bli_gemm_int.h b/frame/3/gemm/bli_gemm_int.h index 9177122fd..e8580cf95 100644 --- a/frame/3/gemm/bli_gemm_int.h +++ b/frame/3/gemm/bli_gemm_int.h @@ -32,12 +32,15 @@ */ -void bli_gemm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_gemm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 036876cb6..b44564387 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -56,12 +56,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); -void bli_gemm_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = 
bli_obj_execution_datatype( *c ); @@ -236,7 +239,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/gemm/bli_gemm_packab.c new file mode 100644 index 000000000..c0166c828 --- /dev/null +++ b/frame/3/gemm/bli_gemm_packab.c @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack; + + // Pack matrix A according to the control tree node. + bli_l3_packm + ( + a, + &a_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + +// ----------------------------------------------------------------------------- + +void bli_gemm_packb + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b_pack; + + // Pack matrix B according to the control tree node. + bli_l3_packm + ( + b, + &b_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix B. 
+ bli_gemm_int + ( + &BLIS_ONE, + a, + &b_pack, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 0f7ecdb11..c66587fda 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -46,20 +46,22 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( gemm_blk_var1f ) -GENPROT( gemm_blk_var2f ) -GENPROT( gemm_blk_var3f ) +GENPROT( gemm_blk_var1 ) +GENPROT( gemm_blk_var2 ) +GENPROT( gemm_blk_var3 ) +GENPROT( gemm_packa ) +GENPROT( gemm_packb ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: -GENPROT( gemm_blk_var4f ) // 3m3 -GENPROT( gemm_ker_var3 ) // 4m1b -GENPROT( gemm_ker_var4 ) // 3m2 +GENPROT( gemm3m3_packa ) // 3m3 +GENPROT( gemm4mb_ker_var2 ) // 4m1b +GENPROT( gemm3m2_ker_var2 ) // 3m2 // @@ -90,6 +92,6 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC( gemm_ker_var2 ) // Headers for induced algorithms: -INSERT_GENTPROT_BASIC( gemm_ker_var3 ) // 4m1b -INSERT_GENTPROT_BASIC( gemm_ker_var4 ) // 3m2 +INSERT_GENTPROT_BASIC( gemm4mb_ker_var2 ) // 4m1b +INSERT_GENTPROT_BASIC( gemm3m2_ker_var2 ) // 3m2 diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.c b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c similarity index 95% rename from frame/3/gemm/ind/bli_gemm_ker_var4.c rename to frame/3/gemm/ind/bli_gemm3m2_ker_var2.c index 3d5cd1859..ea8904183 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.c +++ b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c @@ -53,15 +53,18 @@ typedef void (*FUNCPTR_T)( thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var4); +static FUNCPTR_T GENARRAY(ftypes,gemm3m2_ker_var2); -void bli_gemm_ker_var4( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm3m2_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { 
num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -238,7 +241,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ @@ -351,9 +354,9 @@ void PASTEMAC(ch,varname) \ } \ } \ \ -/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var4: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var4: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ +/*PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm3m2_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ } -INSERT_GENTFUNC_BASIC0( gemm_ker_var4 ) +INSERT_GENTFUNC_BASIC0( gemm3m2_ker_var2 ) diff --git a/frame/3/gemm/ind/bli_gemm3m3_packa.c b/frame/3/gemm/ind/bli_gemm3m3_packa.c new file mode 100644 index 000000000..516047213 --- /dev/null +++ b/frame/3/gemm/ind/bli_gemm3m3_packa.c @@ -0,0 +1,142 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm3m3_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack; + + // Make a copy of the context for each stage. + cntx_t cntx_ro = *cntx; + cntx_t cntx_io = *cntx; + cntx_t cntx_rpi = *cntx; + + // ----------------------------------------------------- + + // Initialize the context for the real-only stage. + bli_gemm3m3_cntx_stage( 0, &cntx_ro ); + + // Pack matrix the real-only part of A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_ro, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + // Only apply beta within the first of three subproblems. + bli_obj_scalar_reset( c ); + + // ----------------------------------------------------- + + // Initialize the context for the imag-only stage. 
+ bli_gemm3m3_cntx_stage( 1, &cntx_io ); + + // Pack matrix the imag-only part of A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_io, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + // ----------------------------------------------------- + + // Initialize the context for the real+imag stage. + bli_gemm3m3_cntx_stage( 2, &cntx_rpi ); + + // Pack matrix the real+imag part of A. + bli_l3_packm + ( + a, + &a_pack, + &cntx_rpi, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. + bli_gemm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + +} + diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c similarity index 96% rename from frame/3/gemm/ind/bli_gemm_ker_var3.c rename to frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index 11c684810..d9d714917 100644 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -53,15 +53,18 @@ typedef void (*FUNCPTR_T)( thrinfo_t* thread ); -static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var3); +static FUNCPTR_T GENARRAY(ftypes,gemm4mb_ker_var2); -void bli_gemm_ker_var3( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_gemm4mb_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -238,7 +241,7 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_is_a( is_a, aux ); \ bli_auxinfo_set_is_b( is_b, aux ); \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( 
caucus ); \ @@ -349,5 +352,5 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var3: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ } -INSERT_GENTFUNC_BASIC0( gemm_ker_var3 ) +INSERT_GENTFUNC_BASIC0( gemm4mb_ker_var2 ) diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4f.c b/frame/3/gemm/ind/bli_gemm_blk_var4f.c deleted file mode 100644 index 9308014d0..000000000 --- a/frame/3/gemm/ind/bli_gemm_blk_var4f.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_gemm_blk_var4f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - //The s is for "lives on the stack" - obj_t b_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack = NULL; - obj_t* b_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - // Make a copy of the context for each stage. - cntx_t cntx_ro = *cntx; - cntx_t cntx_io = *cntx; - cntx_t cntx_rpi = *cntx; - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing B. - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by - // chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize objects passed into bli_packm_init for A and C - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack B (if instructed). 
- bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: Use of a (for execution datatype) is intentional! - // This causes the right blocksize to be used if c and a are - // complex and b is real. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - - // Initialize the context for the real-only stage. - bli_gemm3m3_cntx_stage( 0, &cntx_ro ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_ro, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - &cntx_ro, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_ro, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - &cntx_ro, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (real-only). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Only apply beta within the first of three subproblems. - if ( bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack ); - - - // Initialize the context for the imag-only stage. - bli_gemm3m3_cntx_stage( 1, &cntx_io ); - - // Initialize objects for packing A1 and C1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_io, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_io, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (imag-only). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Initialize the context for the real+imag stage. - bli_gemm3m3_cntx_stage( 2, &cntx_rpi ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - &cntx_rpi, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - &cntx_rpi, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform gemm subproblem (real+imag). - bli_gemm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - - // Unpack C1 (if C1 was packed). - // Currently must be done by 1 thread - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ){ - // It doesn't matter which packm cntl node we pass in, as long - // as it is valid, packm_release() will release the mem_t entry - // stored in a1_pack. 
- bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4f.h b/frame/3/gemm/ind/bli_gemm_blk_var4f.h deleted file mode 100644 index 289e76550..000000000 --- a/frame/3/gemm/ind/bli_gemm_blk_var4f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_gemm_blk_var4f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/gemm/ind/bli_gemm_ker_var3.h b/frame/3/gemm/ind/bli_gemm_ker_var3.h deleted file mode 100644 index 042120185..000000000 --- a/frame/3/gemm/ind/bli_gemm_ker_var3.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - - -// -// Prototype object-based interface. -// -void bli_gemm_ker_var3( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( gemm_ker_var3 ) - diff --git a/frame/3/gemm/ind/bli_gemm_ker_var4.h b/frame/3/gemm/ind/bli_gemm_ker_var4.h deleted file mode 100644 index 95268de2a..000000000 --- a/frame/3/gemm/ind/bli_gemm_ker_var4.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_gemm_ker_var4( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( gemm_ker_var4 ) - diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 21bda90da..8bede097b 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_hemm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_hemm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t 
b_local; @@ -60,7 +63,7 @@ void bli_hemm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -71,7 +74,7 @@ void bli_hemm_front( side_t side, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_toggle_conj( a_local ); @@ -86,22 +89,23 @@ void bli_hemm_front( side_t side, bli_obj_swap( a_local, b_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 840b24791..e1d40c80e 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -32,12 +32,14 @@ */ -void bli_hemm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_hemm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 263155de2..7350b5785 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_her2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_her2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t alpha_conj; obj_t c_local; @@ -64,7 +67,7 @@ void bli_her2k_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -91,7 +94,7 @@ void bli_her2k_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); @@ -104,49 +107,38 @@ void bli_her2k_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } -#if 0 - // Invoke the internal back-end. - bli_her2k_int( alpha, - &a_local, - &bh_local, - &alpha_conj, - &b_local, - &ah_local, - beta, - &c_local, - cntl ); -#else + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); + + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx ); // Invoke herk twice, using beta only the first time. - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &bh_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &bh_local, + beta, + &c_local, + cntx, + cntl + ); - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - &alpha_conj, - &b_local, - &ah_local, - &BLIS_ONE, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); - -#endif + bli_l3_thread_decorator + ( + bli_gemm_int, + &alpha_conj, + &b_local, + &ah_local, + &BLIS_ONE, + &c_local, + cntx, + cntl + ); // The Hermitian rank-2k product was computed as A*B'+B*A', even for // the diagonal elements. Mathematically, the imaginary components of @@ -155,6 +147,5 @@ void bli_her2k_front( obj_t* alpha, // non-zero values. To prevent this, we explicitly set those values // to zero before returning. 
bli_setid( &BLIS_ZERO, &c_local ); - } diff --git a/frame/3/her2k/bli_her2k_front.h b/frame/3/her2k/bli_her2k_front.h index 8a699c4c4..6f1246ea6 100644 --- a/frame/3/her2k/bli_her2k_front.h +++ b/frame/3/her2k/bli_her2k_front.h @@ -32,11 +32,13 @@ */ -void bli_her2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_her2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/herk/bli_herk.h b/frame/3/herk/bli_herk.h index 290b8bda3..d9aebc78b 100644 --- a/frame/3/herk/bli_herk.h +++ b/frame/3/herk/bli_herk.h @@ -33,7 +33,6 @@ */ #include "bli_herk_front.h" -#include "bli_herk_int.h" #include "bli_herk_var.h" diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c deleted file mode 100644 index 95bc56f9c..000000000 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var1f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t ah_pack_s; - obj_t a1_pack_s, c1_pack_s; - - obj_t a1, c1; - obj_t* a1_pack; - obj_t* c1_pack; - obj_t* ah_pack; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_herk_prune_unref_mparts_m( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A'. - bli_obj_init_pack( &ah_pack_s ); - bli_packm_init( ah, &ah_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - - // Scale C by beta (if instructed). - // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - ah_pack = bli_thread_obroadcast( thread, &ah_pack_s ); - - // Initialize pack objects that are passed into packm_init() for A and C. - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A' (if instructed). 
- bli_packm_int( ah, ah_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_t2b( thread, c, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - a1_pack, - ah_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. 
- bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( ah_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c deleted file mode 100644 index de7f6c972..000000000 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var2f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t a_pack_s; - obj_t ah1_pack_s, c1_pack_s; - - obj_t ah1, c1; - obj_t* a_pack; - obj_t* ah1_pack; - obj_t* c1_pack; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_herk_prune_unref_mparts_n( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing A - bli_obj_init_pack( &a_pack_s ); - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for C and A' that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &ah1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). - bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_weighted_l2r( thread, c, - bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), - &my_start, &my_end ); - - // Partition along the n dimension. 
- for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1' and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1' and C1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ) ; - - // Pack A1' (if instructed). - bli_packm_int( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ) ; - - // Perform herk subproblem. - bli_herk_int( &BLIS_ONE, - a_pack, - ah1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c deleted file mode 100644 index 7e82ba87f..000000000 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_herk_blk_var3f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, ah1_pack_s; - - obj_t a1, ah1; - obj_t* a1_pack = NULL; - obj_t* ah1_pack = NULL; - obj_t* c_pack = NULL; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - // Prune any zero region that exists along the partitioning dimension. 
- bli_herk_prune_unref_mparts_k( a, ah, c ); - - if( bli_thread_am_ochief( thread ) ) { - // Initialize object for packing C. - bli_obj_init_pack( &c_pack_s ); - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - // Initialize all pack objects that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &ah1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s ); - - // Pack C (if instructed). - bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. - k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, k_trans, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and A1'. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, ah, &ah1 ); - - // Initialize objects for packing A1 and A1'. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &ah1, ah1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform herk subproblem. 
- bli_herk_int( &BLIS_ONE, - a1_pack, - ah1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_gemm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal beta scalar on matrix C is non-zero, we must use it - // only for the first iteration (and then BLIS_ONE for all others). - // And since c_pack is a local obj_t, we can simply overwrite the - // internal beta scalar with BLIS_ONE once it has been used in the - // first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); - - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). - bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - if( bli_thread_am_ochief( thread ) ) { - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - } - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index a4bd5ef0b..7fcd2d356 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_herk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_herk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t ah_local; @@ -60,7 +63,7 @@ void bli_herk_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and C in case we need to apply transformations. 
bli_obj_alias_to( *a, a_local ); @@ -76,7 +79,7 @@ void bli_herk_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_conj( ah_local ); @@ -84,22 +87,24 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( c_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &ah_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &ah_local, + beta, + &c_local, + cntx, + cntl + ); // The Hermitian rank-k product was computed as A*A', even for the // diagonal elements. Mathematically, the imaginary components of @@ -108,6 +113,5 @@ void bli_herk_front( obj_t* alpha, // non-zero values. To prevent this, we explicitly set those values // to zero before returning. 
bli_setid( &BLIS_ZERO, &c_local ); - } diff --git a/frame/3/herk/bli_herk_front.h b/frame/3/herk/bli_herk_front.h index c778399d0..ef9325969 100644 --- a/frame/3/herk/bli_herk_front.h +++ b/frame/3/herk/bli_herk_front.h @@ -32,10 +32,12 @@ */ -void bli_herk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_herk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index 94d6f6a77..c36b6b826 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -57,12 +57,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2); -void bli_herk_l_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_l_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -270,7 +273,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index cc137d989..56da59f1a 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -57,12 +57,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2); -void bli_herk_u_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_u_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ 
-270,7 +273,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ diff --git a/frame/3/herk/bli_herk_var.h b/frame/3/herk/bli_herk_var.h index 03d9b9ff5..a18c9ab49 100644 --- a/frame/3/herk/bli_herk_var.h +++ b/frame/3/herk/bli_herk_var.h @@ -46,16 +46,19 @@ void PASTEMAC0(opname) \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( herk_blk_var1f ) -GENPROT( herk_blk_var2f ) -GENPROT( herk_blk_var3f ) +//GENPROT( herk_blk_var1 ) +//GENPROT( herk_blk_var2 ) +//GENPROT( herk_blk_var3 ) +GENPROT( herk_x_ker_var2 ) GENPROT( herk_l_ker_var2 ) GENPROT( herk_u_ker_var2 ) +//GENPROT( herk_packa ) +//GENPROT( herk_packb ) // diff --git a/frame/3/herk/bli_herk_x_ker_var2.c b/frame/3/herk/bli_herk_x_ker_var2.c new file mode 100644 index 000000000..71a4cc59b --- /dev/null +++ b/frame/3/herk/bli_herk_x_ker_var2.c @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +static gemm_voft vars[2] = +{ + bli_herk_l_ker_var2, bli_herk_u_ker_var2, +}; + +void bli_herk_x_ker_var2 + ( + obj_t* a, + obj_t* ah, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + bool_t uplo; + gemm_voft f; + + // Set a bool based on the uplo field of C's root object. + if ( bli_obj_root_is_lower( *c ) ) uplo = 0; + else uplo = 1; + + // Index into the variant array to extract the correct function pointer. + f = vars[uplo]; + + // Call the macrokernel. 
+ f + ( + a, + ah, + c, + cntx, + cntl, + thread + ); +} + diff --git a/frame/include/bli_mem_macro_defs.h b/frame/3/herk/old/bli_herk_blk_var1.c similarity index 54% rename from frame/include/bli_mem_macro_defs.h rename to frame/3/herk/old/bli_herk_blk_var1.c index 51840b712..59a20e878 100644 --- a/frame/include/bli_mem_macro_defs.h +++ b/frame/3/herk/old/bli_herk_blk_var1.c @@ -32,84 +32,67 @@ */ -#ifndef BLIS_MEM_MACRO_DEFS_H -#define BLIS_MEM_MACRO_DEFS_H +#include "blis.h" +void bli_herk_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, c1; -// Mem entry query + dir_t direct; -#define bli_mem_pblk( mem_p ) \ -\ - ( &((mem_p)->pblk) ) + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; -#define bli_mem_buffer( mem_p ) \ -\ - ( bli_pblk_buf_align( bli_mem_pblk( mem_p ) ) ) + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, b, c ); -#define bli_mem_buf_sys( mem_p ) \ -\ - ( bli_pblk_buf_sys( bli_mem_pblk( mem_p ) ) ) + // Prune any zero region that exists along the partitioning dimension. + bli_herk_prune_unref_mparts_m( a, b, c ); -#define bli_mem_buf_type( mem_p ) \ -\ - ( (mem_p)->buf_type ) + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_mdim + ( + direct, thread, a, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); -#define bli_mem_pool( mem_p ) \ -\ - ( (mem_p)->pool ) + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); -#define bli_mem_size( mem_p ) \ -\ - ( (mem_p)->size ) + // Acquire partitions for A1 and C1. 
+ bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); -#define bli_mem_is_alloc( mem_p ) \ -\ - ( bli_mem_buffer( mem_p ) != NULL ) + // Perform herk subproblem. + bli_herk_int + ( + &BLIS_ONE, + a1, + b, + &BLIS_ONE, + c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); -#define bli_mem_is_unalloc( mem_p ) \ -\ - ( bli_mem_buffer( mem_p ) == NULL ) - - -// Mem entry modification - -#define bli_mem_set_pblk( pblk_p, mem_p ) \ -{ \ - mem_p->pblk = *(pblk_p); \ + bli_thread_ibarrier( thread ); + } } -#define bli_mem_set_buffer( buf0, mem_p ) \ -{ \ - bli_pblk_set_buf_align( buf0, &(mem_p->pblk) ); \ -} - -#define bli_mem_set_buf_sys( buf0, mem_p ) \ -{ \ - bli_pblk_set_buf_sys( buf0, &(mem_p->pblk) ); \ -} - -#define bli_mem_set_buf_type( buf_type0, mem_p ) \ -{ \ - mem_p->buf_type = buf_type0; \ -} - -#define bli_mem_set_pool( pool0, mem_p ) \ -{ \ - mem_p->pool = pool0; \ -} - -#define bli_mem_set_size( size0, mem_p ) \ -{ \ - mem_p->size = size0; \ -} - -#define bli_mem_clear( mem_p ) \ -{ \ - bli_mem_set_buffer( NULL, mem_p ); \ - bli_mem_set_buf_sys( NULL, mem_p ); \ - bli_mem_set_pool( NULL, mem_p ); \ - bli_mem_set_size( 0, mem_p ); \ -} - - -#endif diff --git a/frame/3/herk/old/bli_herk_blk_var1f.h b/frame/3/herk/old/bli_herk_blk_var1f.h deleted file mode 100644 index bd1d8a95f..000000000 --- a/frame/3/herk/old/bli_herk_blk_var1f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_herk_blk_var1f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - diff --git a/frame/3/herk/old/bli_herk_blk_var2.c b/frame/3/herk/old/bli_herk_blk_var2.c new file mode 100644 index 000000000..739ae0341 --- /dev/null +++ b/frame/3/herk/old/bli_herk_blk_var2.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_herk_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_herk_prune_unref_mparts_n( a, ah, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_ndim + ( + direct, thread, b, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the n dimension. 
+ for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform herk subproblem. + bli_herk_int + ( + &BLIS_ONE, + a, + b1, + &BLIS_ONE, + c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/herk/old/bli_herk_blk_var3.c b/frame/3/herk/old/bli_herk_blk_var3.c new file mode 100644 index 000000000..949ab53da --- /dev/null +++ b/frame/3/herk/old/bli_herk_blk_var3.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_herk_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_herk_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_herk_prune_unref_mparts_k( a, b, c ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + // Notice that, unlike with gemm/hemm/symm/trmm/trsm, we do not need + // to call a kc-specific routine. We do not need kc to be a multiple + // of MR or NR since neither A nor B has structure in herk. + b_alg = bli_determine_blocksize( direct, i, k_trans, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform herk subproblem. + bli_herk_int + ( + &BLIS_ONE, + a1, + b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + + // This variant executes multiple rank-k updates. 
Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c is an aliased obj_t (see _int() function), we can + // simply overwrite the internal beta scalar with BLIS_ONE once it + // has been used in the first iteration. + if ( i == 0 && bli_thread_am_ichief( thread ) ) + bli_obj_scalar_reset( c ); + } +} + diff --git a/frame/3/herk/bli_herk_int.c b/frame/3/herk/old/bli_herk_int.c similarity index 67% rename from frame/3/herk/bli_herk_int.c rename to frame/3/herk/old/bli_herk_int.c index 643a46ba4..b7d58940b 100644 --- a/frame/3/herk/bli_herk_int.c +++ b/frame/3/herk/old/bli_herk_int.c @@ -34,51 +34,38 @@ #include "blis.h" -#define FUNCPTR_T herk_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][4][3] = +#if 0 +static gemm_voft vars[4][3] = { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_herk_blk_var1f }, - { NULL, bli_herk_l_ker_var2, bli_herk_blk_var2f }, - { NULL, NULL, bli_herk_blk_var3f }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_herk_blk_var1f }, - { NULL, bli_herk_u_ker_var2, bli_herk_blk_var2f }, - { NULL, NULL, bli_herk_blk_var3f }, - { NULL, NULL, NULL }, - } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_herk_blk_var1 }, + { NULL, bli_herk_x_ker_var2, bli_herk_blk_var2 }, + { NULL, NULL, bli_herk_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_herk_int( obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_herk_int + ( + obj_t* alpha, + obj_t* a, + obj_t* ah, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t ah_local; obj_t c_local; +#if 0 + bool_t uplo; +#endif varnum_t n; impl_t i; - bool_t 
uplo; - FUNCPTR_T f; + gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -91,9 +78,9 @@ void bli_herk_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *ah ) ) { - if( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_obarrier( thread ); return; } @@ -109,43 +96,55 @@ void bli_herk_int( obj_t* alpha, // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. +#if 0 if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } +#endif // If alpha is non-unit, typecast and apply it to the scalar // attached to A'. if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &ah_local ); + bli_obj_scalar_apply_scalar( alpha, &ah_local ); } // If beta is non-unit, typecast and apply it to the scalar // attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } +#if 0 // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c_local ) ) uplo = 0; else uplo = 1; +#endif +#if 0 // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. - f = vars[uplo][n][i]; + f = vars[n][i]; +#endif + + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Invoke the variant. 
- f( &a_local, - &ah_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &ah_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/herk/bli_herk_int.h b/frame/3/herk/old/bli_herk_int.h similarity index 86% rename from frame/3/herk/bli_herk_int.h rename to frame/3/herk/old/bli_herk_int.h index 80442d228..1e649b968 100644 --- a/frame/3/herk/bli_herk_int.h +++ b/frame/3/herk/old/bli_herk_int.h @@ -32,12 +32,14 @@ */ -void bli_herk_int( obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - +void bli_herk_int + ( + obj_t* alpha, + obj_t* a, + obj_t* ah, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); diff --git a/frame/3/herk/old/bli_herk_l_ker_var2.h b/frame/3/herk/old/bli_herk_l_ker_var2.h deleted file mode 100644 index 09656596d..000000000 --- a/frame/3/herk/old/bli_herk_l_ker_var2.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_herk_l_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - herk_thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( herk_l_ker_var2 ) - diff --git a/frame/3/herk/old/bli_herk_thread.c b/frame/3/herk/old/bli_herk_thread.c deleted file mode 100644 index 6bb9d6e98..000000000 --- a/frame/3/herk/old/bli_herk_thread.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_herk_thrinfo_create_paths( void ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - 
packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/3/herk/old/bli_herk_u_ker_var2.h b/frame/3/herk/old/bli_herk_u_ker_var2.h deleted file mode 100644 index 0701db148..000000000 --- a/frame/3/herk/old/bli_herk_u_ker_var2.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_herk_u_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - herk_thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( herk_u_ker_var2 ) - diff --git a/frame/3/old/bli_herk_direct.c b/frame/3/old/bli_herk_direct.c new file mode 100644 index 000000000..729812e84 --- /dev/null +++ b/frame/3/old/bli_herk_direct.c @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +dir_t bli_herk_direct + ( + obj_t* a, + obj_t* ah, + obj_t* c + ) +{ + return BLIS_FWD; +} + diff --git a/frame/cntl/bli_cntl_init.h b/frame/3/old/bli_herk_direct.h similarity index 94% rename from frame/cntl/bli_cntl_init.h rename to frame/3/old/bli_herk_direct.h index a3fdf6279..1f027561c 100644 --- a/frame/cntl/bli_cntl_init.h +++ b/frame/3/old/bli_herk_direct.h @@ -32,6 +32,9 @@ */ -void bli_cntl_init( void ); -void bli_cntl_finalize( void ); -bool_t bli_cntl_is_initialized( void ); +dir_t bli_herk_direct + ( + obj_t* a, + obj_t* ah, + obj_t* c + ); diff --git a/frame/3/trsm/old/bli_trsm_thread.h b/frame/3/old/bli_trmm_direct.c similarity index 78% rename from frame/3/trsm/old/bli_trsm_thread.h rename to frame/3/old/bli_trmm_direct.c index 985b6c4a6..43be1b16a 100644 --- a/frame/3/trsm/old/bli_trsm_thread.h +++ b/frame/3/old/bli_trmm_direct.c @@ -32,11 +32,28 @@ */ -#define bli_thrinfo_sub_self( thread ) thread->sub_l3op -#define bli_thrinfo_sub_opackm( thread ) thread->opackm -#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm +#include "blis.h" -#define trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way ) +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; -//thrinfo_t** bli_trsm_thrinfo_create_paths( bool_t right_sided ); + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_BWD; + else direct 
= BLIS_FWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + + return direct; +} diff --git a/frame/cntl/bli_cntl.c b/frame/3/old/bli_trmm_direct.h similarity index 95% rename from frame/cntl/bli_cntl.c rename to frame/3/old/bli_trmm_direct.h index ffd6120c8..905ba8fc9 100644 --- a/frame/cntl/bli_cntl.c +++ b/frame/3/old/bli_trmm_direct.h @@ -32,9 +32,10 @@ */ -#include "blis.h" +dir_t bli_trmm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ); -void bli_cntl_obj_free( void* cntl ) -{ - bli_free_intl( cntl ); -} diff --git a/frame/3/old/bli_trsm_direct.c b/frame/3/old/bli_trsm_direct.c new file mode 100644 index 000000000..c640705c8 --- /dev/null +++ b/frame/3/old/bli_trsm_direct.c @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ) +{ + dir_t direct; + + if ( bli_obj_root_is_triangular( *a ) ) + { + if ( bli_obj_root_is_lower( *a ) ) direct = BLIS_FWD; + else direct = BLIS_BWD; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + if ( bli_obj_root_is_lower( *b ) ) direct = BLIS_BWD; + else direct = BLIS_FWD; + } + + return direct; +} + diff --git a/frame/3/old/bli_trsm_direct.h b/frame/3/old/bli_trsm_direct.h new file mode 100644 index 000000000..d7e7c206b --- /dev/null +++ b/frame/3/old/bli_trsm_direct.h @@ -0,0 +1,41 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +dir_t bli_trsm_direct + ( + obj_t* a, + obj_t* b, + obj_t* c + ); + diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 79208b699..cd2f3a20e 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_symm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_symm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -60,7 +63,7 @@ void bli_symm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -71,7 +74,7 @@ void bli_symm_front( side_t side, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( b_local ); @@ -85,22 +88,23 @@ void bli_symm_front( side_t side, bli_obj_swap( a_local, b_local ); } - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_gemm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_GEMM, cntx ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 1fb9ec019..6ba9a5aeb 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -32,12 +32,14 @@ */ -void bli_symm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_symm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 2fa47d27a..47ce91795 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -34,13 +34,16 @@ #include "blis.h" -void bli_syr2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_syr2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + 
cntx_t* cntx, + cntl_t* cntl + ) { obj_t c_local; obj_t a_local; @@ -61,7 +64,7 @@ void bli_syr2k_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -80,52 +83,42 @@ void bli_syr2k_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( c_local ); } -#if 0 - // Invoke the internal back-end. - bli_her2k_int( alpha, - &a_local, - &bt_local, - alpha, - &b_local, - &at_local, - beta, - &c_local, - cntl ); -#else + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); + + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx ); + // Invoke herk twice, using beta only the first time. - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &bt_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &b_local, - &at_local, - &BLIS_ONE, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); -#endif + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &bt_local, + beta, + &c_local, + cntx, + cntl + ); + bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &b_local, + &at_local, + &BLIS_ONE, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/syr2k/bli_syr2k_front.h b/frame/3/syr2k/bli_syr2k_front.h index 674dfe5ce..8d227c125 100644 --- a/frame/3/syr2k/bli_syr2k_front.h +++ b/frame/3/syr2k/bli_syr2k_front.h @@ -32,11 +32,13 @@ */ -void bli_syr2k_front( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_syr2k_front + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 54ca2bf8a..f037eb1c1 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_syrk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_syrk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t at_local; @@ -58,7 +61,7 @@ void bli_syrk_front( obj_t* alpha, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and C in case we need to apply transformations. bli_obj_alias_to( *a, a_local ); @@ -73,27 +76,28 @@ void bli_syrk_front( obj_t* alpha, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_induce_trans( c_local ); } - - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_herk_int, - alpha, - &a_local, - &at_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_HERK, cntx ); - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &at_local, + beta, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/syrk/bli_syrk_front.h b/frame/3/syrk/bli_syrk_front.h index c7ab2a7b7..73f58baef 100644 --- a/frame/3/syrk/bli_syrk_front.h +++ b/frame/3/syrk/bli_syrk_front.h @@ -32,10 +32,12 @@ */ -void bli_syrk_front( obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_syrk_front + ( + obj_t* alpha, + obj_t* a, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trmm/bli_trmm.h b/frame/3/trmm/bli_trmm.h index 056fedb50..4eeec84e0 100644 --- a/frame/3/trmm/bli_trmm.h +++ b/frame/3/trmm/bli_trmm.h @@ -33,7 +33,6 @@ */ #include "bli_trmm_front.h" -#include "bli_trmm_int.h" #include "bli_trmm_var.h" diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 9b860405c..c7231c839 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -34,12 +34,15 @@ #include "blis.h" -void bli_trmm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - gemm_t* cntl ) +void bli_trmm_front + ( 
+ side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -58,7 +61,7 @@ void bli_trmm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -104,7 +107,7 @@ void bli_trmm_front( side_t side, // NOTE: We disable the optimization for 1x1 matrices since the concept // of row- vs. column storage breaks down. if ( !bli_obj_is_1x1( c_local ) ) - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); @@ -127,24 +130,24 @@ void bli_trmm_front( side_t side, bli_obj_set_as_root( a_local ); bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRMM, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trmm_int, - alpha, - &a_local, - &b_local, - &BLIS_ZERO, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Record the threading for each level within the context. + bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx ); + // Invoke the internal back-end. 
+ bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &b_local, + &BLIS_ZERO, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index a05284336..7a263fdb1 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -32,10 +32,12 @@ */ -void bli_trmm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - gemm_t* cntl ); - +void bli_trmm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 34928b04d..cc729834b 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); -void bli_trmm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -308,7 +311,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 382d54952..eacf91795 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); -void bli_trmm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + 
thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -315,7 +318,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 72ac03a14..f8b09a3f5 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); -void bli_trmm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -315,7 +318,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 0bae832d3..3fb94c9d6 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); -void bli_trmm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -316,7 +319,7 @@ void PASTEMAC(ch,varname) \ b1 = b_cast; \ c1 = c_cast; \ \ - thrinfo_t* 
ir_thread = bli_thrinfo_sub_self( jr_thread ); \ + thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index e10166401..d3ac2fa34 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -46,17 +46,15 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - gemm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( trmm_blk_var1f ) -//GENPROT( trmm_blk_var1b ) // variant doesn't exist b/c it's not needed -GENPROT( trmm_blk_var2f ) -GENPROT( trmm_blk_var2b ) -GENPROT( trmm_blk_var3f ) -GENPROT( trmm_blk_var3b ) +//GENPROT( trmm_blk_var1 ) +//GENPROT( trmm_blk_var2 ) +//GENPROT( trmm_blk_var3 ) +GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) diff --git a/frame/cntl/bli_cntl.h b/frame/3/trmm/bli_trmm_xx_ker_var2.c similarity index 61% rename from frame/cntl/bli_cntl.h rename to frame/3/trmm/bli_trmm_xx_ker_var2.c index c53270f9b..cbec35678 100644 --- a/frame/cntl/bli_cntl.h +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -32,53 +32,56 @@ */ -#include "bli_cntl_init.h" +#include "blis.h" -typedef enum +static gemm_voft vars[2][2] = { - BLIS_UNBLOCKED = 0, - BLIS_UNB_FUSED = 1, - BLIS_UNB_OPT = 1, - BLIS_BLOCKED = 2 -} impl_t; + { bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 }, + { bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 } +}; -typedef enum +void bli_trmm_xx_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { - BLIS_VARIANT1 = 0, - BLIS_VARIANT2, - BLIS_VARIANT3, - BLIS_VARIANT4, - BLIS_VARIANT5, - BLIS_VARIANT6, - BLIS_VARIANT7, - BLIS_VARIANT8, - BLIS_VARIANT9, -} varnum_t; + bool_t side; + bool_t uplo; + gemm_voft f; + // Set two bools: one based on the implied side parameter (the structure + // of the root object) and 
one based on the uplo field of the triangular + // matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( *a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( *a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( *b ) ) uplo = 0; + else uplo = 1; + } -void bli_cntl_obj_free( void* cntl ); + // Index into the variant array to extract the correct function pointer. + f = vars[side][uplo]; - - -// -- Control tree accessor macros (common to many node types) -- - -#define bli_cntl_impl_type( cntl ) cntl->impl_type -#define bli_cntl_var_num( cntl ) cntl->var_num -#define bli_cntl_bszid( cntl ) cntl->bszid - - - -// -- Control tree query macros -- - -#define bli_cntl_is_noop( cntl ) \ -\ - ( cntl == NULL ) - -#define bli_cntl_is_leaf( cntl ) \ -\ - ( bli_cntl_impl_type( cntl ) != BLIS_BLOCKED ) - -#define bli_cntl_is_blocked( cntl ) \ -\ - ( bli_cntl_impl_type( cntl ) == BLIS_BLOCKED ) + // Call the macrokernel. + f + ( + a, + b, + c, + cntx, + cntl, + thread + ); +} diff --git a/frame/3/trmm/old/bli_trmm_blk_var1.c b/frame/3/trmm/old/bli_trmm_blk_var1.c new file mode 100644 index 000000000..9f2e91d07 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var1.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_m( a, b, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_mdim + ( + direct, thread, a, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. 
+ bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trmm subproblem. + bli_trmm_int + ( + &BLIS_ONE, + a1, + b, + &BLIS_ONE, + c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/old/bli_trmm_blk_var1f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var1f.c rename to frame/3/trmm/old/bli_trmm_blk_var1f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var1f.h b/frame/3/trmm/old/bli_trmm_blk_var1f.h deleted file mode 100644 index e0876af88..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var1f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/old/bli_trmm_blk_var2.c b/frame/3/trmm/old/bli_trmm_blk_var2.c new file mode 100644 index 000000000..df5f58614 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var2.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_n( a, b, c ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_weighted_ndim + ( + direct, thread, b, + bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ), + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trmm subproblem. 
+ bli_trmm_int + ( + &BLIS_ONE, + a, + b1, + &BLIS_ONE, + c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/old/bli_trmm_blk_var2b.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var2b.c rename to frame/3/trmm/old/bli_trmm_blk_var2b.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var2b.h b/frame/3/trmm/old/bli_trmm_blk_var2b.h deleted file mode 100644 index 35f41a9af..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var2b.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var2b( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/old/bli_trmm_blk_var2f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var2f.c rename to frame/3/trmm/old/bli_trmm_blk_var2f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var2f.h b/frame/3/trmm/old/bli_trmm_blk_var2f.h deleted file mode 100644 index 7ed265e42..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var2f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/old/bli_trmm_blk_var3.c b/frame/3/trmm/old/bli_trmm_blk_var3.c new file mode 100644 index 000000000..2957cf153 --- /dev/null +++ b/frame/3/trmm/old/bli_trmm_blk_var3.c @@ -0,0 +1,105 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_trmm_direct( a, b, c ); + + // Prune any zero region that exists along the partitioning dimension. + bli_trmm_prune_unref_mparts_k( a, b, c ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_trmm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform trmm subproblem. 
+ bli_trmm_int + ( + &BLIS_ONE, + a1, + b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_self( thread ) + ); + + bli_thread_ibarrier( thread ); + + // Unlike variant 3 of gemm and herk, which reset the internal scalar + // on C at the end of the first iteration so that subsequent iterations + // do not erroneously apply beta more than once, it is important that + // this behavior not be applied to trmm. That is because the order of + // computation is always such that beta must be zero, since the macro- + // kernel only applies beta to the row-panel of C that corresponds to + // the current block intersecting the diagonal. It turns out that this + // same pattern works for trmm3 as well--by only applying beta to + // the current row-panel of C, beta is applied to all of C exactly + // once. Thus, for neither trmm nor trmm3 should we reset the scalar + // on C after the first iteration. + } +} + diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/old/bli_trmm_blk_var3b.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var3b.c rename to frame/3/trmm/old/bli_trmm_blk_var3b.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var3b.h b/frame/3/trmm/old/bli_trmm_blk_var3b.h deleted file mode 100644 index 4e9113c6a..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var3b.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trmm_blk_var3b( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/old/bli_trmm_blk_var3f.c similarity index 100% rename from frame/3/trmm/bli_trmm_blk_var3f.c rename to frame/3/trmm/old/bli_trmm_blk_var3f.c diff --git a/frame/3/trmm/old/bli_trmm_blk_var3f.h b/frame/3/trmm/old/bli_trmm_blk_var3f.h deleted file mode 100644 index 50d8c6bbb..000000000 --- a/frame/3/trmm/old/bli_trmm_blk_var3f.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -void bli_trmm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trmm/bli_trmm_int.c b/frame/3/trmm/old/bli_trmm_int.c similarity index 64% rename from frame/3/trmm/bli_trmm_int.c rename to frame/3/trmm/old/bli_trmm_int.c index d39722e95..830a22d1f 100644 --- a/frame/3/trmm/bli_trmm_int.c +++ b/frame/3/trmm/old/bli_trmm_int.c @@ -34,73 +34,38 @@ #include "blis.h" -#define FUNCPTR_T trmm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][2][4][3] = +#if 0 +static gemm_voft vars[4][3] = { - // left - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_ll_ker_var2, bli_trmm_blk_var2b }, - { NULL, NULL, bli_trmm_blk_var3b }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_lu_ker_var2, bli_trmm_blk_var2f }, - { NULL, NULL, bli_trmm_blk_var3f }, - { NULL, NULL, NULL }, - } - }, - // right - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_rl_ker_var2, bli_trmm_blk_var2f }, - { NULL, NULL, bli_trmm_blk_var3f }, - { NULL, NULL, NULL }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trmm_blk_var1f }, - { NULL, bli_trmm_ru_ker_var2, bli_trmm_blk_var2b }, - { NULL, NULL, bli_trmm_blk_var3b }, - { NULL, NULL, NULL }, - } - } + // unblocked optimized unblocked blocked + { NULL, NULL, bli_trmm_blk_var1 }, + { NULL, bli_trmm_xx_ker_var2, bli_trmm_blk_var2 }, + { NULL, NULL, bli_trmm_blk_var3 }, + { NULL, NULL, NULL }, }; +#endif -void bli_trmm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ) +void bli_trmm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + 
obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; +#if 0 bool_t side, uplo; +#endif varnum_t n; impl_t i; - FUNCPTR_T f; + gemm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -131,11 +96,13 @@ void bli_trmm_int( obj_t* alpha, // strides and dimensions. Note that this transposition would normally // be handled explicitly in the packing of C, but if C is not being // packed, this is our last chance to handle the transposition. +#if 0 if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { bli_obj_induce_trans( c_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } +#endif // If alpha is non-unit, typecast and apply it to the scalar attached // to B. @@ -151,6 +118,7 @@ void bli_trmm_int( obj_t* alpha, bli_obj_scalar_apply_scalar( beta, &c_local ); } +#if 0 // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). @@ -163,24 +131,32 @@ void bli_trmm_int( obj_t* alpha, else // if ( bli_obj_root_is_triangular( *b ) ) { side = 1; - // Set a bool based on the uplo field of A's root object. if ( bli_obj_root_is_lower( *b ) ) uplo = 0; else uplo = 1; } +#endif +#if 0 // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. - f = vars[side][uplo][n][i]; + f = vars[n][i]; +#endif + + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Invoke the variant. 
- f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_int.h b/frame/3/trmm/old/bli_trmm_int.h similarity index 86% rename from frame/3/trmm/bli_trmm_int.h rename to frame/3/trmm/old/bli_trmm_int.h index e529d02f6..697fc06b5 100644 --- a/frame/3/trmm/bli_trmm_int.h +++ b/frame/3/trmm/old/bli_trmm_int.h @@ -32,11 +32,15 @@ */ -void bli_trmm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); +void bli_trmm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); + diff --git a/frame/3/trmm/old/bli_trmm_ll_ker_var2.h b/frame/3/trmm/old/bli_trmm_ll_ker_var2.h deleted file mode 100644 index 384defe09..000000000 --- a/frame/3/trmm/old/bli_trmm_ll_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_lu_ker_var2.h b/frame/3/trmm/old/bli_trmm_lu_ker_var2.h deleted file mode 100644 index 74a17e6b4..000000000 --- a/frame/3/trmm/old/bli_trmm_lu_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_rl_ker_var2.h b/frame/3/trmm/old/bli_trmm_rl_ker_var2.h deleted file mode 100644 index 64d1128fb..000000000 --- a/frame/3/trmm/old/bli_trmm_rl_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_ru_ker_var2.h b/frame/3/trmm/old/bli_trmm_ru_ker_var2.h deleted file mode 100644 index 3df303b60..000000000 --- a/frame/3/trmm/old/bli_trmm_ru_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trmm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 ) - diff --git a/frame/3/trmm/old/bli_trmm_thread.c b/frame/3/trmm/old/bli_trmm_thread.c deleted file mode 100644 index b17c30dd6..000000000 --- a/frame/3/trmm/old/bli_trmm_thread.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_trmm_thrinfo_create_paths( bool_t jc_dependency ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); - - if ( jc_dependency ) - { - jr_way *= jc_way; - jc_way = 1; - } -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; 
e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/3/trmm3/bli_trmm3_front.c 
b/frame/3/trmm3/bli_trmm3_front.c index 19090bee5..cf97bbcf2 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -34,14 +34,17 @@ #include "blis.h" -void bli_trmm3_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ) +void bli_trmm3_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) { obj_t a_local; obj_t b_local; @@ -60,7 +63,7 @@ void bli_trmm3_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -103,7 +106,7 @@ void bli_trmm3_front( side_t side, // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_nat_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_l3_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( side ); bli_obj_induce_trans( a_local ); @@ -127,22 +130,23 @@ void bli_trmm3_front( side_t side, bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRMM3, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRMM, cntx ); - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trmm_int, - alpha, - &a_local, - &b_local, - beta, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Record the threading for each level within the context. 
+ bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + bli_gemm_int, + alpha, + &a_local, + &b_local, + beta, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index 052d83249..ed158c0b8 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -32,11 +32,14 @@ */ -void bli_trmm3_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl ); +void bli_trmm3_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c new file mode 100644 index 000000000..a731d8265 --- /dev/null +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_blk_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_m( a, b, c, cntx ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_mdim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition along the m dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and C1. + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trsm subproblem. 
+ bli_trsm_int + ( + &BLIS_ONE, + &a1, + b, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } +} + diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c deleted file mode 100644 index b3a73da6e..000000000 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_trsm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) -{ - obj_t b_pack_s; - obj_t a1_pack_s; - - obj_t a1, c1; - obj_t* b_pack = NULL; - obj_t* a1_pack = NULL; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_m( a, b, c ); - - // Initialize object for packing B. - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &b_pack_s ); - bli_packm_init( b, &b_pack_s, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - b_pack = bli_thread_obroadcast( thread, &b_pack_s ); - - // Initialize object for packing B. - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - - // Pack B1 (if instructed). - bli_packm_int( b, b_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_t2b( thread, a, - ( bli_obj_root_is_triangular( *a ) ? - bli_cntx_get_bmult( BLIS_MR, cntx ) : - bli_cntx_get_bmult( BLIS_NR, cntx ) ), - &my_start, &my_end ); - - // Partition along the remaining portion of the m dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, a, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and C1. - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize object for packing A1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trsm subproblem. 
- bli_trsm_int( &BLIS_ONE, - a1_pack, - b_pack, - &BLIS_ONE, - &c1, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) ); - if( bli_thread_am_ichief( thread ) ) - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); -} - diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c new file mode 100644 index 000000000..a133f0bb0 --- /dev/null +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_blk_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b1, c1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_n( a, b, c, cntx ); + + // Determine the current thread's subpartition range. + bli_thread_get_range_ndim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // Partition along the n dimension. + for ( i = my_start; i < my_end; i += b_alg ) + { + // Determine the current algorithmic blocksize. + b_alg = bli_determine_blocksize( direct, i, my_end, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for B1 and C1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, c, &c1 ); + + // Perform trsm subproblem. 
+ bli_trsm_int + ( + &BLIS_ONE, + a, + &b1, + &BLIS_ONE, + &c1, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + } +} + diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c deleted file mode 100644 index 42d65100e..000000000 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_trsm_blk_var2f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) -{ - obj_t a_pack_s; - obj_t b1_pack_s, c1_pack_s; - - obj_t b1, c1; - obj_t* a_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c1_pack = NULL; - - dim_t i; - dim_t b_alg; - - // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_n( a, b, c ); - - // Initialize pack objects for A that are passed into packm_init(). - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &a_pack_s ); - - // Initialize object for packing A. - bli_packm_init( a, &a_pack_s, - cntx, bli_cntl_sub_packm_a( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - a_pack = bli_thread_obroadcast( thread, &a_pack_s ); - - // Initialize pack objects for B and C that are passed into packm_init(). - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &b1_pack_s ); - bli_obj_init_pack( &c1_pack_s ); - } - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s ); - - // Pack A (if instructed). - bli_packm_int( a, a_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - dim_t my_start, my_end; - bli_thread_get_range_l2r( thread, b, - ( bli_obj_root_is_triangular( *b ) ? - bli_cntx_get_bmult( BLIS_MR, cntx ) : - bli_cntx_get_bmult( BLIS_NR, cntx ) ), - &my_start, &my_end ); - - // Partition along the n dimension. - for ( i = my_start; i < my_end; i += b_alg ) - { - // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize_f( i, my_end, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for B1 and C1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, c, &c1 ); - - // Initialize objects for packing A1 and B1. 
- if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_init( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack C1 (if instructed). - bli_packm_int( &c1, c1_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a_pack, - b1_pack, - &BLIS_ONE, - c1_pack, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - bli_thread_ibarrier( thread ); - - // Unpack C1 (if C1 was packed). - bli_unpackm_int( c1_pack, &c1, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - } - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - bli_thread_obarrier( thread ); - if( bli_thread_am_ochief( thread ) ) - bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) ); - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) ); - } -} - diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c new file mode 100644 index 000000000..7b428c8ef --- /dev/null +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -0,0 +1,103 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_blk_var3 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + dir_t direct; + + dim_t i; + dim_t b_alg; + dim_t k_trans; + + // Determine the direction in which to partition (forwards or backwards). + direct = bli_l3_direct( a, b, c, cntx ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_k( a, b, c, cntx ); + + // Query dimension in partitioning direction. + k_trans = bli_obj_width_after_trans( *a ); + + // Partition along the k dimension. + for ( i = 0; i < k_trans; i += b_alg ) + { + // Determine the current algorithmic blocksize. 
+ b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx ); + + // Acquire partitions for A1 and B1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i, b_alg, b, &b1 ); + + // Perform trsm subproblem. + bli_trsm_int + ( + &BLIS_ONE, + &a1, + &b1, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + //bli_thread_ibarrier( thread ); + bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); + + // This variant executes multiple rank-k updates. Therefore, if the + // internal alpha scalars on A/B and C are non-zero, we must ensure + // that they are only used in the first iteration. + if ( i == 0 ) + { + bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); + bli_obj_scalar_reset( c ); + } + } +} + diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c deleted file mode 100644 index 52cfb1fc5..000000000 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -void bli_trsm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) -{ - obj_t c_pack_s; - obj_t a1_pack_s, b1_pack_s; - - obj_t a1, b1; - obj_t* a1_pack = NULL; - obj_t* b1_pack = NULL; - obj_t* c_pack = NULL; - - dim_t i; - dim_t b_alg; - dim_t k_trans; - - // Prune any zero region that exists along the partitioning dimension. - bli_trsm_prune_unref_mparts_k( a, b, c ); - - // Initialize pack objects for C that are passed into packm_init(). - if( bli_thread_am_ochief( thread ) ) { - bli_obj_init_pack( &c_pack_s ); - - // Initialize object for packing C. - bli_packm_init( c, &c_pack_s, - cntx, bli_cntl_sub_packm_c( cntl ) ); - - // Scale C by beta (if instructed). - bli_scalm_int( &BLIS_ONE, - c, - cntx, bli_cntl_sub_scalm( cntl ) ); - } - c_pack = bli_thread_obroadcast( thread, &c_pack_s ); - - if( bli_thread_am_ichief( thread ) ) { - bli_obj_init_pack( &a1_pack_s ); - bli_obj_init_pack( &b1_pack_s ); - } - a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s ); - b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s ); - - // Pack C (if instructed). 
- bli_packm_int( c, c_pack, - cntx, bli_cntl_sub_packm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // Query dimension in partitioning direction. - k_trans = bli_obj_width_after_trans( *a ); - - // Partition along the k dimension. - for ( i = 0; i < k_trans; i += b_alg ) - { - // Determine the current algorithmic blocksize. - // NOTE: We call a trsm-specific function to determine the kc - // blocksize so that we can implement the "nudging" of kc to be - // a multiple of mr, as needed. - b_alg = bli_trsm_determine_kc_f( i, k_trans, a, b, - bli_cntl_bszid( cntl ), cntx ); - - // Acquire partitions for A1 and B1. - bli_acquire_mpart_l2r( BLIS_SUBPART1, - i, b_alg, a, &a1 ); - bli_acquire_mpart_t2b( BLIS_SUBPART1, - i, b_alg, b, &b1 ); - - // Initialize objects for packing A1 and B1. - if( bli_thread_am_ichief( thread ) ) { - bli_packm_init( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_init( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ) ); - } - bli_thread_ibarrier( thread ); - - // Pack A1 (if instructed). - bli_packm_int( &a1, a1_pack, - cntx, bli_cntl_sub_packm_a( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Pack B1 (if instructed). - bli_packm_int( &b1, b1_pack, - cntx, bli_cntl_sub_packm_b( cntl ), - bli_thrinfo_sub_ipackm( thread ) ); - - // Perform trsm subproblem. - bli_trsm_int( &BLIS_ONE, - a1_pack, - b1_pack, - &BLIS_ONE, - c_pack, - cntx, - bli_cntl_sub_trsm( cntl ), - bli_thrinfo_sub_self( thread ) ); - - // This variant executes multiple rank-k updates. Therefore, if the - // internal alpha scalars on A/B and C are non-zero, we must ensure - // that they are only used in the first iteration. - bli_thread_ibarrier( thread ); - if ( i == 0 && bli_thread_am_ichief( thread ) ) { - bli_obj_scalar_reset( a ); - bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( c_pack ); - } - } - - bli_thread_obarrier( thread ); - - // Unpack C (if C was packed). 
- bli_unpackm_int( c_pack, c, - cntx, bli_cntl_sub_unpackm_c( cntl ), - bli_thrinfo_sub_opackm( thread ) ); - - // If any packing buffers were acquired within packm, release them back - // to the memory manager. - if( bli_thread_am_ochief( thread ) ) { - bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) ); - } - if( bli_thread_am_ichief( thread ) ) { - bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) ); - bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) ); - } -} - diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 3a83faafd..78bd5eeb9 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -34,235 +34,191 @@ #include "blis.h" -extern scalm_t* scalm_cntl; - -extern gemm_t* gemm_cntl_bp_ke; - -packm_t* trsm_l_packa_cntl = NULL; -packm_t* trsm_l_packb_cntl = NULL; - -packm_t* trsm_r_packa_cntl = NULL; -packm_t* trsm_r_packb_cntl = NULL; - -trsm_t* trsm_cntl_bp_ke = NULL; - -trsm_t* trsm_l_cntl_op_bp = NULL; -trsm_t* trsm_l_cntl_mm_op = NULL; -trsm_t* trsm_l_cntl_vl_mm = NULL; - -trsm_t* trsm_r_cntl_op_bp = NULL; -trsm_t* trsm_r_cntl_mm_op = NULL; -trsm_t* trsm_r_cntl_vl_mm = NULL; - -trsm_t* trsm_l_cntl = NULL; -trsm_t* trsm_r_cntl = NULL; - - -void bli_trsm_cntl_init() +cntl_t* bli_trsm_cntl_create + ( + side_t side + ) { - - // Create control tree objects for packm operations (left side). - trsm_l_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - // IMPORTANT: n dim multiple must be mr to - // support right and bottom-right edge cases - BLIS_MR, - BLIS_MR, - TRUE, // invert diagonal - TRUE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? 
- BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - trsm_l_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - // IMPORTANT: m dim multiple must be mr since - // B_pack is updated (ie: serves as C) in trsm - BLIS_MR, - BLIS_NR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); - - // Create control tree objects for packm operations (right side). - trsm_r_packa_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_NR, - BLIS_MR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK ); - - trsm_r_packb_cntl - = - bli_packm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, // pack panels of B compactly - BLIS_MR, - BLIS_MR, - TRUE, // invert diagonal - FALSE, // reverse iteration if upper? - TRUE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL ); - - - // Create control tree object for lowest-level block-panel kernel. - trsm_cntl_bp_ke - = - bli_trsm_cntl_obj_create( BLIS_UNB_OPT, - BLIS_VARIANT2, - 0, // bszid_t not used by macro-kernel - NULL, NULL, NULL, NULL, - NULL, NULL, NULL ); - - // Create control tree object for outer panel (to block-panel) - // problem (left side). - trsm_l_cntl_op_bp - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - trsm_l_packa_cntl, - trsm_l_packb_cntl, - NULL, - trsm_cntl_bp_ke, - gemm_cntl_bp_ke, - NULL ); - - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates (left side). - trsm_l_cntl_mm_op - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - trsm_l_cntl_op_bp, - NULL, - NULL ); - - // Create control tree object for very large problem via multiple - // general problems (left side). 
- trsm_l_cntl_vl_mm - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - trsm_l_cntl_mm_op, - NULL, - NULL ); - - // Create control tree object for outer panel (to block-panel) - // problem (right side). - trsm_r_cntl_op_bp - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT1, - BLIS_MC, - NULL, - trsm_r_packa_cntl, - trsm_r_packb_cntl, - NULL, - trsm_cntl_bp_ke, - gemm_cntl_bp_ke, - NULL ); - - // Create control tree object for general problem via multiple - // rank-k (outer panel) updates (right side). - trsm_r_cntl_mm_op - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT3, - BLIS_KC, - NULL, - NULL, - NULL, - NULL, - trsm_r_cntl_op_bp, - NULL, - NULL ); - - // Create control tree object for very large problem via multiple - // general problems (right side). - trsm_r_cntl_vl_mm - = - bli_trsm_cntl_obj_create( BLIS_BLOCKED, - BLIS_VARIANT2, - BLIS_NC, - NULL, - NULL, - NULL, - NULL, - trsm_r_cntl_mm_op, - NULL, - NULL ); - - // Alias the "master" trsm control trees to shorter names. - trsm_l_cntl = trsm_l_cntl_vl_mm; - trsm_r_cntl = trsm_r_cntl_vl_mm; + if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create(); + else return bli_trsm_r_cntl_create(); } -void bli_trsm_cntl_finalize() +cntl_t* bli_trsm_l_cntl_create + ( + void + ) { - bli_cntl_obj_free( trsm_l_packa_cntl ); - bli_cntl_obj_free( trsm_l_packb_cntl ); - bli_cntl_obj_free( trsm_r_packa_cntl ); - bli_cntl_obj_free( trsm_r_packb_cntl ); + void* macro_kernel_p = bli_trsm_xx_ker_var2; - bli_cntl_obj_free( trsm_cntl_bp_ke ); + // Create two nodes for the macro-kernel. + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + ( + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. 
+ ); - bli_cntl_obj_free( trsm_l_cntl_op_bp ); - bli_cntl_obj_free( trsm_l_cntl_mm_op ); - bli_cntl_obj_free( trsm_l_cntl_vl_mm ); - bli_cntl_obj_free( trsm_r_cntl_op_bp ); - bli_cntl_obj_free( trsm_r_cntl_mm_op ); - bli_cntl_obj_free( trsm_r_cntl_vl_mm ); + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + trsm_cntl_bu_ke + ); + + // Create a node for packing matrix A. + cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_trsm_packa, + bli_packm_blk_var1, + BLIS_MR, + BLIS_MR, + TRUE, // do NOT invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + trsm_cntl_bp_bu + ); + + // Create a node for partitioning the m dimension by MC. + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + ( + BLIS_MC, + bli_trsm_blk_var1, + trsm_cntl_packa + ); + + // Create a node for packing matrix B. + cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_trsm_packb, + bli_packm_blk_var1, + BLIS_MR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + trsm_cntl_op_bp + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + ( + BLIS_KC, + bli_trsm_blk_var3, + trsm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. 
+ cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + ( + BLIS_NC, + bli_trsm_blk_var2, + trsm_cntl_mm_op + ); + + return trsm_cntl_vl_mm; } -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_packm_a, - packm_t* sub_packm_b, - packm_t* sub_packm_c, - trsm_t* sub_trsm, - gemm_t* sub_gemm, - unpackm_t* sub_unpackm_c ) +cntl_t* bli_trsm_r_cntl_create + ( + void + ) { - trsm_t* cntl; + void* macro_kernel_p = bli_trsm_xx_ker_var2; - cntl = ( trsm_t* ) bli_malloc_intl( sizeof(trsm_t) ); + // Create two nodes for the macro-kernel. + cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_obj_create + ( + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. + ); - cntl->impl_type = impl_type; - cntl->var_num = var_num; - cntl->bszid = bszid; - cntl->sub_scalm = sub_scalm; - cntl->sub_packm_a = sub_packm_a; - cntl->sub_packm_b = sub_packm_b; - cntl->sub_packm_c = sub_packm_c; - cntl->sub_trsm = sub_trsm; - cntl->sub_gemm = sub_gemm; - cntl->sub_unpackm_c = sub_unpackm_c; + cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + trsm_cntl_bu_ke + ); - return cntl; + // Create a node for packing matrix A. + cntl_t* trsm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_trsm_packa, + bli_packm_blk_var1, + BLIS_NR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + trsm_cntl_bp_bu + ); + + // Create a node for partitioning the m dimension by MC. + cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_obj_create + ( + BLIS_MC, + bli_trsm_blk_var1, + trsm_cntl_packa + ); + + // Create a node for packing matrix B. 
+ cntl_t* trsm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_trsm_packb, + bli_packm_blk_var1, + BLIS_MR, + BLIS_MR, + TRUE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + trsm_cntl_op_bp + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_obj_create + ( + BLIS_KC, + bli_trsm_blk_var3, + trsm_cntl_packb + ); + + // Create a node for partitioning the n dimension by NC. + cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_obj_create + ( + BLIS_NC, + bli_trsm_blk_var2, + trsm_cntl_mm_op + ); + + return trsm_cntl_vl_mm; +} + +void bli_trsm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) +{ + bli_cntl_free( cntl, thread ); +} + +// ----------------------------------------------------------------------------- + +cntl_t* bli_trsm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ) +{ + return bli_cntl_obj_create( bszid, var_func, NULL, sub_node ); } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 651cc8599..6dbe9adce 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -32,33 +32,33 @@ */ -struct trsm_s -{ - impl_t impl_type; - varnum_t var_num; - bszid_t bszid; - struct scalm_s* sub_scalm; - struct packm_s* sub_packm_a; - struct packm_s* sub_packm_b; - struct packm_s* sub_packm_c; - struct trsm_s* sub_trsm; - struct gemm_s* sub_gemm; - struct unpackm_s* sub_unpackm_c; -}; -typedef struct trsm_s trsm_t; +cntl_t* bli_trsm_cntl_create + ( + side_t side + ); -#define bli_cntl_sub_trsm( cntl ) cntl->sub_trsm +cntl_t* bli_trsm_l_cntl_create + ( + void + ); -void bli_trsm_cntl_init( void ); -void bli_trsm_cntl_finalize( void ); -trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bszid, - scalm_t* sub_scalm, - packm_t* sub_pack_a, - packm_t* sub_pack_b, - packm_t* sub_pack_c, - trsm_t* sub_trsm, - 
gemm_t* sub_gemm, - unpackm_t* sub_unpack_c ); +cntl_t* bli_trsm_r_cntl_create + ( + void + ); + +void bli_trsm_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +// ----------------------------------------------------------------------------- + +cntl_t* bli_trsm_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + cntl_t* sub_node + ); diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 53cdbb1c8..95c2d6aab 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -34,15 +34,16 @@ #include "blis.h" -void bli_trsm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl ) +void bli_trsm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ) { - trsm_t* cntl; obj_t a_local; obj_t b_local; obj_t c_local; @@ -60,7 +61,7 @@ void bli_trsm_front( side_t side, // Reinitialize the memory allocator to accommodate the blocksizes // in the current context. - bli_mem_reinit( cntx ); + bli_memsys_reinit( cntx ); // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( *a, a_local ); @@ -115,26 +116,23 @@ void bli_trsm_front( side_t side, bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - // Choose the control tree. - if ( bli_is_left( side ) ) cntl = l_cntl; - else cntl = r_cntl; + // Set the operation family id in the context. + bli_cntx_set_family( BLIS_TRSM, cntx ); - thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_TRSM, side ); - dim_t n_threads = bli_thread_num_threads( infos[0] ); - - // Invoke the internal back-end. - bli_l3_thread_decorator( n_threads, - (l3_int_t) bli_trsm_int, - alpha, - &a_local, - &b_local, - alpha, - &c_local, - (void*) cntx, - (void*) cntl, - (void**) infos ); - - bli_l3_thrinfo_free_paths( infos, n_threads ); + // Record the threading for each level within the context. 
+ bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx ); + // Invoke the internal back-end. + bli_l3_thread_decorator + ( + bli_trsm_int, + alpha, + &a_local, + &b_local, + alpha, + &c_local, + cntx, + cntl + ); } diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index 6ee063797..84feef22f 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -32,11 +32,12 @@ */ -void bli_trsm_front( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - trsm_t* l_cntl, - trsm_t* r_cntl ); - +void bli_trsm_front + ( + side_t side, + obj_t* alpha, + obj_t* a, + obj_t* b, + cntx_t* cntx, + cntl_t* cntl + ); diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index 123ef6585..796af7866 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -34,73 +34,22 @@ #include "blis.h" -#define FUNCPTR_T trsm_fp - -typedef void (*FUNCPTR_T)( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ); - -static FUNCPTR_T vars[2][2][4][3] = -{ - // left - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1f }, - { NULL, bli_trsm_ll_ker_var2, bli_trsm_blk_var2f }, - { NULL, NULL, bli_trsm_blk_var3f }, - { NULL, NULL, NULL, }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1b }, - { NULL, bli_trsm_lu_ker_var2, bli_trsm_blk_var2b }, - { NULL, NULL, bli_trsm_blk_var3b }, - { NULL, NULL, NULL, }, - } - }, - // right - { - // lower - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1b }, - { NULL, bli_trsm_rl_ker_var2, bli_trsm_blk_var2b }, - { NULL, NULL, bli_trsm_blk_var3b }, - { NULL, NULL, NULL, }, - }, - // upper - { - // unblocked optimized unblocked blocked - { NULL, NULL, bli_trsm_blk_var1f }, - { NULL, bli_trsm_ru_ker_var2, bli_trsm_blk_var2f }, - { NULL, NULL, bli_trsm_blk_var3f }, - { NULL, NULL, NULL, }, - } - } -}; - -void bli_trsm_int( 
obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { obj_t a_local; obj_t b_local; obj_t c_local; - bool_t side, uplo; - varnum_t n; - impl_t i; - FUNCPTR_T f; + trsm_voft f; // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -113,9 +62,9 @@ void bli_trsm_int( obj_t* alpha, if ( bli_obj_has_zero_dim( *a ) || bli_obj_has_zero_dim( *b ) ) { - if( bli_thread_am_ochief( thread ) ) - bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + if ( bli_thread_am_ochief( thread ) ) + bli_scalm( beta, c ); + bli_thread_obarrier( thread ); return; } @@ -133,14 +82,14 @@ void bli_trsm_int( obj_t* alpha, // packed, this is our last chance to handle the transposition. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) ) { - bli_obj_induce_trans( c_local ); - bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); + bli_obj_induce_trans( c_local ); + bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local ); } // If beta is non-unit, apply it to the scalar attached to C. if ( !bli_obj_equals( beta, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( beta, &c_local ); + bli_obj_scalar_apply_scalar( beta, &c_local ); } // Set two bools: one based on the implied side parameter (the structure @@ -148,24 +97,15 @@ void bli_trsm_int( obj_t* alpha, // matrix's root object (whether that is matrix A or matrix B). if ( bli_obj_root_is_triangular( *a ) ) { - side = 0; - if ( bli_obj_root_is_lower( *a ) ) uplo = 0; - else uplo = 1; - // If alpha is non-unit, typecast and apply it to the scalar // attached to B (the non-triangular matrix). 
if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) { - bli_obj_scalar_apply_scalar( alpha, &b_local ); + bli_obj_scalar_apply_scalar( alpha, &b_local ); } } else // if ( bli_obj_root_is_triangular( *b ) ) { - side = 1; - // Set a bool based on the uplo field of A's root object. - if ( bli_obj_root_is_lower( *b ) ) uplo = 0; - else uplo = 1; - // If alpha is non-unit, typecast and apply it to the scalar // attached to A (the non-triangular matrix). if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) @@ -174,21 +114,24 @@ void bli_trsm_int( obj_t* alpha, } } - bli_thread_obarrier( thread ); + // FGVZ->TMS: Is this barrier still needed? + bli_thread_obarrier( thread ); - // Extract the variant number and implementation type. - n = bli_cntl_var_num( cntl ); - i = bli_cntl_impl_type( cntl ); + // Create the next node in the thrinfo_t structure. + bli_thrinfo_grow( cntx, cntl, thread ); - // Index into the variant array to extract the correct function pointer. - f = vars[side][uplo][n][i]; + // Extract the function pointer from the current control tree node. + f = bli_cntl_var_func( cntl ); // Invoke the variant. 
- f( &a_local, - &b_local, - &c_local, - cntx, - cntl, - thread ); + f + ( + &a_local, + &b_local, + &c_local, + cntx, + cntl, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_int.h b/frame/3/trsm/bli_trsm_int.h index deecc6565..a147a3298 100644 --- a/frame/3/trsm/bli_trsm_int.h +++ b/frame/3/trsm/bli_trsm_int.h @@ -32,11 +32,15 @@ */ -void bli_trsm_int( obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ); +void bli_trsm_int + ( + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ); + diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index db4668d1f..b7d695318 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); -void bli_trsm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_ll_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_ll_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to B. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of B prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 17041d986..763592644 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); -void bli_trsm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_lu_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_lu_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to B. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of B prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to B (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of B prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *b ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_packab.c b/frame/3/trsm/bli_trsm_packab.c new file mode 100644 index 000000000..3a32ce097 --- /dev/null +++ b/frame/3/trsm/bli_trsm_packab.c @@ -0,0 +1,110 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trsm_packa + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a_pack; + + // Pack matrix A according to the control tree node. + bli_l3_packm + ( + a, + &a_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix A. 
+ bli_trsm_int + ( + &BLIS_ONE, + &a_pack, + b, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + +// ----------------------------------------------------------------------------- + +void bli_trsm_packb + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t b_pack; + + // Pack matrix B according to the control tree node. + bli_l3_packm + ( + b, + &b_pack, + cntx, + cntl, + thread + ); + + // Proceed with execution using packed matrix B. + bli_trsm_int + ( + &BLIS_ONE, + a, + &b_pack, + &BLIS_ONE, + c, + cntx, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); +} + diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 40f3d5511..a18e88939 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); -void bli_trsm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_rl_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_rl_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to A. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of A prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). 
This scalar may be unit, if for example it + // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 6482fa777..f5dad161b 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -55,12 +55,15 @@ typedef void (*FUNCPTR_T)( static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); -void bli_trsm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - trsm_t* cntl, - thrinfo_t* thread ) +void bli_trsm_ru_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) { num_t dt_exec = bli_obj_execution_datatype( *c ); @@ -93,10 +96,11 @@ void bli_trsm_ru_ker_var2( obj_t* a, FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar - // attached to A. This will be the alpha scalar used in the gemmtrsm - // subproblems (ie: the scalar that would be applied to the packed - // copy of A prior to it being updated by the trsm subproblem). This - // scalar may be unit, if for example it was applied during packing. + // attached to A (the non-triangular matrix). This will be the alpha + // scalar used in the gemmtrsm subproblems (ie: the scalar that would + // be applied to the packed copy of A prior to it being updated by + // the trsm subproblem). This scalar may be unit, if for example it + // was applied during packing. 
buf_alpha1 = bli_obj_internal_scalar_buffer( *a ); // Grab the address of the internal scalar buffer for the scalar diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 2a2c0efc8..2ff45fa13 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -46,17 +46,17 @@ void PASTEMAC0(opname) \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ - trsm_t* cntl, \ + cntl_t* cntl, \ thrinfo_t* thread \ ); -GENPROT( trsm_blk_var1f ) -GENPROT( trsm_blk_var1b ) -GENPROT( trsm_blk_var2f ) -GENPROT( trsm_blk_var2b ) -GENPROT( trsm_blk_var3f ) -GENPROT( trsm_blk_var3b ) +GENPROT( trsm_blk_var1 ) +GENPROT( trsm_blk_var2 ) +GENPROT( trsm_blk_var3 ) +GENPROT( trsm_packa ) +GENPROT( trsm_packb ) +GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) diff --git a/frame/1/packv/bli_packv_cntl.h b/frame/3/trsm/bli_trsm_xx_ker_var2.c similarity index 61% rename from frame/1/packv/bli_packv_cntl.h rename to frame/3/trsm/bli_trsm_xx_ker_var2.c index d4682f085..8409432ca 100644 --- a/frame/1/packv/bli_packv_cntl.h +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -32,32 +32,56 @@ */ -struct packv_s +#include "blis.h" + +static trsm_voft vars[2][2] = { - impl_t impl_type; - varnum_t var_num; - bszid_t bmid; - pack_t pack_schema; + { bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 }, + { bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 } }; -typedef struct packv_s packv_t; -#define cntl_bmid( cntl ) cntl->bmid +void bli_trsm_xx_ker_var2 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + bool_t side; + bool_t uplo; + trsm_voft f; -#define bli_cntl_sub_packv( cntl ) cntl->sub_packv -#define bli_cntl_sub_packv_x( cntl ) cntl->sub_packv_x -#define bli_cntl_sub_packv_x1( cntl ) cntl->sub_packv_x1 -#define bli_cntl_sub_packv_y( cntl ) cntl->sub_packv_y -#define bli_cntl_sub_packv_y1( cntl ) cntl->sub_packv_y1 + // Set two bools: one based on the implied side parameter (the 
structure + // of the root object) and one based on the uplo field of the triangular + // matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( *a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( *a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( *b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( *b ) ) uplo = 0; + else uplo = 1; + } -void bli_packv_cntl_init( void ); -void bli_packv_cntl_finalize( void ); -packv_t* bli_packv_cntl_obj_create( impl_t impl_type, - varnum_t var_num, - bszid_t bmid, - pack_t pack_schema ); -void bli_packv_cntl_obj_init( packv_t* cntl, - impl_t impl_type, - varnum_t var_num, - bszid_t bmid, - pack_t pack_schema ); + // Index into the variant array to extract the correct function pointer. + f = vars[side][uplo]; + + // Call the macrokernel. + f + ( + a, + b, + c, + cntx, + cntl, + thread + ); +} diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/old/bli_trsm_blk_var1b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var1b.c rename to frame/3/trsm/old/bli_trsm_blk_var1b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var1b.h b/frame/3/trsm/old/bli_trsm_blk_var1b.h deleted file mode 100644 index 77601bb76..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var1b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var1b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_blk_var1f.h b/frame/3/trsm/old/bli_trsm_blk_var1f.h deleted file mode 100644 index df5a9d3fd..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var1f.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var1f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/old/bli_trsm_blk_var2b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var2b.c rename to frame/3/trsm/old/bli_trsm_blk_var2b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var2b.h b/frame/3/trsm/old/bli_trsm_blk_var2b.h deleted file mode 100644 index d890990e7..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var2b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var2b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/old/bli_trsm_blk_var3b.c similarity index 100% rename from frame/3/trsm/bli_trsm_blk_var3b.c rename to frame/3/trsm/old/bli_trsm_blk_var3b.c diff --git a/frame/3/trsm/old/bli_trsm_blk_var3b.h b/frame/3/trsm/old/bli_trsm_blk_var3b.h deleted file mode 100644 index 5cab7bdcf..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var3b.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var3b( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_blk_var3f.h b/frame/3/trsm/old/bli_trsm_blk_var3f.h deleted file mode 100644 index 2c6fbb214..000000000 --- a/frame/3/trsm/old/bli_trsm_blk_var3f.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -void bli_trsm_blk_var3f( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - diff --git a/frame/3/trsm/old/bli_trsm_cntl.c b/frame/3/trsm/old/bli_trsm_cntl.c new file mode 100644 index 000000000..3a83faafd --- /dev/null +++ b/frame/3/trsm/old/bli_trsm_cntl.c @@ -0,0 +1,268 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +extern scalm_t* scalm_cntl; + +extern gemm_t* gemm_cntl_bp_ke; + +packm_t* trsm_l_packa_cntl = NULL; +packm_t* trsm_l_packb_cntl = NULL; + +packm_t* trsm_r_packa_cntl = NULL; +packm_t* trsm_r_packb_cntl = NULL; + +trsm_t* trsm_cntl_bp_ke = NULL; + +trsm_t* trsm_l_cntl_op_bp = NULL; +trsm_t* trsm_l_cntl_mm_op = NULL; +trsm_t* trsm_l_cntl_vl_mm = NULL; + +trsm_t* trsm_r_cntl_op_bp = NULL; +trsm_t* trsm_r_cntl_mm_op = NULL; +trsm_t* trsm_r_cntl_vl_mm = NULL; + +trsm_t* trsm_l_cntl = NULL; +trsm_t* trsm_r_cntl = NULL; + + +void bli_trsm_cntl_init() +{ + + // Create control tree objects for packm operations (left side). + trsm_l_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + // IMPORTANT: n dim multiple must be mr to + // support right and bottom-right edge cases + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + TRUE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm_l_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + // IMPORTANT: m dim multiple must be mr since + // B_pack is updated (ie: serves as C) in trsm + BLIS_MR, + BLIS_NR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL ); + + // Create control tree objects for packm operations (right side). + trsm_r_packa_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_NR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK ); + + trsm_r_packb_cntl + = + bli_packm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, // pack panels of B compactly + BLIS_MR, + BLIS_MR, + TRUE, // invert diagonal + FALSE, // reverse iteration if upper? + TRUE, // reverse iteration if lower? 
+ BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL ); + + + // Create control tree object for lowest-level block-panel kernel. + trsm_cntl_bp_ke + = + bli_trsm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT2, + 0, // bszid_t not used by macro-kernel + NULL, NULL, NULL, NULL, + NULL, NULL, NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (left side). + trsm_l_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_MC, + NULL, + trsm_l_packa_cntl, + trsm_l_packb_cntl, + NULL, + trsm_cntl_bp_ke, + gemm_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (left side). + trsm_l_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + BLIS_KC, + NULL, + NULL, + NULL, + NULL, + trsm_l_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (left side). + trsm_l_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + BLIS_NC, + NULL, + NULL, + NULL, + NULL, + trsm_l_cntl_mm_op, + NULL, + NULL ); + + // Create control tree object for outer panel (to block-panel) + // problem (right side). + trsm_r_cntl_op_bp + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + BLIS_MC, + NULL, + trsm_r_packa_cntl, + trsm_r_packb_cntl, + NULL, + trsm_cntl_bp_ke, + gemm_cntl_bp_ke, + NULL ); + + // Create control tree object for general problem via multiple + // rank-k (outer panel) updates (right side). + trsm_r_cntl_mm_op + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + BLIS_KC, + NULL, + NULL, + NULL, + NULL, + trsm_r_cntl_op_bp, + NULL, + NULL ); + + // Create control tree object for very large problem via multiple + // general problems (right side). 
+ trsm_r_cntl_vl_mm + = + bli_trsm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + BLIS_NC, + NULL, + NULL, + NULL, + NULL, + trsm_r_cntl_mm_op, + NULL, + NULL ); + + // Alias the "master" trsm control trees to shorter names. + trsm_l_cntl = trsm_l_cntl_vl_mm; + trsm_r_cntl = trsm_r_cntl_vl_mm; +} + +void bli_trsm_cntl_finalize() +{ + bli_cntl_obj_free( trsm_l_packa_cntl ); + bli_cntl_obj_free( trsm_l_packb_cntl ); + bli_cntl_obj_free( trsm_r_packa_cntl ); + bli_cntl_obj_free( trsm_r_packb_cntl ); + + bli_cntl_obj_free( trsm_cntl_bp_ke ); + + bli_cntl_obj_free( trsm_l_cntl_op_bp ); + bli_cntl_obj_free( trsm_l_cntl_mm_op ); + bli_cntl_obj_free( trsm_l_cntl_vl_mm ); + bli_cntl_obj_free( trsm_r_cntl_op_bp ); + bli_cntl_obj_free( trsm_r_cntl_mm_op ); + bli_cntl_obj_free( trsm_r_cntl_vl_mm ); +} + +trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, + varnum_t var_num, + bszid_t bszid, + scalm_t* sub_scalm, + packm_t* sub_packm_a, + packm_t* sub_packm_b, + packm_t* sub_packm_c, + trsm_t* sub_trsm, + gemm_t* sub_gemm, + unpackm_t* sub_unpackm_c ) +{ + trsm_t* cntl; + + cntl = ( trsm_t* ) bli_malloc_intl( sizeof(trsm_t) ); + + cntl->impl_type = impl_type; + cntl->var_num = var_num; + cntl->bszid = bszid; + cntl->sub_scalm = sub_scalm; + cntl->sub_packm_a = sub_packm_a; + cntl->sub_packm_b = sub_packm_b; + cntl->sub_packm_c = sub_packm_c; + cntl->sub_trsm = sub_trsm; + cntl->sub_gemm = sub_gemm; + cntl->sub_unpackm_c = sub_unpackm_c; + + return cntl; +} + diff --git a/frame/1m/unpackm/bli_unpackm_blk_var2.h b/frame/3/trsm/old/bli_trsm_cntl.h similarity index 61% rename from frame/1m/unpackm/bli_unpackm_blk_var2.h rename to frame/3/trsm/old/bli_trsm_cntl.h index 1f783260a..651cc8599 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var2.h +++ b/frame/3/trsm/old/bli_trsm_cntl.h @@ -32,30 +32,33 @@ */ -void bli_unpackm_blk_var2( obj_t* p, - obj_t* c, - cntx_t* cntx, - unpackm_t* cntl ); +struct trsm_s +{ + impl_t impl_type; + varnum_t var_num; + bszid_t bszid; + struct scalm_s* 
sub_scalm; + struct packm_s* sub_packm_a; + struct packm_s* sub_packm_b; + struct packm_s* sub_packm_c; + struct trsm_s* sub_trsm; + struct gemm_s* sub_gemm; + struct unpackm_s* sub_unpackm_c; +}; +typedef struct trsm_s trsm_t; +#define bli_cntl_sub_trsm( cntl ) cntl->sub_trsm -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - trans_t transc, \ - dim_t m, \ - dim_t n, \ - dim_t m_panel, \ - dim_t n_panel, \ - void* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROT_BASIC( unpackm_blk_var2 ) +void bli_trsm_cntl_init( void ); +void bli_trsm_cntl_finalize( void ); +trsm_t* bli_trsm_cntl_obj_create( impl_t impl_type, + varnum_t var_num, + bszid_t bszid, + scalm_t* sub_scalm, + packm_t* sub_pack_a, + packm_t* sub_pack_b, + packm_t* sub_pack_c, + trsm_t* sub_trsm, + gemm_t* sub_gemm, + unpackm_t* sub_unpack_c ); diff --git a/frame/3/trsm/old/bli_trsm_ll_ker_var2.h b/frame/3/trsm/old/bli_trsm_ll_ker_var2.h deleted file mode 100644 index 09812df14..000000000 --- a/frame/3/trsm/old/bli_trsm_ll_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_ll_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_ll_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_lu_ker_var2.h b/frame/3/trsm/old/bli_trsm_lu_ker_var2.h deleted file mode 100644 index aa7c8ed47..000000000 --- a/frame/3/trsm/old/bli_trsm_lu_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_lu_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_lu_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_rl_ker_var2.h b/frame/3/trsm/old/bli_trsm_rl_ker_var2.h deleted file mode 100644 index 0fd7e6bbe..000000000 --- a/frame/3/trsm/old/bli_trsm_rl_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_rl_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. 
-// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_rl_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_ru_ker_var2.h b/frame/3/trsm/old/bli_trsm_ru_ker_var2.h deleted file mode 100644 index a30e20070..000000000 --- a/frame/3/trsm/old/bli_trsm_ru_ker_var2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype object-based interface. -// -void bli_trsm_ru_ker_var2( obj_t* a, - obj_t* b, - obj_t* c, - trsm_t* cntl, - thrinfo_t* thread ); - - -// -// Prototype BLAS-like interfaces. -// -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname)( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - void* gemmtrsm_ukr, \ - cntx_t* cntx, \ - void* gemm_ukr, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC( trsm_ru_ker_var2 ) - diff --git a/frame/3/trsm/old/bli_trsm_thread.c b/frame/3/trsm/old/bli_trsm_thread.c deleted file mode 100644 index 1a9f4ec16..000000000 --- a/frame/3/trsm/old/bli_trsm_thread.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" -#include "assert.h" - -#if 0 -thrinfo_t** bli_trsm_thrinfo_create_paths( bool_t right_sided ) -{ - -#ifdef BLIS_ENABLE_MULTITHREADING - dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" ); -// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" ); - dim_t kc_way = 1; - dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" ); - dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" ); - dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" ); - - if ( right_sided ) - { - ic_way = jc_way * ic_way * jr_way; - - jc_way = 1; - kc_way = 1; - jr_way = 1; - ir_way = 1; - } - else - { - jr_way = ic_way * jr_way * ir_way; - - jc_way = 1; - kc_way = 1; - ic_way = 1; - ir_way = 1; - } -#else - dim_t jc_way = 1; - dim_t kc_way = 1; - dim_t ic_way = 1; - dim_t jr_way = 1; - dim_t ir_way = 1; -#endif - - dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way; - assert( global_num_threads != 0 ); - - dim_t jc_nt = kc_way * ic_way * jr_way * ir_way; - dim_t kc_nt = ic_way * jr_way * ir_way; - dim_t ic_nt = jr_way * ir_way; - dim_t jr_nt = ir_way; - dim_t ir_nt = 1; - - - thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) ); - - thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads ); - for( int a = 0; a < jc_way; a++ ) - { - thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt ); - for( int b = 0; b < kc_way; b++ ) - { - thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt ); - for( int c = 0; c < ic_way; c++ ) - { - thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt ); - for( int d = 0; d < jr_way; d++ ) - { - thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt ); - for( int e = 0; e < ir_way; e++ ) - { - thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt ); - dim_t ir_comm_id = 0; - dim_t jr_comm_id = e*ir_nt + ir_comm_id; - dim_t ic_comm_id = d*jr_nt + jr_comm_id; - dim_t kc_comm_id = c*ic_nt + ic_comm_id; - dim_t jc_comm_id = b*kc_nt + kc_comm_id; - dim_t global_comm_id = a*jc_nt + jc_comm_id; - - // Macrokernel loops - thrinfo_t* ir_info = 
bli_l3_thrinfo_create_node( jr_comm, jr_comm_id, - ir_comm, ir_comm_id, - ir_way, e, - NULL, NULL, NULL); - - thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - jr_way, d, - NULL, NULL, ir_info); - //blk_var_1 - packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); - - packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - ic_way, c, - pack_ic_out, pack_ic_in, jr_info); - //blk_var_3 - packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - jc_comm, jc_comm_id, - jc_nt, jc_comm_id ); - - thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - kc_way, b, - pack_kc_out, pack_kc_in, ic_info); - //blk_var_2 - packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id, - kc_comm, kc_comm_id, - jc_nt, jc_comm_id ); - - packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id, - jc_comm, jc_comm_id, - global_num_threads, global_comm_id ); - - thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id, - jc_comm, jc_comm_id, - jc_way, a, - pack_jc_out, pack_jc_in, kc_info); - - paths[global_comm_id] = jc_info; - } - } - } - } - } - return paths; -} -#endif diff --git a/frame/include/bli_auxinfo_macro_defs.h b/frame/base/bli_auxinfo.h similarity index 100% rename from frame/include/bli_auxinfo_macro_defs.h rename to frame/base/bli_auxinfo.h diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index e7bd0be2a..833dadb42 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -35,10 +35,13 @@ #include "blis.h" -blksz_t* 
bli_blksz_obj_create( dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ) +blksz_t* bli_blksz_obj_create + ( + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ) { blksz_t* b; @@ -53,11 +56,14 @@ blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, return b; } -void bli_blksz_obj_init( blksz_t* b, - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ) +void bli_blksz_obj_init + ( + blksz_t* b, + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ) { b->v[BLIS_FLOAT] = b_s; b->v[BLIS_DOUBLE] = b_d; @@ -69,15 +75,21 @@ void bli_blksz_obj_init( blksz_t* b, b->e[BLIS_DCOMPLEX] = be_z; } -void bli_blksz_obj_free( blksz_t* b ) +void bli_blksz_obj_free + ( + blksz_t* b + ) { bli_free_intl( b ); } // ----------------------------------------------------------------------------- -void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz ) +void bli_blksz_reduce_dt_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ) { dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); @@ -107,11 +119,30 @@ void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, // ----------------------------------------------------------------------------- -dim_t bli_determine_blocksize_f( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ) +dim_t bli_determine_blocksize + ( + dir_t direct, + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + if ( direct == BLIS_FWD ) + return bli_determine_blocksize_f( i, dim, obj, bszid, cntx ); + else + return bli_determine_blocksize_b( i, dim, obj, bszid, cntx ); +} + +dim_t bli_determine_blocksize_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) { num_t dt; blksz_t* bsize; @@ -130,10 +161,39 @@ dim_t 
bli_determine_blocksize_f( dim_t i, return b_use; } -dim_t bli_determine_blocksize_f_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ) +dim_t bli_determine_blocksize_b + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ) +{ + num_t dt; + blksz_t* bsize; + dim_t b_alg, b_max; + dim_t b_use; + + // Extract the execution datatype and use it to query the corresponding + // blocksize and blocksize maximum values from the blksz_t object. + dt = bli_obj_execution_datatype( *obj ); + bsize = bli_cntx_get_blksz( bszid, cntx ); + b_alg = bli_blksz_get_def( dt, bsize ); + b_max = bli_blksz_get_max( dt, bsize ); + + b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); + + return b_use; +} + +dim_t bli_determine_blocksize_f_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ) { dim_t b_now; dim_t dim_left_now; @@ -161,33 +221,13 @@ dim_t bli_determine_blocksize_f_sub( dim_t i, return b_now; } -dim_t bli_determine_blocksize_b( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ) -{ - num_t dt; - blksz_t* bsize; - dim_t b_alg, b_max; - dim_t b_use; - - // Extract the execution datatype and use it to query the corresponding - // blocksize and blocksize maximum values from the blksz_t object. 
- dt = bli_obj_execution_datatype( *obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); - b_alg = bli_blksz_get_def( dt, bsize ); - b_max = bli_blksz_get_max( dt, bsize ); - - b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); - - return b_use; -} - -dim_t bli_determine_blocksize_b_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ) +dim_t bli_determine_blocksize_b_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ) { dim_t b_now; dim_t dim_left_now; diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 11a8cb650..daffb3772 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -97,43 +97,79 @@ // ----------------------------------------------------------------------------- -blksz_t* bli_blksz_obj_create( dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ); +blksz_t* bli_blksz_obj_create + ( + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ); -void bli_blksz_obj_init( blksz_t* b, - dim_t b_s, dim_t be_s, - dim_t b_d, dim_t be_d, - dim_t b_c, dim_t be_c, - dim_t b_z, dim_t be_z ); +void bli_blksz_obj_init + ( + blksz_t* b, + dim_t b_s, dim_t be_s, + dim_t b_d, dim_t be_d, + dim_t b_c, dim_t be_c, + dim_t b_z, dim_t be_z + ); -void bli_blksz_obj_free( blksz_t* b ); +void bli_blksz_obj_free + ( + blksz_t* b + ); // ----------------------------------------------------------------------------- -void bli_blksz_reduce_dt_to( num_t dt_bm, blksz_t* bmult, - num_t dt_bs, blksz_t* blksz ); +void bli_blksz_reduce_dt_to + ( + num_t dt_bm, blksz_t* bmult, + num_t dt_bs, blksz_t* blksz + ); // ----------------------------------------------------------------------------- -dim_t bli_determine_blocksize_f( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ); -dim_t bli_determine_blocksize_f_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ); +dim_t bli_determine_blocksize + ( + dir_t direct, + dim_t i, + 
dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); -dim_t bli_determine_blocksize_b( dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx ); -dim_t bli_determine_blocksize_b_sub( dim_t i, - dim_t dim, - dim_t b_alg, - dim_t b_max ); +dim_t bli_determine_blocksize_f + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_b + ( + dim_t i, + dim_t dim, + obj_t* obj, + bszid_t bszid, + cntx_t* cntx + ); + +dim_t bli_determine_blocksize_f_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ); + +dim_t bli_determine_blocksize_b_sub + ( + dim_t i, + dim_t dim, + dim_t b_alg, + dim_t b_max + ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c new file mode 100644 index 000000000..2b45a5de3 --- /dev/null +++ b/frame/base/bli_cntl.c @@ -0,0 +1,190 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +cntl_t* bli_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + void* params, + cntl_t* sub_node + ) +{ + cntl_t* cntl; + mem_t* pack_mem; + + // Allocate the cntl_t struct. + cntl = bli_malloc_intl( sizeof( cntl_t ) ); + + bli_cntl_set_bszid( bszid, cntl ); + bli_cntl_set_var_func( var_func, cntl ); + bli_cntl_set_params( params, cntl ); + bli_cntl_set_sub_node( sub_node, cntl ); + + // Query the address of the node's packed mem_t entry so we can initialize + // key fields (to NULL or 0). + // NOTE: This initialization is important, since it allows threads to + // discern whether blocks have been acquired from the memory allocator. + pack_mem = bli_cntl_pack_mem( cntl ); + bli_mem_clear( pack_mem ); + + return cntl; +} + +void bli_cntl_obj_free + ( + cntl_t* cntl + ) +{ + bli_free_intl( cntl ); +} + +void bli_cntl_obj_clear + ( + cntl_t* cntl + ) +{ + mem_t* pack_mem; + + // Clear various fields in the control tree. Clearing these fields + // actually is not needed, but we do it for debugging/completeness. + bli_cntl_set_var_func( NULL, cntl ); + bli_cntl_set_params( NULL, cntl ); + bli_cntl_set_sub_node( NULL, cntl ); + + // Clearing these fields is potentially more important if the control + // tree is cached somewhere and reused. 
+ pack_mem = bli_cntl_pack_mem( cntl ); + bli_mem_clear( pack_mem ); +} + +// ----------------------------------------------------------------------------- + +void bli_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Base case: simply return when asked to free NULL nodes. + if ( cntl == NULL ) return; + + cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); + void* cntl_params = bli_cntl_params( cntl ); + mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); + + thrinfo_t* thread_sub_node = bli_thrinfo_sub_node( thread ); + + // Only recurse if the current thrinfo_t node has a child. + if ( thread_sub_node != NULL ) + { + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free( cntl_sub_node, thread_sub_node ); + } + + // Free the current node's params field, if it is non-NULL. + if ( cntl_params != NULL ) + { + bli_free_intl( cntl_params ); + } + + // Release the current node's pack mem_t entry back to the memory + // broker from which it originated, but only if the current thread + // is chief for its group, and only if the mem_t is allocated. + if ( bli_thread_am_ochief( thread ) ) + if ( bli_mem_is_alloc( cntl_pack_mem ) ) + { + bli_membrk_release( cntl_pack_mem ); + } + + // Free the current node. + bli_cntl_obj_free( cntl ); +} + +// ----------------------------------------------------------------------------- + +cntl_t* bli_cntl_copy + ( + cntl_t* cntl + ) +{ + // Make a copy of the current node. Notice that the source node + // should NOT have any allocated/cached mem_t entries, and that + // bli_cntl_obj_create() creates a node with a cleared mem_t + // field. + cntl_t* cntl_copy = bli_cntl_obj_create + ( + bli_cntl_bszid( cntl ), + bli_cntl_var_func( cntl ), + NULL, NULL + ); + + // Check the params field of the existing control tree; if it's non-NULL, + // copy it. 
+ if ( bli_cntl_params( cntl ) != NULL ) + { + // Detect the size of the params struct by reading the first field + // as a uint64_t, and then allocate this many bytes for a new params + // struct. + uint64_t params_size = bli_cntl_params_size( cntl ); + void* params_orig = bli_cntl_params( cntl ); + void* params_copy = bli_malloc_intl( ( size_t )params_size ); + + // Copy the original params struct to the new memory region. + memcpy( params_copy, params_orig, params_size ); + + // Save the address of the new params struct into the new control + // tree node. + bli_cntl_set_params( params_copy, cntl_copy ); + } + + // If the sub-node exists, copy it recursively. + if ( bli_cntl_sub_node( cntl ) != NULL ) + { + cntl_t* sub_node_copy = bli_cntl_copy + ( + bli_cntl_sub_node( cntl ) + ); + + // Save the address of the new sub-node (sub-tree) to the existing + // node. + bli_cntl_set_sub_node( sub_node_copy, cntl_copy ); + } + + // Return the address of the newly created node. + return cntl_copy; +} + diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h new file mode 100644 index 000000000..7b6000bb9 --- /dev/null +++ b/frame/base/bli_cntl.h @@ -0,0 +1,153 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +/* +// -- Control tree node definition -- + +struct cntl_s +{ + // Basic fields (usually required). + bszid_t bszid; + void* var_func; + struct cntl_s* sub_node; + + // Optional fields (needed only by some operations such as packm). + // NOTE: first field of params must be a uint64_t containing the size + // of the struct. + void* params; + + // Internal fields that track "cached" data. 
+ mem_t pack_mem; +}; +typedef struct cntl_s cntl_t; +*/ + + +// -- Control tree prototypes -- + +cntl_t* bli_cntl_obj_create + ( + bszid_t bszid, + void* var_func, + void* params, + cntl_t* sub_node + ); + +void bli_cntl_obj_free + ( + cntl_t* cntl + ); + +void bli_cntl_obj_clear + ( + cntl_t* cntl + ); + +void bli_cntl_free + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +cntl_t* bli_cntl_copy + ( + cntl_t* cntl + ); + +// ----------------------------------------------------------------------------- + +// cntl_t query (fields only) + +#define bli_cntl_bszid( cntl ) \ +\ + ( cntl->bszid ) + +#define bli_cntl_var_func( cntl ) \ +\ + ( cntl->var_func ) + +#define bli_cntl_sub_node( cntl ) \ +\ + ( cntl->sub_node ) + +#define bli_cntl_params( cntl ) \ +\ + ( cntl->params ) + +#define bli_cntl_params_size( cntl ) \ +\ + ( *( ( uint64_t* )(cntl->params) ) ) + +#define bli_cntl_pack_mem( cntl ) \ +\ + ( &(cntl->pack_mem) ) + +// cntl_t query (complex) + +#define bli_cntl_is_leaf( cntl ) \ +\ + ( bli_cntl_sub_node( cntl ) == NULL ) + +#define bli_cntl_does_part( cntl ) \ +\ + ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ) + +// cntl_t modification + +#define bli_cntl_set_bszid( bszid0, cntl ) \ +{ \ + cntl->bszid = bszid0; \ +} + +#define bli_cntl_set_var_func( var_func0, cntl ) \ +{ \ + cntl->var_func = var_func0; \ +} + +#define bli_cntl_set_sub_node( sub_node0, cntl ) \ +{ \ + cntl->sub_node = sub_node0; \ +} + +#define bli_cntl_set_params( params0, cntl ) \ +{ \ + cntl->params = params0; \ +} + +#define bli_cntl_set_pack_mem( pack_mem0, cntl ) \ +{ \ + cntl->pack_mem = *(pack_mem0); \ +} + diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index d06167a07..31e995e1b 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -341,6 +341,37 @@ pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ) } #endif +dim_t bli_cntx_get_num_threads( cntx_t* cntx ) +{ + return bli_cntx_jc_way( cntx ) * + bli_cntx_pc_way( cntx ) * + bli_cntx_ic_way( cntx ) * + bli_cntx_jr_way( 
cntx ) * + bli_cntx_ir_way( cntx ); +} + +dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ) +{ + dim_t n_threads_in = 1; + + for ( ; cntl != NULL; cntl = bli_cntl_sub_node( cntl ) ) + { + bszid_t bszid = bli_cntl_bszid( cntl ); + dim_t cur_way; + + // We assume bszid is in {KR,MR,NR,MC,KC,NC} if it is not + // BLIS_NO_PART. + if ( bszid != BLIS_NO_PART ) + cur_way = bli_cntx_way_for_bszid( bszid, cntx ); + else + cur_way = 1; + + n_threads_in *= cur_way; + } + + return n_threads_in; +} + // ----------------------------------------------------------------------------- #if 1 @@ -663,6 +694,96 @@ void bli_cntx_set_pack_schema_c( pack_t schema_c, bli_cntx_set_schema_c( schema_c, cntx ); } +void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx ) +{ + dim_t jc, pc, ic, jr, ir; + +#ifdef BLIS_ENABLE_MULTITHREADING + jc = bli_env_read_nway( "BLIS_JC_NT" ); + //pc = bli_env_read_nway( "BLIS_KC_NT" ); + pc = 1; + ic = bli_env_read_nway( "BLIS_IC_NT" ); + jr = bli_env_read_nway( "BLIS_JR_NT" ); + ir = bli_env_read_nway( "BLIS_IR_NT" ); +#else + jc = 1; + pc = 1; + ic = 1; + jr = 1; + ir = 1; +#endif + + if ( l3_op == BLIS_TRMM ) + { + // We reconfigure the parallelism from trmm_r due to a dependency in + // the jc loop. 
(NOTE: This dependency does not exist for trmm3 ) + if ( bli_is_right( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + pc, + ic, + jr * jc, + ir, + cntx + ); + } + else // if ( bli_is_left( side ) ) + { + bli_cntx_set_thrloop + ( + jc, + pc, + ic, + jr, + ir, + cntx + ); + } + } + else if ( l3_op == BLIS_TRSM ) + { + if ( bli_is_right( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + 1, + jc * ic * jr, + 1, + 1, + cntx + ); + } + else // if ( bli_is_left( side ) ) + { + bli_cntx_set_thrloop + ( + 1, + 1, + 1, + ic * jr * ir, + 1, + cntx + ); + } + } + else // all other level-3 operations (neither trmm nor trsm) + { + bli_cntx_set_thrloop + ( + jc, + pc, + ic, + jr, + ir, + cntx + ); + } +} + + // ----------------------------------------------------------------------------- bool_t bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, @@ -713,6 +834,36 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + return !bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); +} + +bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + num_t dt = bli_obj_datatype( *obj ); + + // Reference the ukr storage preferences of the corresponding real + // micro-kernel for induced methods. 
+ if ( bli_cntx_get_ind_method( cntx ) != BLIS_NAT ) + dt = bli_obj_datatype_proj_to_real( *obj ); + + const bool_t ukr_prefers_rows + = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); + const bool_t ukr_prefers_cols + = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); + bool_t r_val = FALSE; + + if ( bli_obj_is_row_stored( *obj ) && ukr_prefers_cols ) r_val = TRUE; + else if ( bli_obj_is_col_stored( *obj ) && ukr_prefers_rows ) r_val = TRUE; + + return r_val; +} + // ----------------------------------------------------------------------------- void bli_cntx_print( cntx_t* cntx ) @@ -803,6 +954,12 @@ void bli_cntx_print( cntx_t* cntx ) ); } + { + ind_t family = bli_cntx_get_family( cntx ); + + printf( "oper family : %lu\n", ( guint_t )family ); + } + { ind_t method = bli_cntx_get_ind_method( cntx ); @@ -810,18 +967,3 @@ void bli_cntx_print( cntx_t* cntx ) } } - - - - - - - - - - - - - - - diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 5635ddc88..6aed68111 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,11 +53,15 @@ typedef struct cntx_s func_t packm_ukrs; + opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; pack_t schema_c; + dim_t* thrloop; + + membrk_t* membrk; } cntx_t; */ @@ -100,6 +105,10 @@ typedef struct cntx_s \ (&((cntx)->packm_ukrs) ) +#define bli_cntx_family( cntx ) \ +\ + ( (cntx)->family ) + #define bli_cntx_method( cntx ) \ \ ( (cntx)->method ) @@ -116,66 +125,120 @@ typedef struct cntx_s \ ( (cntx)->schema_c ) +#define bli_cntx_membrk( cntx ) \ +\ + ( (cntx)->membrk ) + +#define bli_cntx_thrloop( cntx ) \ +\ + ( (cntx)->thrloop ) + +#if 1 +#define bli_cntx_jc_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_NC ] ) + +#define bli_cntx_pc_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_KC ] ) + +#define bli_cntx_ic_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_MC ] ) + +#define bli_cntx_jr_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_NR ] ) + +#define bli_cntx_ir_way( cntx ) \ +\ + ( (cntx)->thrloop[ BLIS_MR ] ) +#endif + +#define bli_cntx_way_for_bszid( bszid, cntx ) \ +\ + ( (cntx)->thrloop[ bszid ] ) + // cntx_t modification (fields only) #define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \ { \ - (cntx_p)->blkszs = _blkszs; \ + (cntx_p)->blkszs = _blkszs; \ } #define bli_cntx_set_bmults_buf( _bmults, cntx_p ) \ { \ - (cntx_p)->bmults = _bmults; \ + (cntx_p)->bmults = _bmults; \ } #define bli_cntx_set_l3_vir_ukrs_buf( _l3_vir_ukrs, cntx_p ) \ { \ - (cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \ + (cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \ } #define bli_cntx_set_l3_nat_ukrs_buf( _l3_nat_ukrs, cntx_p ) \ { \ - (cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \ + (cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \ } #define bli_cntx_set_l3_nat_ukrs_prefs_buf( _l3_nat_ukrs_prefs, cntx_p ) \ { \ - (cntx_p)->l3_nat_ukrs_prefs = 
_l3_nat_ukrs_prefs; \ + (cntx_p)->l3_nat_ukrs_prefs = _l3_nat_ukrs_prefs; \ } #define bli_cntx_set_l1f_kers_buf( _l1f_kers, cntx_p ) \ { \ - (cntx_p)->l1f_kers = _l1f_kers; \ + (cntx_p)->l1f_kers = _l1f_kers; \ } #define bli_cntx_set_l1v_kers_buf( _l1v_kers, cntx_p ) \ { \ - (cntx_p)->l1v_kers = _l1v_kers; \ + (cntx_p)->l1v_kers = _l1v_kers; \ } #define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \ { \ - (cntx_p)->packm_ukrs = _packm_ukrs; \ + (cntx_p)->packm_ukrs = _packm_ukrs; \ +} + +#define bli_cntx_set_family( _family, cntx_p ) \ +{ \ + (cntx_p)->family = _family; \ } #define bli_cntx_set_method( _method, cntx_p ) \ { \ - (cntx_p)->method = _method; \ + (cntx_p)->method = _method; \ } #define bli_cntx_set_schema_a( _schema_a, cntx_p ) \ { \ - (cntx_p)->schema_a = _schema_a; \ + (cntx_p)->schema_a = _schema_a; \ } #define bli_cntx_set_schema_b( _schema_b, cntx_p ) \ { \ - (cntx_p)->schema_b = _schema_b; \ + (cntx_p)->schema_b = _schema_b; \ } #define bli_cntx_set_schema_c( _schema_c, cntx_p ) \ { \ - (cntx_p)->schema_c = _schema_c; \ + (cntx_p)->schema_c = _schema_c; \ +} + +#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +{ \ + (cntx_p)->membrk = _membrk; \ +} + +#define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \ +{ \ + (cntx_p)->thrloop[ BLIS_NC ] = jc_; \ + (cntx_p)->thrloop[ BLIS_KC ] = pc_; \ + (cntx_p)->thrloop[ BLIS_MC ] = ic_; \ + (cntx_p)->thrloop[ BLIS_NR ] = jr_; \ + (cntx_p)->thrloop[ BLIS_MR ] = ir_; \ + (cntx_p)->thrloop[ BLIS_KR ] = 1; \ } // cntx_t query (complex) @@ -252,6 +315,10 @@ typedef struct cntx_s (dt), (&(bli_cntx_l3_nat_ukrs_prefs_buf( (cntx) ))[ ukr_id ]) \ ) +#define bli_cntx_get_family( cntx ) \ +\ + bli_cntx_family( cntx ) + #define bli_cntx_get_ind_method( cntx ) \ \ bli_cntx_method( cntx ) @@ -264,6 +331,11 @@ typedef struct cntx_s \ bli_cntx_schema_b( cntx ) +#define bli_cntx_get_membrk( cntx ) \ +\ + bli_cntx_membrk( cntx ) + + // ----------------------------------------------------------------------------- 
@@ -326,6 +398,8 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ); //pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx ); +dim_t bli_cntx_get_num_threads( cntx_t* cntx ); +dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ); // set functions @@ -360,6 +434,9 @@ void bli_cntx_set_pack_schema_b( pack_t schema_b, cntx_t* cntx ); void bli_cntx_set_pack_schema_c( pack_t schema_c, cntx_t* cntx ); +void bli_cntx_set_thrloop_from_env( opid_t l3_op, + side_t side, + cntx_t* cntx ); // other query functions @@ -375,6 +452,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); // print function diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 6ae0f461e..7f3f897d5 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -754,6 +754,9 @@ static func_t bli_gks_l1v_kers[BLIS_NUM_LEVEL1V_KERS] = /* addv */ { { BLIS_SADDV_KERNEL, BLIS_CADDV_KERNEL, BLIS_DADDV_KERNEL, BLIS_ZADDV_KERNEL, } }, +/* amaxv */ { { BLIS_SAMAXV_KERNEL, BLIS_CAMAXV_KERNEL, + BLIS_DAMAXV_KERNEL, BLIS_ZAMAXV_KERNEL, } + }, /* axpbyv */ { { BLIS_SAXPBYV_KERNEL, BLIS_CAXPBYV_KERNEL, BLIS_DAXPBYV_KERNEL, BLIS_ZAXPBYV_KERNEL, } }, @@ -798,6 +801,9 @@ static func_t bli_gks_l1v_ref_kers[BLIS_NUM_LEVEL1V_KERS] = /* addv */ { { BLIS_SADDV_KERNEL_REF, BLIS_CADDV_KERNEL_REF, BLIS_DADDV_KERNEL_REF, BLIS_ZADDV_KERNEL_REF, } }, +/* amaxv */ { { BLIS_SAMAXV_KERNEL_REF, BLIS_CAMAXV_KERNEL_REF, + BLIS_DAMAXV_KERNEL_REF, BLIS_ZAMAXV_KERNEL_REF, } + }, /* axpbyv */ { { BLIS_SAXPBYV_KERNEL_REF, BLIS_CAXPBYV_KERNEL_REF, BLIS_DAXPBYV_KERNEL_REF, BLIS_ZAXPBYV_KERNEL_REF, } }, diff --git a/frame/base/bli_info.c 
b/frame/base/bli_info.c index fede4f823..4c63b604d 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -114,9 +114,9 @@ char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) // -- Memory pool-related ------------------------------------------------------ -gint_t bli_info_get_mk_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_A_BLOCK ); } -gint_t bli_info_get_kn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_B_PANEL ); } -gint_t bli_info_get_mn_pool_size( void ) { return bli_mem_pool_size( BLIS_BUFFER_FOR_C_PANEL ); } +gint_t bli_info_get_mk_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_A_BLOCK ); } +gint_t bli_info_get_kn_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_B_PANEL ); } +gint_t bli_info_get_mn_pool_size( void ) { return bli_membrk_pool_size( bli_memsys_global_membrk(), BLIS_BUFFER_FOR_C_PANEL ); } diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index 6e793fa40..db598cede 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -83,17 +83,14 @@ err_t bli_init( void ) { // Initialize various sub-APIs. bli_const_init(); - bli_cntl_init(); bli_error_init(); - bli_mem_init(); + bli_memsys_init(); bli_ind_init(); bli_thread_init(); // After initialization is complete, mark BLIS as initialized. bli_is_init = TRUE; - //bli_mem_init(); - // Only the thread that actually performs the initialization will // return "success". r_val = BLIS_SUCCESS; @@ -150,9 +147,8 @@ err_t bli_finalize( void ) { // Finalize various sub-APIs. 
bli_const_finalize(); - bli_cntl_finalize(); bli_error_finalize(); - bli_mem_finalize(); + bli_memsys_finalize(); bli_ind_finalize(); bli_thread_finalize(); diff --git a/frame/base/bli_malloc.c b/frame/base/bli_malloc.c index 191db4834..3a36378ae 100644 --- a/frame/base/bli_malloc.c +++ b/frame/base/bli_malloc.c @@ -145,6 +145,10 @@ void bli_free_align int8_t* p_byte; void** p_addr; + // If the pointer to free is NULL, it was obviously not aligned and + // does not need to be freed. + if ( p == NULL ) return; + // Since the bli_malloc_pool() function returned the aligned pointer, // we have to first recover the original pointer before we can free // the memory. diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index 8d6d71501..82bd88afb 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,37 +33,94 @@ */ -void bli_mem_init( void ); -void bli_mem_reinit( cntx_t* cntx ); -void bli_mem_finalize( void ); -bool_t bli_mem_is_initialized( void ); +#ifndef BLIS_MEM_H +#define BLIS_MEM_H -// ----------------------------------------------------------------------------- -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ); +// Mem entry query -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ); +#define bli_mem_pblk( mem_p ) \ +\ + ( &((mem_p)->pblk) ) -void bli_mem_release( mem_t* mem ); +#define bli_mem_buffer( mem_p ) \ +\ + ( bli_pblk_buf_align( bli_mem_pblk( mem_p ) ) ) -siz_t bli_mem_pool_size( packbuf_t buf_type ); +#define bli_mem_buf_sys( mem_p ) \ +\ + ( bli_pblk_buf_sys( bli_mem_pblk( mem_p ) ) ) -// ----------------------------------------------------------------------------- +#define bli_mem_buf_type( mem_p ) \ +\ + ( (mem_p)->buf_type 
) -void bli_mem_init_pools( cntx_t* cntx ); -void bli_mem_reinit_pools( cntx_t* cntx ); -void bli_mem_finalize_pools( void ); +#define bli_mem_pool( mem_p ) \ +\ + ( (mem_p)->pool ) -void bli_mem_compute_pool_block_sizes( siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ); -void bli_mem_compute_pool_block_sizes_dt( num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ); +#define bli_mem_membrk( mem_p ) \ +\ + ( (mem_p)->membrk ) +#define bli_mem_size( mem_p ) \ +\ + ( (mem_p)->size ) + +#define bli_mem_is_alloc( mem_p ) \ +\ + ( bli_mem_buffer( mem_p ) != NULL ) + +#define bli_mem_is_unalloc( mem_p ) \ +\ + ( bli_mem_buffer( mem_p ) == NULL ) + + +// Mem entry modification + +#define bli_mem_set_pblk( pblk_p, mem_p ) \ +{ \ + mem_p->pblk = *(pblk_p); \ +} + +#define bli_mem_set_buffer( buf0, mem_p ) \ +{ \ + bli_pblk_set_buf_align( buf0, &(mem_p->pblk) ); \ +} + +#define bli_mem_set_buf_sys( buf0, mem_p ) \ +{ \ + bli_pblk_set_buf_sys( buf0, &(mem_p->pblk) ); \ +} + +#define bli_mem_set_buf_type( buf_type0, mem_p ) \ +{ \ + (mem_p)->buf_type = buf_type0; \ +} + +#define bli_mem_set_pool( pool0, mem_p ) \ +{ \ + (mem_p)->pool = pool0; \ +} + +#define bli_mem_set_membrk( membrk0, mem_p ) \ +{ \ + (mem_p)->membrk = membrk0; \ +} + +#define bli_mem_set_size( size0, mem_p ) \ +{ \ + mem_p->size = size0; \ +} + +#define bli_mem_clear( mem_p ) \ +{ \ + bli_mem_set_buffer( NULL, mem_p ); \ + bli_mem_set_buf_sys( NULL, mem_p ); \ + bli_mem_set_pool( NULL, mem_p ); \ + bli_mem_set_size( 0, mem_p ); \ + bli_mem_set_membrk( NULL, mem_p ); \ +} + + +#endif diff --git a/frame/base/bli_mem.c b/frame/base/bli_membrk.c similarity index 69% rename from frame/base/bli_mem.c rename to frame/base/bli_membrk.c index 25530b1ed..33a998de1 100644 --- a/frame/base/bli_mem.c +++ b/frame/base/bli_membrk.c @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -34,21 +35,34 @@ #include "blis.h" -#ifdef BLIS_ENABLE_PTHREADS -pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; -#endif +void bli_membrk_init + ( + cntx_t* cntx, + membrk_t* membrk + ) +{ + bli_mutex_init( bli_membrk_mutex( membrk ) ); + bli_membrk_init_pools( cntx, membrk ); + bli_membrk_set_malloc_fp( bli_malloc_pool, membrk ); +} -// Declare one memory pool structure for each block size/shape we want to -// be able to allocate. -static pool_t pools[3]; +void bli_membrk_finalize + ( + membrk_t* membrk + ) +{ + bli_membrk_set_malloc_fp( NULL, membrk ); + bli_membrk_finalize_pools( membrk ); + bli_mutex_finalize( bli_membrk_mutex( membrk ) ); +} - - -// ----------------------------------------------------------------------------- - -void bli_mem_acquire_m( siz_t req_size, - packbuf_t buf_type, - mem_t* mem ) +void bli_membrk_acquire_m + ( + membrk_t* membrk, + siz_t req_size, + packbuf_t buf_type, + mem_t* mem + ) { pool_t* pool; pblk_t* pblk; @@ -56,24 +70,28 @@ void bli_mem_acquire_m( siz_t req_size, siz_t block_size; // Make sure the API is initialized. - bli_mem_init(); + //assert( membrk ); //?? if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { // For general-use buffer requests, such as those used by level-2 // operations, dynamically allocating memory is sufficient. - void* buf_sys = bli_malloc_pool( req_size ); + // Note that we use the malloc()-style memory allocation function + // that is stored in the membrk_t object. + void* buf_sys = bli_membrk_malloc( req_size, membrk ); // Initialize the mem_t object with: // - the address of the memory block, - // - the buffer type (a packbuf_t value), and - // - the size of the requested region. 
+ // - the buffer type (a packbuf_t value), + // - the size of the requested region, + // - the membrk_t from which the mem_t entry was acquired. // NOTE: We do not initialize the pool field since this block did not // come from a memory pool. bli_mem_set_buffer( buf_sys, mem ); bli_mem_set_buf_sys( buf_sys, mem ); bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_size( req_size, mem ); + bli_mem_set_membrk( membrk, mem ); } else { @@ -84,7 +102,7 @@ void bli_mem_acquire_m( siz_t req_size, // Map the requested packed buffer type to a zero-based index, which // we then use to select the corresponding memory pool. pi = bli_packbuf_index( buf_type ); - pool = &pools[ pi ]; + pool = bli_membrk_pool( pi, membrk ); // Unconditionally perform error checking on the memory pool. { @@ -100,13 +118,8 @@ void bli_mem_acquire_m( siz_t req_size, // Extract the address of the pblk_t struct within the mem_t. pblk = bli_mem_pblk( mem ); -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif // BEGIN CRITICAL SECTION + bli_membrk_lock( membrk ); { // Checkout a block from the pool. If the pool is exhausted, @@ -125,36 +138,39 @@ void bli_mem_acquire_m( siz_t req_size, block_size = bli_pool_block_size( pool ); } + bli_membrk_unlock( membrk ); // END CRITICAL SECTION -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - // Initialize the mem_t object with: // - the buffer type (a packbuf_t value), - // - the address of the memory pool to which it belongs, and + // - the address of the memory pool to which it belongs, // - the size of the contiguous memory block (NOT the size of the - // requested region). + // requested region), + // - the membrk_t from which the mem_t entry was acquired. 
// The actual addresses (system and aligned) are already stored in // the mem_t struct's pblk_t field bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_pool( pool, mem ); bli_mem_set_size( block_size, mem ); + bli_mem_set_membrk( membrk, mem ); } } -void bli_mem_release( mem_t* mem ) +void bli_membrk_release + ( + mem_t* mem + ) { packbuf_t buf_type; pool_t* pool; pblk_t* pblk; siz_t block_size_cur; siz_t block_size_prev; + membrk_t* membrk; - // Make sure the API is initialized. - bli_mem_init(); + // Extract the membrk_t address from the mem_t object. + membrk = bli_mem_membrk( mem ); // Extract the buffer type so we know what kind of memory was allocated. buf_type = bli_mem_buf_type( mem ); @@ -165,7 +181,9 @@ void bli_mem_release( mem_t* mem ) // For general-use buffers, we dynamically allocate memory, and so // here we need to free. - bli_free_pool( buf_sys ); + // Note that we use the free()-style memory release function that + // is stored in the membrk_t object. + bli_membrk_free( buf_sys, membrk ); } else { @@ -181,14 +199,8 @@ void bli_mem_release( mem_t* mem ) // section.) block_size_prev = bli_mem_size( mem ); -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - // BEGIN CRITICAL SECTION + bli_membrk_lock( membrk ); { // Query the size of the blocks currently in the pool. @@ -213,33 +225,40 @@ void bli_mem_release( mem_t* mem ) } } + bli_membrk_unlock( membrk ); // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif } // Clear the mem_t object so that it appears unallocated. This clears: // - the pblk_t struct's fields (ie: the buffer addresses) // - the pool field // - the size field + // - the membrk field // NOTE: We do not clear the buf_type field since there is no // "uninitialized" value for packbuf_t. 
bli_mem_clear( mem ); } -void bli_mem_acquire_v( siz_t req_size, - mem_t* mem ) +void bli_membrk_acquire_v + ( + membrk_t* membrk, + siz_t req_size, + mem_t* mem + ) { - bli_mem_acquire_m( req_size, - BLIS_BUFFER_FOR_GEN_USE, - mem ); + bli_membrk_acquire_m( membrk, + req_size, + BLIS_BUFFER_FOR_GEN_USE, + mem ); } -siz_t bli_mem_pool_size( packbuf_t buf_type ) +siz_t bli_membrk_pool_size + ( + membrk_t* membrk, + packbuf_t buf_type + ) { siz_t r_val; @@ -251,15 +270,15 @@ siz_t bli_mem_pool_size( packbuf_t buf_type ) } else { - dim_t index; + dim_t pool_index; pool_t* pool; // Acquire the pointer to the pool corresponding to the buf_type // provided. - index = bli_packbuf_index( buf_type ); - pool = &(pools[index]); + pool_index = bli_packbuf_index( buf_type ); + pool = bli_membrk_pool( pool_index, membrk ); - // Compute the pool "size" as the product of the block size + // Compute the pool "size" as the product of the block size // and the number of blocks in the pool. r_val = bli_pool_block_size( pool ) * bli_pool_num_blocks( pool ); @@ -270,131 +289,11 @@ siz_t bli_mem_pool_size( packbuf_t buf_type ) // ----------------------------------------------------------------------------- -static bool_t bli_mem_is_init = FALSE; - -void bli_mem_init( void ) -{ - cntx_t cntx; - - // If the initialization flag is TRUE, we know the API is already - // initialized, so we can return early. - if ( bli_mem_is_init == TRUE ) return; - - // Create and initialize a context for gemm so we have something - // to pass into bli_mem_init_pools(). - bli_gemm_cntx_init( &cntx ); - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // Here, we test the initialization flag again. NOTE: THIS IS NOT - // REDUNDANT. 
This additional test is needed so that other threads - // that may be waiting to acquire the lock do not perform any - // initialization actions once they are finally allowed into this - // critical section. - if ( bli_mem_is_init == FALSE ) - { - // Initialize the memory pools. - bli_mem_init_pools( &cntx ); - - // After initialization, mark the API as initialized. - bli_mem_is_init = TRUE; - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif - - // Finalize the temporary gemm context. - bli_gemm_cntx_finalize( &cntx ); -} - -void bli_mem_reinit( cntx_t* cntx ) -{ -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // If for some reason the memory pools have not yet been - // initialized (unlikely), we emulate the body of bli_mem_init(). - if ( bli_mem_is_init == FALSE ) - { - // Initialize the memory pools. - bli_mem_init_pools( cntx ); - - // After initialization, mark the API as initialized. - bli_mem_is_init = TRUE; - } - else - { - // Reinitialize the memory pools. - bli_mem_reinit_pools( cntx ); - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif -} - -void bli_mem_finalize( void ) -{ - // If the initialization flag is FALSE, we know the API is already - // uninitialized, so we can return early. - if ( bli_mem_is_init == FALSE ) return; - -#ifdef BLIS_ENABLE_OPENMP - _Pragma( "omp critical (mem)" ) -#endif -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_lock( &mem_manager_mutex ); -#endif - - // BEGIN CRITICAL SECTION - { - // Here, we test the initialization flag again. NOTE: THIS IS NOT - // REDUNDANT. 
This additional test is needed so that other threads - // that may be waiting to acquire the lock do not perform any - // finalization actions once they are finally allowed into this - // critical section. - if ( bli_mem_is_init == TRUE ) - { - // Finalize the memory pools. - bli_mem_finalize_pools(); - - // After finalization, mark the API as uninitialized. - bli_mem_is_init = FALSE; - } - } - // END CRITICAL SECTION - -#ifdef BLIS_ENABLE_PTHREADS - pthread_mutex_unlock( &mem_manager_mutex ); -#endif -} - -bool_t bli_mem_is_initialized( void ) -{ - return bli_mem_is_init; -} - -// ----------------------------------------------------------------------------- - -void bli_mem_init_pools( cntx_t* cntx ) +void bli_membrk_init_pools + ( + cntx_t* cntx, + membrk_t* membrk + ) { // Map each of the packbuf_t values to an index starting at zero. const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); @@ -404,9 +303,9 @@ void bli_mem_init_pools( cntx_t* cntx ) const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; + pool_t* pool_a = bli_membrk_pool( index_a, membrk ); + pool_t* pool_b = bli_membrk_pool( index_b, membrk ); + pool_t* pool_c = bli_membrk_pool( index_c, membrk ); // Start with empty pools. const dim_t num_blocks_a = 0; @@ -418,10 +317,10 @@ void bli_mem_init_pools( cntx_t* cntx ) siz_t block_size_c = 0; // Determine the block size for each memory pool. - bli_mem_compute_pool_block_sizes( &block_size_a, - &block_size_b, - &block_size_c, - cntx ); + bli_membrk_compute_pool_block_sizes( &block_size_a, + &block_size_b, + &block_size_c, + cntx ); // Initialize the memory pools for A, B, and C. 
bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a ); @@ -429,7 +328,11 @@ void bli_mem_init_pools( cntx_t* cntx ) bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c ); } -void bli_mem_reinit_pools( cntx_t* cntx ) +void bli_membrk_reinit_pools + ( + cntx_t* cntx, + membrk_t* membrk + ) { // Map each of the packbuf_t values to an index starting at zero. const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); @@ -439,9 +342,9 @@ void bli_mem_reinit_pools( cntx_t* cntx ) const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; + pool_t* pool_a = bli_membrk_pool( index_a, membrk ); + pool_t* pool_b = bli_membrk_pool( index_b, membrk ); + pool_t* pool_c = bli_membrk_pool( index_c, membrk ); // Query the number of blocks currently allocated in each pool. const dim_t num_blocks_a = bli_pool_num_blocks( pool_a ); @@ -453,10 +356,10 @@ void bli_mem_reinit_pools( cntx_t* cntx ) siz_t block_size_c_new = 0; // Determine the context-implied block size needed for each pool. - bli_mem_compute_pool_block_sizes( &block_size_a_new, - &block_size_b_new, - &block_size_c_new, - cntx ); + bli_membrk_compute_pool_block_sizes( &block_size_a_new, + &block_size_b_new, + &block_size_c_new, + cntx ); // Reinitialize the pool, but only if one of the parameters has // changed in such a way that reinitialization would be required. @@ -468,7 +371,10 @@ void bli_mem_reinit_pools( cntx_t* cntx ) bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c ); } -void bli_mem_finalize_pools( void ) +void bli_membrk_finalize_pools + ( + membrk_t* membrk + ) { // Map each of the packbuf_t values to an index starting at zero. 
dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); @@ -476,9 +382,9 @@ void bli_mem_finalize_pools( void ) dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); // Alias the pool addresses to convenient identifiers. - pool_t* pool_a = &pools[ index_a ]; - pool_t* pool_b = &pools[ index_b ]; - pool_t* pool_c = &pools[ index_c ]; + pool_t* pool_a = bli_membrk_pool( index_a, membrk ); + pool_t* pool_b = bli_membrk_pool( index_b, membrk ); + pool_t* pool_c = bli_membrk_pool( index_c, membrk ); // Finalize the memory pools for A, B, and C. bli_pool_finalize( pool_a ); @@ -488,10 +394,13 @@ void bli_mem_finalize_pools( void ) // ----------------------------------------------------------------------------- -void bli_mem_compute_pool_block_sizes( siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ) +void bli_membrk_compute_pool_block_sizes + ( + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ) { const ind_t im = bli_cntx_get_ind_method( cntx ); @@ -513,11 +422,11 @@ void bli_mem_compute_pool_block_sizes( siz_t* bs_a, // Avoid considering induced methods for real datatypes. 
if ( bli_is_real( dt ) && im != BLIS_NAT ) continue; - bli_mem_compute_pool_block_sizes_dt( dt, - &bs_dt_a, - &bs_dt_b, - &bs_dt_c, - cntx ); + bli_membrk_compute_pool_block_sizes_dt( dt, + &bs_dt_a, + &bs_dt_b, + &bs_dt_c, + cntx ); bs_cand_a = bli_max( bs_dt_a, bs_cand_a ); bs_cand_b = bli_max( bs_dt_b, bs_cand_b ); @@ -532,11 +441,14 @@ void bli_mem_compute_pool_block_sizes( siz_t* bs_a, // ----------------------------------------------------------------------------- -void bli_mem_compute_pool_block_sizes_dt( num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx ) +void bli_membrk_compute_pool_block_sizes_dt + ( + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ) { siz_t size_dt = bli_datatype_size( dt ); diff --git a/frame/base/bli_membrk.h b/frame/base/bli_membrk.h new file mode 100644 index 000000000..cce0f4c1a --- /dev/null +++ b/frame/base/bli_membrk.h @@ -0,0 +1,159 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MEMBRK_H +#define BLIS_MEMBRK_H + + +#define bli_membrk_pool( pool_index, membrk_p ) \ +\ + ( (membrk_p)->pools + (pool_index) ) + +#define bli_membrk_mutex( membrk_p ) \ +\ + ( &( (membrk_p)->mutex ) ) + +#define bli_membrk_malloc_fp( membrk_p ) \ +\ + ( (membrk_p)->malloc_fp ) + +#define bli_membrk_free_fp( membrk_p ) \ +\ + ( (membrk_p)->free_fp ) + +#define bli_membrk_set_malloc_fp( _malloc_fp, membrk_p ) \ +{\ + (membrk_p)->malloc_fp = _malloc_fp; \ +} + +#define bli_membrk_set_free_fp( _free_fp, membrk_p ) \ +{\ + (membrk_p)->free_fp = _free_fp; \ +} + +#define bli_membrk_lock( membrk_p ) \ +{\ + bli_mutex_lock( &((membrk_p)->mutex) ); \ +} + +#define bli_membrk_unlock( membrk_p ) \ +{\ + bli_mutex_unlock( &((membrk_p)->mutex) ); \ +} + +#define bli_membrk_malloc( size, membrk ) \ +\ + /* Call the malloc()-style function in membrk. */ \ + ((membrk)->malloc_fp)( size ) + +#define bli_membrk_free( buf_p, membrk ) \ +\ + /* Call the free()-style function in membrk. 
*/ \ + ((membrk)->free_fp)( buf_p ) + + +// ----------------------------------------------------------------------------- + +void bli_membrk_init + ( + cntx_t* cntx, + membrk_t* membrk + ); +void bli_membrk_finalize + ( + membrk_t* membrk + ); + +void bli_membrk_acquire_m + ( + membrk_t* membrk, + siz_t req_size, + packbuf_t buf_type, + mem_t* mem + ); + +void bli_membrk_acquire_v + ( + membrk_t* membrk, + siz_t req_size, + mem_t* mem + ); + +void bli_membrk_release + ( + mem_t* mem + ); + +siz_t bli_membrk_pool_size + ( + membrk_t* membrk, + packbuf_t buf_type + ); + +// ---------------------------------------------------------------------------- + +void bli_membrk_init_pools + ( + cntx_t* cntx, + membrk_t* membrk + ); +void bli_membrk_reinit_pools + ( + cntx_t* cntx, + membrk_t* membrk + ); +void bli_membrk_finalize_pools + ( + membrk_t* membrk + ); + +void bli_membrk_compute_pool_block_sizes + ( + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ); +void bli_membrk_compute_pool_block_sizes_dt + ( + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + cntx_t* cntx + ); + +#endif + diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c new file mode 100644 index 000000000..e66aafa63 --- /dev/null +++ b/frame/base/bli_memsys.c @@ -0,0 +1,174 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_PTHREADS +pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +static membrk_t global_membrk; + +// ----------------------------------------------------------------------------- + +membrk_t* bli_memsys_global_membrk( void ) +{ + return &global_membrk; +} + +// ----------------------------------------------------------------------------- + +static bool_t bli_memsys_is_init = FALSE; + +void bli_memsys_init( void ) +{ + cntx_t cntx; + + // If the initialization flag is TRUE, we know the API is already + // initialized, so we can return early. + if ( bli_memsys_is_init == TRUE ) return; + + // Create and initialize a context for gemm so we have something + // to pass into bli_membrk_init_pools(). 
+ bli_gemm_cntx_init( &cntx ); + +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // initialization actions once they are finally allowed into this + // critical section. + if ( bli_memsys_is_init == FALSE ) + { + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( &cntx, &global_membrk ); + + // After initialization, mark the API as initialized. + bli_memsys_is_init = TRUE; + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif + + // Finalize the temporary gemm context. + bli_gemm_cntx_finalize( &cntx ); +} + +void bli_memsys_reinit( cntx_t* cntx ) +{ +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // If for some reason the memory pools have not yet been + // initialized (unlikely), we emulate the body of bli_memsys_init(). + if ( bli_memsys_is_init == FALSE ) + { + // Initialize the global membrk_t object and its memory pools. + bli_membrk_init( cntx, &global_membrk ); + + // After initialization, mark the API as initialized. + bli_memsys_is_init = TRUE; + } + else + { + // Reinitialize the global membrk_t object's memory pools. + bli_membrk_reinit_pools( cntx, &global_membrk ); + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif +} + +void bli_memsys_finalize( void ) +{ + // If the initialization flag is FALSE, we know the API is already + // uninitialized, so we can return early. 
+ if ( bli_memsys_is_init == FALSE ) return; + +#ifdef BLIS_ENABLE_OPENMP + _Pragma( "omp critical (mem)" ) +#endif +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_lock( &mem_manager_mutex ); +#endif + + // BEGIN CRITICAL SECTION + { + // Here, we test the initialization flag again. NOTE: THIS IS NOT + // REDUNDANT. This additional test is needed so that other threads + // that may be waiting to acquire the lock do not perform any + // finalization actions once they are finally allowed into this + // critical section. + if ( bli_memsys_is_init == TRUE ) + { + // Finalize the global membrk_t object and its memory pools. + bli_membrk_finalize( &global_membrk ); + + // After finalization, mark the API as uninitialized. + bli_memsys_is_init = FALSE; + } + } + // END CRITICAL SECTION + +#ifdef BLIS_ENABLE_PTHREADS + pthread_mutex_unlock( &mem_manager_mutex ); +#endif +} + +bool_t bli_memsys_is_initialized( void ) +{ + return bli_memsys_is_init; +} + diff --git a/frame/3/herk/old/bli_herk_blk_var3f.h b/frame/base/bli_memsys.h similarity index 77% rename from frame/3/herk/old/bli_herk_blk_var3f.h rename to frame/base/bli_memsys.h index 800a44b8d..0a7b142a7 100644 --- a/frame/3/herk/old/bli_herk_blk_var3f.h +++ b/frame/base/bli_memsys.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,10 +33,20 @@ */ -void bli_herk_blk_var3f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); +#ifndef BLIS_MEMSYS_H +#define BLIS_MEMSYS_H + +// ----------------------------------------------------------------------------- + +membrk_t* bli_memsys_global_membrk( void ); + +// ----------------------------------------------------------------------------- + +void bli_memsys_init( void ); +void bli_memsys_reinit( cntx_t* cntx ); +void bli_memsys_finalize( void ); +bool_t bli_memsys_is_initialized( void ); + + +#endif diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 226b0747a..e1f05d075 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -65,7 +65,6 @@ void bli_obj_create_without_buffer( num_t dt, obj_t* obj ) { siz_t elem_size; - mem_t* pack_mem; void* s; if ( bli_error_checking_is_enabled() ) @@ -98,9 +97,6 @@ void bli_obj_create_without_buffer( num_t dt, bli_obj_set_offs( 0, 0, *obj ); bli_obj_set_diag_offset( 0, *obj ); - pack_mem = bli_obj_pack_mem( *obj ); - bli_mem_set_buffer( NULL, pack_mem ); - // Set the internal scalar to 1.0. 
s = bli_obj_internal_scalar_buffer( *obj ); @@ -467,8 +463,6 @@ num_t bli_datatype_union( num_t dt1, num_t dt2 ) void bli_obj_print( char* label, obj_t* obj ) { FILE* file = stdout; - mem_t* pack_mem = bli_obj_pack_mem( *obj ); - //mem_t* cast_mem = bli_obj_cast_mem( *obj ); if ( bli_error_checking_is_enabled() ) bli_obj_print_check( label, obj ); @@ -491,10 +485,6 @@ void bli_obj_print( char* label, obj_t* obj ) fprintf( file, " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( *obj ), ( signed long int )bli_obj_col_stride( *obj ) ); fprintf( file, " is %ld\n", ( signed long int )bli_obj_imag_stride( *obj ) ); - fprintf( file, " pack_mem \n" ); - fprintf( file, " - buf %p\n", ( void* )bli_mem_buffer( pack_mem ) ); - fprintf( file, " - buf_type %lu\n", ( unsigned long int )bli_mem_buf_type( pack_mem ) ); - fprintf( file, " - size %lu\n", ( unsigned long int )bli_mem_size( pack_mem ) ); fprintf( file, " m_padded %lu\n", ( unsigned long int )bli_obj_padded_length( *obj ) ); fprintf( file, " n_padded %lu\n", ( unsigned long int )bli_obj_padded_width( *obj ) ); fprintf( file, " ps %lu\n", ( unsigned long int )bli_obj_panel_stride( *obj ) ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index 64718353e..738284064 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -38,11 +38,31 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_mdim + ( + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( direct == BLIS_FWD ) + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); + else + bli_acquire_mpart_b2t( req_part, i, b, obj, sub_obj ); +} + + +void bli_acquire_mpart_t2b + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -59,14 +79,14 @@ void bli_acquire_mpart_t2b( subpart_t 
requested_part, // partitioned through normally.) if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_packm_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); return; } // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_t2b_check( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b_check( req_part, i, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -90,7 +110,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, // Compute offset increments and dimensions based on which // subpartition is being requested, assuming no transposition. - if ( requested_part == BLIS_SUBPART0 ) + if ( req_part == BLIS_SUBPART0 ) { // A0 (offm,offn) unchanged. // A0 is i x n. @@ -99,7 +119,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = i; n_part = n; } - else if ( requested_part == BLIS_SUBPART1T ) + else if ( req_part == BLIS_SUBPART1T ) { // A1T (offm,offn) unchanged. // A1T is (i+b) x n. @@ -108,7 +128,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = i + b; n_part = n; } - else if ( requested_part == BLIS_SUBPART1 ) + else if ( req_part == BLIS_SUBPART1 ) { // A1 (offm,offn) += (i,0). // A1 is b x n. @@ -117,7 +137,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = b; n_part = n; } - else if ( requested_part == BLIS_SUBPART1B ) + else if ( req_part == BLIS_SUBPART1B ) { // A1B (offm,offn) += (i,0). // A1B is (m-i) x n. @@ -126,7 +146,7 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, m_part = m - i; n_part = n; } - else // if ( requested_part == BLIS_SUBPART2 ) + else // if ( req_part == BLIS_SUBPART2 ) { // A2 (offm,offn) += (i+b,0). // A2 is (m-i-b) x n. 
@@ -208,11 +228,14 @@ void bli_acquire_mpart_t2b( subpart_t requested_part, } -void bli_acquire_mpart_b2t( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_b2t + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; @@ -222,15 +245,35 @@ void bli_acquire_mpart_b2t( subpart_t requested_part, // Modify i to account for the fact that we are moving backwards. i = m - i - b; - bli_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); } -void bli_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_ndim + ( + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( direct == BLIS_FWD ) + bli_acquire_mpart_l2r( req_part, i, b, obj, sub_obj ); + else + bli_acquire_mpart_r2l( req_part, i, b, obj, sub_obj ); +} + + +void bli_acquire_mpart_l2r + ( + subpart_t req_part, + dim_t j, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -247,14 +290,14 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, // partitioned through normally.) if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_l2r( requested_part, j, b, obj, sub_obj ); + bli_packm_acquire_mpart_l2r( req_part, j, b, obj, sub_obj ); return; } // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_l2r_check( requested_part, j, b, obj, sub_obj ); + bli_acquire_mpart_l2r_check( req_part, j, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -278,7 +321,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, // Compute offset increments and dimensions based on which // subpartition is being requested, assuming no transposition. - if ( requested_part == BLIS_SUBPART0 ) + if ( req_part == BLIS_SUBPART0 ) { // A0 (offm,offn) unchanged. // A0 is m x j. 
@@ -287,7 +330,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = j; } - else if ( requested_part == BLIS_SUBPART1L ) + else if ( req_part == BLIS_SUBPART1L ) { // A1L (offm,offn) unchanged. // A1L is m x (j+b). @@ -296,7 +339,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = j + b; } - else if ( requested_part == BLIS_SUBPART1 ) + else if ( req_part == BLIS_SUBPART1 ) { // A1 (offm,offn) += (0,j). // A1 is m x b. @@ -305,7 +348,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = b; } - else if ( requested_part == BLIS_SUBPART1R ) + else if ( req_part == BLIS_SUBPART1R ) { // A1R (offm,offn) += (0,j). // A1R is m x (n-j). @@ -314,7 +357,7 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, m_part = m; n_part = n - j; } - else // if ( requested_part == BLIS_SUBPART2 ) + else // if ( req_part == BLIS_SUBPART2 ) { // A2 (offm,offn) += (0,j+b). // A2 is m x (n-j-b). @@ -395,11 +438,14 @@ void bli_acquire_mpart_l2r( subpart_t requested_part, } -void bli_acquire_mpart_r2l( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_r2l + ( + subpart_t req_part, + dim_t j, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t n; @@ -409,15 +455,18 @@ void bli_acquire_mpart_r2l( subpart_t requested_part, // Modify i to account for the fact that we are moving backwards. j = n - j - b; - bli_acquire_mpart_l2r( requested_part, j, b, obj, sub_obj ); + bli_acquire_mpart_l2r( req_part, j, b, obj, sub_obj ); } -void bli_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_tl2br + ( + subpart_t req_part, + dim_t ij, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { dim_t m; dim_t n; @@ -435,14 +484,14 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // partitioned through normally.) 
if ( bli_obj_is_panel_packed( *obj ) ) { - bli_packm_acquire_mpart_tl2br( requested_part, ij, b, obj, sub_obj ); + bli_packm_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj ); return; } // Check parameters. if ( bli_error_checking_is_enabled() ) - bli_acquire_mpart_tl2br_check( requested_part, ij, b, obj, sub_obj ); + bli_acquire_mpart_tl2br_check( req_part, ij, b, obj, sub_obj ); // Query the m and n dimensions of the object (accounting for @@ -469,7 +518,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // subpartition is being requested, assuming no transposition. // Left column of subpartitions - if ( requested_part == BLIS_SUBPART00 ) + if ( req_part == BLIS_SUBPART00 ) { // A00 (offm,offn) unchanged. // A00 is ij x ij. @@ -478,7 +527,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = ij; } - else if ( requested_part == BLIS_SUBPART10 ) + else if ( req_part == BLIS_SUBPART10 ) { // A10 (offm,offn) += (ij,0). // A10 is b x ij. @@ -487,7 +536,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = ij; } - else if ( requested_part == BLIS_SUBPART20 ) + else if ( req_part == BLIS_SUBPART20 ) { // A20 (offm,offn) += (ij+b,0). // A20 is (m-ij-b) x ij. @@ -498,7 +547,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } // Middle column of subpartitions. - else if ( requested_part == BLIS_SUBPART01 ) + else if ( req_part == BLIS_SUBPART01 ) { // A01 (offm,offn) += (0,ij). // A01 is ij x b. @@ -507,7 +556,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = b; } - else if ( requested_part == BLIS_SUBPART11 ) + else if ( req_part == BLIS_SUBPART11 ) { // A11 (offm,offn) += (ij,ij). // A11 is b x b. @@ -516,7 +565,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = b; } - else if ( requested_part == BLIS_SUBPART21 ) + else if ( req_part == BLIS_SUBPART21 ) { // A21 (offm,offn) += (ij+b,ij). // A21 is (m-ij-b) x b. 
@@ -527,7 +576,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } // Right column of subpartitions. - else if ( requested_part == BLIS_SUBPART02 ) + else if ( req_part == BLIS_SUBPART02 ) { // A02 (offm,offn) += (0,ij+b). // A02 is ij x (n-ij-b). @@ -536,7 +585,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = ij; n_part = n - ij - b; } - else if ( requested_part == BLIS_SUBPART12 ) + else if ( req_part == BLIS_SUBPART12 ) { // A12 (offm,offn) += (ij,ij+b). // A12 is b x (n-ij-b). @@ -545,7 +594,7 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, m_part = b; n_part = n - ij - b; } - else // if ( requested_part == BLIS_SUBPART22 ) + else // if ( req_part == BLIS_SUBPART22 ) { // A22 (offm,offn) += (ij+b,ij+b). // A22 is (m-ij-b) x (n-ij-b). @@ -588,9 +637,9 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, // we let the subpartition inherit the storage structure of its immediate // parent. if ( !bli_obj_root_is_general( *sub_obj ) && - requested_part != BLIS_SUBPART00 && - requested_part != BLIS_SUBPART11 && - requested_part != BLIS_SUBPART22 ) + req_part != BLIS_SUBPART00 && + req_part != BLIS_SUBPART11 && + req_part != BLIS_SUBPART22 ) { // FGVZ: Fix me. This needs to be cleaned up. Either non-diagonal // intersecting subpartitions should inherit their root object's @@ -638,11 +687,14 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part, } -void bli_acquire_mpart_br2tl( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_br2tl + ( + subpart_t req_part, + dim_t ij, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { // Query the dimension of the object. dim_t mn = bli_obj_length( *obj ); @@ -650,35 +702,73 @@ void bli_acquire_mpart_br2tl( subpart_t requested_part, // Modify ij to account for the fact that we are moving backwards. 
ij = mn - ij - b; - bli_acquire_mpart_tl2br( requested_part, ij, b, obj, sub_obj ); + bli_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj ); } // -- Vector partitioning ------------------------------------------------------ -void bli_acquire_vpart_f2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_vpart_f2b + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { if ( bli_obj_is_col_vector( *obj ) ) - bli_acquire_mpart_t2b( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_t2b( req_part, i, b, obj, sub_obj ); else // if ( bli_obj_is_row_vector( *obj ) ) - bli_acquire_mpart_l2r( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_l2r( req_part, i, b, obj, sub_obj ); } -void bli_acquire_vpart_b2f( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_vpart_b2f + ( + subpart_t req_part, + dim_t i, + dim_t b, + obj_t* obj, + obj_t* sub_obj + ) { if ( bli_obj_is_col_vector( *obj ) ) - bli_acquire_mpart_b2t( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_b2t( req_part, i, b, obj, sub_obj ); else // if ( bli_obj_is_row_vector( *obj ) ) - bli_acquire_mpart_r2l( requested_part, i, b, obj, sub_obj ); + bli_acquire_mpart_r2l( req_part, i, b, obj, sub_obj ); +} + + +// -- Scalar acquisition ------------------------------------------------------- + + +void bli_acquire_mij + ( + dim_t i, + dim_t j, + obj_t* obj, + obj_t* sub_obj + ) +{ + obj_t tmp_obj; + + bli_acquire_mpart_l2r( BLIS_SUBPART1, j, 1, obj, &tmp_obj ); + bli_acquire_mpart_t2b( BLIS_SUBPART1, i, 1, &tmp_obj, sub_obj ); +} + + +void bli_acquire_vi + ( + dim_t i, + obj_t* obj, + obj_t* sub_obj + ) +{ + if ( bli_obj_is_col_vector( *obj ) ) + bli_acquire_mpart_t2b( BLIS_SUBPART1, i, 1, obj, sub_obj ); + else // if ( bli_obj_is_row_vector( *obj ) ) + bli_acquire_mpart_l2r( BLIS_SUBPART1, i, 1, obj, sub_obj ); } diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h 
index ed1fa0d15..fd24f1d82 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -36,50 +36,60 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_b2t( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + obj_t* obj, \ + obj_t* sub_obj \ + ); -void bli_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_r2l( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +GENPROT( acquire_mpart_mdim ) +GENPROT( acquire_mpart_ndim ) -void bli_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); -void bli_acquire_mpart_br2tl( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +void PASTEMAC0( opname ) \ + ( \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + obj_t* obj, \ + obj_t* sub_obj \ + ); + +GENPROT( acquire_mpart_t2b ) +GENPROT( acquire_mpart_b2t ) +GENPROT( acquire_mpart_l2r ) +GENPROT( acquire_mpart_r2l ) +GENPROT( acquire_mpart_tl2br ) +GENPROT( acquire_mpart_br2tl ) // -- Vector partitioning ------------------------------------------------------ -void bli_acquire_vpart_f2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +GENPROT( acquire_vpart_f2b ) +GENPROT( acquire_vpart_b2f ) -void bli_acquire_vpart_b2f( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +// -- Scalar acquisition ------------------------------------------------------- + +void bli_acquire_mij + ( + dim_t i, + dim_t j, + obj_t* obj, + obj_t* sub_obj 
+ ); + +void bli_acquire_vi + ( + dim_t i, + obj_t* obj, + obj_t* sub_obj + ); diff --git a/frame/base/bli_mem.c.prev b/frame/base/old/bli_mem.c.prev similarity index 100% rename from frame/base/bli_mem.c.prev rename to frame/base/old/bli_mem.c.prev diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c index 24aa192e3..1b63e0b7e 100644 --- a/frame/compat/bla_amax.c +++ b/frame/compat/bla_amax.c @@ -80,7 +80,8 @@ f77_int PASTEF772(i,chx,blasname) \ ); \ \ /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran) - index. */ \ + index. Also, if the BLAS integer size differs from the BLIS + integer size, that typecast occurs here. */ \ f77_index = bli_index + 1; \ \ /* Finalize BLIS (if it was initialized above). */ \ diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index 577a4f5f8..e66851194 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -82,6 +82,22 @@ // Default behavior is disabled. #endif +// Perform a sanity check to make sure the user doesn't try to enable +// both OpenMP and pthreads. +#if defined ( BLIS_ENABLE_OPENMP ) && \ + defined ( BLIS_ENABLE_PTHREADS ) + #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." +#endif + +// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP +// or pthreads are enabled. This macro is useful in situations when +// we want to detect use of either OpenMP or pthreads (as opposed +// to neither being used). 
+#if defined ( BLIS_ENABLE_OPENMP ) || \ + defined ( BLIS_ENABLE_PTHREADS ) + #define BLIS_ENABLE_MULTITHREADING +#endif + // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 9ac03de97..a50968845 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -46,6 +46,5 @@ extern obj_t BLIS_MINUS_TWO; extern thrcomm_t BLIS_SINGLE_COMM; extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; -extern thrinfo_t BLIS_HERK_SINGLE_THREADED; #endif diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 305253241..365247aee 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -723,6 +723,24 @@ // Level-1v // +// amaxv kernels + +#ifndef BLIS_SAMAXV_KERNEL +#define BLIS_SAMAXV_KERNEL BLIS_SAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_DAMAXV_KERNEL +#define BLIS_DAMAXV_KERNEL BLIS_DAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_CAMAXV_KERNEL +#define BLIS_CAMAXV_KERNEL BLIS_CAMAXV_KERNEL_REF +#endif + +#ifndef BLIS_ZAMAXV_KERNEL +#define BLIS_ZAMAXV_KERNEL BLIS_ZAMAXV_KERNEL_REF +#endif + // addv kernels #ifndef BLIS_SADDV_KERNEL diff --git a/frame/include/bli_kernel_pre_macro_defs.h b/frame/include/bli_kernel_pre_macro_defs.h index 81f1deb98..a65fc435d 100644 --- a/frame/include/bli_kernel_pre_macro_defs.h +++ b/frame/include/bli_kernel_pre_macro_defs.h @@ -267,6 +267,13 @@ #define BLIS_CADDV_KERNEL_REF bli_caddv_ref #define BLIS_ZADDV_KERNEL_REF bli_zaddv_ref +// amaxv kernels + +#define BLIS_SAMAXV_KERNEL_REF bli_samaxv_ref +#define BLIS_DAMAXV_KERNEL_REF bli_damaxv_ref +#define BLIS_CAMAXV_KERNEL_REF bli_camaxv_ref +#define BLIS_ZAMAXV_KERNEL_REF bli_zamaxv_ref + // axpbyv kernels #define BLIS_SAXPBYV_KERNEL_REF bli_saxpbyv_ref diff --git a/frame/include/bli_kernel_prototypes.h b/frame/include/bli_kernel_prototypes.h index 
d05505305..7894140a8 100644 --- a/frame/include/bli_kernel_prototypes.h +++ b/frame/include/bli_kernel_prototypes.h @@ -169,6 +169,11 @@ #define bli_caddv_ker_name BLIS_CADDV_KERNEL #define bli_zaddv_ker_name BLIS_ZADDV_KERNEL +#define bli_samaxv_ker_name BLIS_SAMAXV_KERNEL +#define bli_damaxv_ker_name BLIS_DAMAXV_KERNEL +#define bli_camaxv_ker_name BLIS_CAMAXV_KERNEL +#define bli_zamaxv_ker_name BLIS_ZAMAXV_KERNEL + #define bli_saxpbyv_ker_name BLIS_SAXPBYV_KERNEL #define bli_daxpbyv_ker_name BLIS_DAXPBYV_KERNEL #define bli_caxpbyv_ker_name BLIS_CAXPBYV_KERNEL diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h index 01cf44e79..d99be2345 100644 --- a/frame/include/bli_macro_defs.h +++ b/frame/include/bli_macro_defs.h @@ -120,14 +120,12 @@ #include "bli_gentfunc_macro_defs.h" #include "bli_gentprot_macro_defs.h" -#include "bli_mem_macro_defs.h" #include "bli_obj_macro_defs.h" #include "bli_param_macro_defs.h" #include "bli_complex_macro_defs.h" #include "bli_scalar_macro_defs.h" #include "bli_error_macro_defs.h" #include "bli_blas_macro_defs.h" -#include "bli_auxinfo_macro_defs.h" #endif diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 30c72e735..0d5992900 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -811,21 +812,6 @@ bli_obj_width_stored( obj ) (obj).elem_size = size; \ } - -// Pack mem_t entry query - -#define bli_obj_pack_mem( obj ) \ -\ - ( &((obj).pack_mem) ) - -// Pack mem_t entry modification - -#define bli_obj_set_pack_mem( mem_p, obj ) \ -{ \ - (obj).pack_mem = *mem_p; \ -} - - // Packed matrix info query #define bli_obj_padded_length( obj ) \ @@ -838,6 +824,12 @@ bli_obj_width_stored( obj ) // Packed matrix info modification +#define bli_obj_set_buffer_to_mem( mem_p, obj ) \ +{ \ + void* buf = bli_mem_buffer( mem_p ); \ + bli_obj_set_buffer( buf, obj ); \ +} \ + #define bli_obj_set_padded_length( m0, obj ) \ { \ (obj).m_padded = m0; \ @@ -899,15 +891,7 @@ bli_obj_width_stored( obj ) // -- Miscellaneous object macros -- -// Make a special alias (shallow copy) that does not overwrite pack_mem -// entry. 
- -#define bli_obj_alias_for_packing( a, b ) \ -{ \ - bli_obj_init_basic_shallow_copy_of( a, b ); \ -} - -// Make a full alias (shallow copy), including pack_mem and friends +// Make a full alias (shallow copy) #define bli_obj_alias_to( a, b ) \ { \ @@ -947,28 +931,6 @@ bli_obj_width_stored( obj ) } -// Initialize object for packing purposes - -#define bli_obj_init_pack( obj_p ) \ -{ \ - mem_t* pack_mem_ = bli_obj_pack_mem( *obj_p ); \ -\ - bli_mem_set_buffer( NULL, pack_mem_ ); \ -} - - -// Release object's pack (and cast) memory entries back to memory manager - -#define bli_obj_release_pack( obj_p ) \ -{ \ - mem_t* pack_mem_ = bli_obj_pack_mem( *(obj_p) ); \ -\ - if ( bli_mem_is_alloc( pack_mem_ ) ) \ - bli_mem_release( pack_mem_ ); \ -} - - - // Submatrix/scalar buffer acquisition #define BLIS_CONSTANT_SLOT_SIZE BLIS_MAX_TYPE_SIZE diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 8869cea17..50ddd5d1f 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -1104,6 +1104,14 @@ else if ( bli_is_scomplex( dt ) ) PASTEMAC(c,fname)(o0,o1,o2); \ else if ( bli_is_dcomplex( dt ) ) PASTEMAC(z,fname)(o0,o1,o2); \ } +#define bli_call_ft_3i( dt, fname, o0, o1, o2 ) \ +{ \ + if ( bli_is_float( dt ) ) PASTEMAC(s,fname)(o0,o1,o2); \ + else if ( bli_is_double( dt ) ) PASTEMAC(d,fname)(o0,o1,o2); \ + else if ( bli_is_scomplex( dt ) ) PASTEMAC(c,fname)(o0,o1,o2); \ + else if ( bli_is_dcomplex( dt ) ) PASTEMAC(z,fname)(o0,o1,o2); \ + else if ( bli_is_int( dt ) ) PASTEMAC(i,fname)(o0,o1,o2); \ +} #define bli_call_ft_4( dt, fname, o0, o1, o2, o3 ) \ { \ if ( bli_is_float( dt ) ) PASTEMAC(s,fname)(o0,o1,o2,o3); \ diff --git a/frame/include/bli_system.h b/frame/include/bli_system.h index 57fe810fc..05139136b 100644 --- a/frame/include/bli_system.h +++ b/frame/include/bli_system.h @@ -40,6 +40,7 @@ #include #include #include +#include // Determine if we are on a 64-bit or 32-bit architecture #if 
defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 88ec4a349..4553ff539 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -173,7 +174,6 @@ typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; - // // -- BLIS info bit field offsets ---------------------------------------------- // @@ -501,215 +501,14 @@ typedef enum } packbuf_t; -// -// -- BLIS misc. structure types ----------------------------------------------- -// +// -- Partitioning direction -- -// -- Pool block type -- - -typedef struct +typedef enum { - void* buf_sys; - void* buf_align; -} pblk_t; + BLIS_FWD, + BLIS_BWD +} dir_t; -// -- Pool type -- - -typedef struct -{ - pblk_t* block_ptrs; - dim_t block_ptrs_len; - - dim_t top_index; - dim_t num_blocks; - - siz_t block_size; - siz_t align_size; -} pool_t; - -// -- Memory object type -- - -typedef struct mem_s -{ - pblk_t pblk; - packbuf_t buf_type; - pool_t* pool; - siz_t size; -} mem_t; - -// -- Blocksize object type -- - -typedef struct blksz_s -{ - // Primary blocksize values. - dim_t v[BLIS_NUM_FP_TYPES]; - - // Blocksize extensions. - dim_t e[BLIS_NUM_FP_TYPES]; - -} blksz_t; - -// -- Function pointer object type -- - -typedef struct func_s -{ - // Kernel function address. 
- void* ptr[BLIS_NUM_FP_TYPES]; - -} func_t; - -// -- Multi-boolean object type -- - -typedef struct mbool_s -{ - bool_t v[BLIS_NUM_FP_TYPES]; - -} mbool_t; - -// -- Auxiliary kernel info type -- - -// Note: This struct is used by macro-kernels to package together extra -// parameter values that may be of use to the micro-kernel without -// cluttering up the micro-kernel interface itself. - -typedef struct -{ - // The pack schemas of A and B. - pack_t schema_a; - pack_t schema_b; - - // Pointers to the micro-panels of A and B which will be used by the - // next call to the micro-kernel. - const void* a_next; - const void* b_next; - - // The imaginary strides of A and B. - inc_t is_a; - inc_t is_b; - -} auxinfo_t; - - - -// -// -- BLIS object type definitions --------------------------------------------- -// - -typedef struct obj_s -{ - // Basic fields - struct obj_s* root; - - dim_t off[2]; - dim_t dim[2]; - doff_t diag_off; - - objbits_t info; - siz_t elem_size; - - void* buffer; - inc_t rs; - inc_t cs; - inc_t is; - - // Bufferless scalar storage - atom_t scalar; - - // Pack-related fields - mem_t pack_mem; // cached memory region for packing - dim_t m_padded; // m dimension of matrix, including any padding - dim_t n_padded; // n dimension of matrix, including any padding - inc_t ps; // panel stride (distance to next panel) - inc_t pd; // panel dimension (the "width" of a panel: - // usually MR or NR) - dim_t m_panel; // m dimension of a "full" panel - dim_t n_panel; // n dimension of a "full" panel -} obj_t; - - -// Define these macros here since they must be updated if contents of -// obj_t changes. 
-#define bli_obj_init_basic_shallow_copy_of( a, b ) \ -{ \ - (b).root = (a).root; \ -\ - (b).off[0] = (a).off[0]; \ - (b).off[1] = (a).off[1]; \ - (b).dim[0] = (a).dim[0]; \ - (b).dim[1] = (a).dim[1]; \ - (b).diag_off = (a).diag_off; \ -\ - (b).info = (a).info; \ - (b).elem_size = (a).elem_size; \ -\ - (b).buffer = (a).buffer; \ - (b).rs = (a).rs; \ - (b).cs = (a).cs; \ - (b).is = (a).is; \ -\ - (b).scalar = (a).scalar; \ -\ - /* We must NOT copy pack_mem field since this macro forms the basis of - bli_obj_alias_to(), which is used in packm_init(). There, we want to - copy the basic fields of the obj_t but PRESERVE the pack_mem field - of the destination object since it holds the "cached" mem_t object - and buffer. The other fields, such as padded dimensions, are always - set by bli_packm_init(), so we don't need to copy them either. */ \ -} - -#define bli_obj_init_full_shallow_copy_of( a, b ) \ -{ \ - /* This macro implements a full alias (shallow copy) that copies all - fields of the obj_t struct. */ \ - bli_obj_init_basic_shallow_copy_of( a, b ); \ -\ - (b).pack_mem = (a).pack_mem; \ - (b).m_padded = (a).m_padded; \ - (b).n_padded = (a).n_padded; \ - (b).ps = (a).ps; \ - (b).pd = (a).pd; \ - (b).m_panel = (a).m_panel; \ - (b).n_panel = (a).n_panel; \ -} - -#define bli_obj_init_subpart_from( a, b ) \ -{ \ - (b).root = (a).root; \ -\ - (b).off[0] = (a).off[0]; \ - (b).off[1] = (a).off[1]; \ - /* Avoid copying m since it will be overwritten. */ \ - /* Avoid copying n since it will be overwritten. */ \ - (b).diag_off = (a).diag_off; \ -\ - (b).info = (a).info; \ - (b).elem_size = (a).elem_size; \ -\ - (b).buffer = (a).buffer; \ - (b).rs = (a).rs; \ - (b).cs = (a).cs; \ - (b).is = (a).is; \ -\ - (b).scalar = (a).scalar; \ -\ - /* We want to copy the pack_mem field here because this macro is used - when creating subpartitions, including those of packed objects. 
In - those situations, we want the subpartition to inherit the pack_mem - field of its parent, as well as other related fields such as the - padded dimensions. */ \ - (b).pack_mem = (a).pack_mem; \ - (b).m_padded = (a).m_padded; \ - (b).n_padded = (a).n_padded; \ - (b).pd = (a).pd; \ - (b).ps = (a).ps; \ - (b).m_panel = (a).m_panel; \ - (b).n_panel = (a).n_panel; \ -} - - -// -// -- Other BLIS enumerated type definitions ----------------------------------- -// // -- Subpartition type -- @@ -764,6 +563,7 @@ typedef enum #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 + // -- Induced method types -- typedef enum @@ -780,11 +580,13 @@ typedef enum #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) + // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, + BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, @@ -799,7 +601,8 @@ typedef enum BLIS_XPBYV_KER, } l1vkr_t; -#define BLIS_NUM_LEVEL1V_KERS 13 +#define BLIS_NUM_LEVEL1V_KERS 14 + typedef enum { @@ -812,6 +615,7 @@ typedef enum #define BLIS_NUM_LEVEL1F_KERS 5 + typedef enum { BLIS_GEMM_UKR = 0, @@ -823,6 +627,7 @@ typedef enum #define BLIS_NUM_LEVEL3_UKRS 5 + typedef enum { BLIS_REFERENCE_UKERNEL = 0, @@ -834,6 +639,21 @@ typedef enum #define BLIS_NUM_UKR_IMPL_TYPES 4 +#if 0 +typedef enum +{ + BLIS_JC_IDX = 0, + BLIS_PC_IDX, + BLIS_IC_IDX, + BLIS_JR_IDX, + BLIS_IR_IDX, + BLIS_PR_IDX, +} thridx_t; +#endif + +#define BLIS_NUM_LOOPS 6 + + // -- Operation ID type -- typedef enum @@ -884,11 +704,244 @@ typedef enum BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_VF, // level-1v vector fusing factor + + BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 13 +// +// -- BLIS misc. 
structure types ----------------------------------------------- +// + +// -- Pool block type -- + +typedef struct +{ + void* buf_sys; + void* buf_align; +} pblk_t; + +// -- Pool type -- + +typedef struct +{ + pblk_t* block_ptrs; + dim_t block_ptrs_len; + + dim_t top_index; + dim_t num_blocks; + + siz_t block_size; + siz_t align_size; +} pool_t; + +// -- Mutex object type -- + +#include "bli_mutex.h" +#include "bli_malloc.h" + +// -- Memory broker object type -- + +typedef struct membrk_s +{ + pool_t pools[3]; + mtx_t mutex; + + malloc_ft malloc_fp; + free_ft free_fp; +} membrk_t; + +// -- Memory object type -- + +typedef struct mem_s +{ + pblk_t pblk; + packbuf_t buf_type; + pool_t* pool; + membrk_t* membrk; + siz_t size; +} mem_t; + +// -- Control tree node type -- + +struct cntl_s +{ + // Basic fields (usually required). + bszid_t bszid; + void* var_func; + struct cntl_s* sub_node; + + // Optional fields (needed only by some operations such as packm). + // NOTE: first field of params must be a uint64_t containing the size + // of the struct. + void* params; + + // Internal fields that track "cached" data. + mem_t pack_mem; +}; +typedef struct cntl_s cntl_t; + + +// -- Blocksize object type -- + +typedef struct blksz_s +{ + // Primary blocksize values. + dim_t v[BLIS_NUM_FP_TYPES]; + + // Blocksize extensions. + dim_t e[BLIS_NUM_FP_TYPES]; + +} blksz_t; + + +// -- Function pointer object type -- + +typedef struct func_s +{ + // Kernel function address. + void* ptr[BLIS_NUM_FP_TYPES]; + +} func_t; + + +// -- Multi-boolean object type -- + +typedef struct mbool_s +{ + bool_t v[BLIS_NUM_FP_TYPES]; + +} mbool_t; + + +// -- Auxiliary kernel info type -- + +// Note: This struct is used by macro-kernels to package together extra +// parameter values that may be of use to the micro-kernel without +// cluttering up the micro-kernel interface itself. + +typedef struct +{ + // The pack schemas of A and B. 
+ pack_t schema_a; + pack_t schema_b; + + // Pointers to the micro-panels of A and B which will be used by the + // next call to the micro-kernel. + const void* a_next; + const void* b_next; + + // The imaginary strides of A and B. + inc_t is_a; + inc_t is_b; + +} auxinfo_t; + + +// +// -- BLIS object type definitions --------------------------------------------- +// + +typedef struct obj_s +{ + // Basic fields + struct obj_s* root; + + dim_t off[2]; + dim_t dim[2]; + doff_t diag_off; + + objbits_t info; + siz_t elem_size; + + void* buffer; + inc_t rs; + inc_t cs; + inc_t is; + + // Bufferless scalar storage + atom_t scalar; + + // Pack-related fields + dim_t m_padded; // m dimension of matrix, including any padding + dim_t n_padded; // n dimension of matrix, including any padding + inc_t ps; // panel stride (distance to next panel) + inc_t pd; // panel dimension (the "width" of a panel: + // usually MR or NR) + dim_t m_panel; // m dimension of a "full" panel + dim_t n_panel; // n dimension of a "full" panel +} obj_t; + + +// Define these macros here since they must be updated if contents of +// obj_t changes. + +#define bli_obj_init_full_shallow_copy_of( a, b ) \ +{ \ + (b).root = (a).root; \ +\ + (b).off[0] = (a).off[0]; \ + (b).off[1] = (a).off[1]; \ + (b).dim[0] = (a).dim[0]; \ + (b).dim[1] = (a).dim[1]; \ + (b).diag_off = (a).diag_off; \ +\ + (b).info = (a).info; \ + (b).elem_size = (a).elem_size; \ +\ + (b).buffer = (a).buffer; \ + (b).rs = (a).rs; \ + (b).cs = (a).cs; \ + (b).is = (a).is; \ +\ + (b).scalar = (a).scalar; \ +\ + /*(b).pack_mem = (a).pack_mem;*/ \ + (b).m_padded = (a).m_padded; \ + (b).n_padded = (a).n_padded; \ + (b).ps = (a).ps; \ + (b).pd = (a).pd; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ +} + +#define bli_obj_init_subpart_from( a, b ) \ +{ \ + (b).root = (a).root; \ +\ + (b).off[0] = (a).off[0]; \ + (b).off[1] = (a).off[1]; \ + /* Avoid copying m since it will be overwritten. 
*/ \ + /* Avoid copying n since it will be overwritten. */ \ + (b).diag_off = (a).diag_off; \ +\ + (b).info = (a).info; \ + (b).elem_size = (a).elem_size; \ +\ + (b).buffer = (a).buffer; \ + (b).rs = (a).rs; \ + (b).cs = (a).cs; \ + (b).is = (a).is; \ +\ + (b).scalar = (a).scalar; \ +\ + /* We want to copy the pack_mem field here because this macro is used + when creating subpartitions, including those of packed objects. In + those situations, we want the subpartition to inherit the pack_mem + field of its parent, as well as other related fields such as the + padded dimensions. */ \ + /*(b).pack_mem = (a).pack_mem;*/ \ + (b).m_padded = (a).m_padded; \ + (b).n_padded = (a).n_padded; \ + (b).pd = (a).pd; \ + (b).ps = (a).ps; \ + (b).m_panel = (a).m_panel; \ + (b).n_panel = (a).n_panel; \ +} + + // -- Context type -- typedef struct cntx_s @@ -905,11 +958,15 @@ typedef struct cntx_s func_t packm_ukrs; + opid_t family; ind_t method; pack_t schema_a; pack_t schema_b; pack_t schema_c; + dim_t thrloop[ BLIS_NUM_LOOPS ]; + + membrk_t* membrk; } cntx_t; diff --git a/frame/include/blis.h b/frame/include/blis.h index 74b3d3233..b7611cd60 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -5,6 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -80,8 +81,6 @@ extern "C" { #include "bli_kernel_prototypes.h" -#include "bli_malloc_prototypes.h" - // -- Threading definitions -- @@ -97,13 +96,14 @@ extern "C" { #include "bli_init.h" #include "bli_const.h" -#include "bli_malloc.h" #include "bli_obj.h" #include "bli_obj_scalar.h" #include "bli_cntx.h" #include "bli_gks.h" #include "bli_ind.h" +#include "bli_membrk.h" #include "bli_pool.h" +#include "bli_memsys.h" #include "bli_mem.h" #include "bli_part.h" #include "bli_prune.h" @@ -111,6 +111,7 @@ extern "C" { #include "bli_blksz.h" #include "bli_func.h" #include "bli_mbool.h" +#include "bli_auxinfo.h" #include "bli_param_map.h" #include "bli_clock.h" #include "bli_check.h" diff --git a/frame/include/level0/bli_gets.h b/frame/include/level0/bli_gets.h index 36e9af5c3..92d018159 100644 --- a/frame/include/level0/bli_gets.h +++ b/frame/include/level0/bli_gets.h @@ -46,27 +46,38 @@ #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } 
#define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } +#define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } + +#define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } +#define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } +#define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } +#define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } +#define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) +#define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif diff --git a/frame/include/level0/bli_sets.h b/frame/include/level0/bli_sets.h index 551d03025..61bd7e426 100644 --- a/frame/include/level0/bli_sets.h +++ b/frame/include/level0/bli_sets.h @@ -45,11 +45,13 @@ #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } +#define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } +#define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX @@ -57,11 +59,13 @@ #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } 
#define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } +#define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } +#define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX @@ -77,11 +81,18 @@ #endif // BLIS_ENABLE_C99_COMPLEX +#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } +#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } +#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } +#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } +#define bli_iisets( xr, xi, y ) { (y) = (xr); } + #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) +#define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif diff --git a/frame/include/old/bli_kernel_post_macro_defs.h b/frame/include/old/bli_kernel_post_macro_defs.h deleted file mode 100644 index 4a261b033..000000000 --- a/frame/include/old/bli_kernel_post_macro_defs.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_KERNEL_POST_MACRO_DEFS_H -#define BLIS_KERNEL_POST_MACRO_DEFS_H - - -// -- Maximum register blocksize search ---------------------------------------- - -// The macro-kernels oftentimes need to statically allocate a temporary -// MR x NR micro-tile of C. This micro-tile must be sized such that it will -// work for both native and induced implementations, since the user can switch -// between them at runtime. In order to facilitate the sizing of those -// micro-tiles, we must determine the largest the register blocksizes would -// need to be to accommodate both native and induced-based complex -// micro-kernels. For real datatypes, the maximum is never larger than the -// actual s and d register blocksizes. 
However, for complex datatypes, the -// "native" register blocksizes may differ from the "virtual" register -// blocksizes used by the induced implementations. Usually, it is the register -// blocksizes used for induced-based complex micro-kernels that would be -// larger, and thus determine the maximum for c and z datatypes. But, we -// prefer not to assume this, therefore, we always take the larger of the -// two values. - -#define BLIS_DEFAULT_IND_MR_C BLIS_DEFAULT_MR_S -#define BLIS_DEFAULT_IND_NR_C BLIS_DEFAULT_NR_S -#define BLIS_DEFAULT_IND_MR_Z BLIS_DEFAULT_MR_D -#define BLIS_DEFAULT_IND_NR_Z BLIS_DEFAULT_NR_D - -// -// Find the largest register blocksize MR. -// - -#define BLIS_MAX_DEFAULT_MR_S BLIS_DEFAULT_MR_S -#define BLIS_MAX_DEFAULT_MR_D BLIS_DEFAULT_MR_D - -// Choose between the native and induced blocksize for scomplex. -#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_MR_C -#if BLIS_DEFAULT_IND_MR_C > BLIS_MAX_DEFAULT_MR_C -#undef BLIS_MAX_DEFAULT_MR_C -#define BLIS_MAX_DEFAULT_MR_C BLIS_DEFAULT_IND_MR_C -#endif - -// Choose between the native and induced blocksize for dcomplex. -#define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_MR_Z -#if BLIS_DEFAULT_IND_MR_Z > BLIS_MAX_DEFAULT_MR_Z -#undef BLIS_MAX_DEFAULT_MR_Z -#define BLIS_MAX_DEFAULT_MR_Z BLIS_DEFAULT_IND_MR_Z -#endif - -// -// Find the largest register blocksize NR. -// - -#define BLIS_MAX_DEFAULT_NR_S BLIS_DEFAULT_NR_S -#define BLIS_MAX_DEFAULT_NR_D BLIS_DEFAULT_NR_D - -// Choose between the native and induced blocksize for scomplex. -#define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_NR_C -#if BLIS_DEFAULT_IND_NR_C > BLIS_MAX_DEFAULT_NR_C -#undef BLIS_MAX_DEFAULT_NR_C -#define BLIS_MAX_DEFAULT_NR_C BLIS_DEFAULT_IND_NR_C -#endif - -// Choose between the native and induced blocksize for dcomplex. 
-#define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_NR_Z -#if BLIS_DEFAULT_IND_NR_Z > BLIS_MAX_DEFAULT_NR_Z -#undef BLIS_MAX_DEFAULT_NR_Z -#define BLIS_MAX_DEFAULT_NR_Z BLIS_DEFAULT_IND_NR_Z -#endif - - -// -- Abbreiviated macros ------------------------------------------------------ - -// Here, we shorten the maximum blocksizes found above so that they can be -// derived via the PASTEMAC macro. - -// Maximum MR blocksizes - -#define bli_smaxmr BLIS_MAX_DEFAULT_MR_S -#define bli_dmaxmr BLIS_MAX_DEFAULT_MR_D -#define bli_cmaxmr BLIS_MAX_DEFAULT_MR_C -#define bli_zmaxmr BLIS_MAX_DEFAULT_MR_Z - -// Maximum NR blocksizes - -#define bli_smaxnr BLIS_MAX_DEFAULT_NR_S -#define bli_dmaxnr BLIS_MAX_DEFAULT_NR_D -#define bli_cmaxnr BLIS_MAX_DEFAULT_NR_C -#define bli_zmaxnr BLIS_MAX_DEFAULT_NR_Z - - -#endif - diff --git a/frame/include/old/bli_kernel_prototypes.h b/frame/include/old/bli_kernel_prototypes.h deleted file mode 100644 index 333b2c578..000000000 --- a/frame/include/old/bli_kernel_prototypes.h +++ /dev/null @@ -1,529 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_KERNEL_PROTOTYPES_H -#define BLIS_KERNEL_PROTOTYPES_H - - -// -- Define PASTEMAC-friendly kernel function name macros --------------------- - -// -// Level-3 -// - -// gemm micro-kernels - -#define bli_sGEMM_UKERNEL BLIS_SGEMM_UKERNEL -#define bli_dGEMM_UKERNEL BLIS_DGEMM_UKERNEL -#define bli_cGEMM_UKERNEL BLIS_CGEMM_UKERNEL -#define bli_zGEMM_UKERNEL BLIS_ZGEMM_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a, \ - ctype* b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMM_UKERNEL ) - -// gemmtrsm_l micro-kernels - -#define bli_sGEMMTRSM_L_UKERNEL BLIS_SGEMMTRSM_L_UKERNEL -#define bli_dGEMMTRSM_L_UKERNEL BLIS_DGEMMTRSM_L_UKERNEL -#define bli_cGEMMTRSM_L_UKERNEL BLIS_CGEMMTRSM_L_UKERNEL -#define bli_zGEMMTRSM_L_UKERNEL BLIS_ZGEMMTRSM_L_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a10, \ - ctype* a11, \ - ctype* b01, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - 
-INSERT_GENTPROT_BASIC( GEMMTRSM_L_UKERNEL ) - -// gemmtrsm_u micro-kernels - -#define bli_sGEMMTRSM_U_UKERNEL BLIS_SGEMMTRSM_U_UKERNEL -#define bli_dGEMMTRSM_U_UKERNEL BLIS_DGEMMTRSM_U_UKERNEL -#define bli_cGEMMTRSM_U_UKERNEL BLIS_CGEMMTRSM_U_UKERNEL -#define bli_zGEMMTRSM_U_UKERNEL BLIS_ZGEMMTRSM_U_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t k, \ - ctype* alpha, \ - ctype* a12, \ - ctype* a11, \ - ctype* b21, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( GEMMTRSM_U_UKERNEL ) - -// trsm_l micro-kernels - -#define bli_sTRSM_L_UKERNEL BLIS_STRSM_L_UKERNEL -#define bli_dTRSM_L_UKERNEL BLIS_DTRSM_L_UKERNEL -#define bli_cTRSM_L_UKERNEL BLIS_CTRSM_L_UKERNEL -#define bli_zTRSM_L_UKERNEL BLIS_ZTRSM_L_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - ctype* a11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( TRSM_L_UKERNEL ) - -// trsm_u micro-kernels - -#define bli_sTRSM_U_UKERNEL BLIS_STRSM_U_UKERNEL -#define bli_dTRSM_U_UKERNEL BLIS_DTRSM_U_UKERNEL -#define bli_cTRSM_U_UKERNEL BLIS_CTRSM_U_UKERNEL -#define bli_zTRSM_U_UKERNEL BLIS_ZTRSM_U_UKERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - ctype* a11, \ - ctype* b11, \ - ctype* c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( TRSM_U_UKERNEL ) - - -// -// Level-1m -// - -// NOTE: We don't need any PASTEMAC-friendly aliases to packm kernel -// macros because they are used directly in the initialization of the -// function pointer array, rather than via a templatizing wrapper macro. 
- - -// -// Level-1f -// - -// axpy2v kernels - -#define bli_sssAXPY2V_KERNEL BLIS_SAXPY2V_KERNEL -#define bli_dddAXPY2V_KERNEL BLIS_DAXPY2V_KERNEL -#define bli_cccAXPY2V_KERNEL BLIS_CAXPY2V_KERNEL -#define bli_zzzAXPY2V_KERNEL BLIS_ZAXPY2V_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chz,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_xy* alpha1, \ - ctype_xy* alpha2, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_z* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( AXPY2V_KERNEL ) - -// dotaxpyv kernels - -#define bli_sssDOTAXPYV_KERNEL BLIS_SDOTAXPYV_KERNEL -#define bli_dddDOTAXPYV_KERNEL BLIS_DDOTAXPYV_KERNEL -#define bli_cccDOTAXPYV_KERNEL BLIS_CDOTAXPYV_KERNEL -#define bli_zzzDOTAXPYV_KERNEL BLIS_ZDOTAXPYV_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_z, ctype_xy, chx, chy, chz, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chz,kername) \ - ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype_x* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_xy* rho, \ - ctype_z* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTAXPYV_KERNEL ) - -// axpyf kernels - -#define bli_sssAXPYF_KERNEL BLIS_SAXPYF_KERNEL -#define bli_dddAXPYF_KERNEL BLIS_DAXPYF_KERNEL -#define bli_cccAXPYF_KERNEL BLIS_CAXPYF_KERNEL -#define bli_zzzAXPYF_KERNEL BLIS_ZAXPYF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( AXPYF_KERNEL ) - -// dotxf kernels - -#define bli_sssDOTXF_KERNEL BLIS_SDOTXF_KERNEL -#define bli_dddDOTXF_KERNEL 
BLIS_DDOTXF_KERNEL -#define bli_cccDOTXF_KERNEL BLIS_CDOTXF_KERNEL -#define bli_zzzDOTXF_KERNEL BLIS_ZDOTXF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_x, ctype_y, ctype_ax, cha, chx, chy, chax, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ax* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_x* x, inc_t incx, \ - ctype_y* beta, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXF_KERNEL ) - -// dotxaxpyf kernels - -#define bli_sssDOTXAXPYF_KERNEL BLIS_SDOTXAXPYF_KERNEL -#define bli_dddDOTXAXPYF_KERNEL BLIS_DDOTXAXPYF_KERNEL -#define bli_cccDOTXAXPYF_KERNEL BLIS_CDOTXAXPYF_KERNEL -#define bli_zzzDOTXAXPYF_KERNEL BLIS_ZDOTXAXPYF_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_a, ctype_b, ctype_c, ctype_ab, cha, chb, chc, chab, kername ) \ -\ -void PASTEMAC3(cha,chb,chc,kername) \ - ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype_ab* alpha, \ - ctype_a* a, inc_t inca, inc_t lda, \ - ctype_b* w, inc_t incw, \ - ctype_b* x, inc_t incx, \ - ctype_c* beta, \ - ctype_c* y, inc_t incy, \ - ctype_c* z, inc_t incz \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXAXPYF_KERNEL ) - - -// -// Level-1v -// - -// addv kernels - -#define bli_ssADDV_KERNEL BLIS_SADDV_KERNEL -#define bli_ddADDV_KERNEL BLIS_DADDV_KERNEL -#define bli_ccADDV_KERNEL BLIS_CADDV_KERNEL -#define bli_zzADDV_KERNEL BLIS_ZADDV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( ADDV_KERNEL ) - -// axpyv kernels - -#define bli_sssAXPYV_KERNEL BLIS_SAXPYV_KERNEL -#define bli_dddAXPYV_KERNEL BLIS_DAXPYV_KERNEL -#define bli_cccAXPYV_KERNEL BLIS_CAXPYV_KERNEL -#define bli_zzzAXPYV_KERNEL BLIS_ZAXPYV_KERNEL - -#undef 
GENTPROT3 -#define GENTPROT3( ctype_a, ctype_x, ctype_y, cha, chx, chy, kername ) \ -\ -void PASTEMAC3(cha,chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_a* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( AXPYV_KERNEL ) - -// copyv kernels - -#define bli_ssCOPYV_KERNEL BLIS_SCOPYV_KERNEL -#define bli_ddCOPYV_KERNEL BLIS_DCOPYV_KERNEL -#define bli_ccCOPYV_KERNEL BLIS_CCOPYV_KERNEL -#define bli_zzCOPYV_KERNEL BLIS_ZCOPYV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( COPYV_KERNEL ) - -// dotv kernels - -#define bli_sssDOTV_KERNEL BLIS_SDOTV_KERNEL -#define bli_dddDOTV_KERNEL BLIS_DDOTV_KERNEL -#define bli_cccDOTV_KERNEL BLIS_CDOTV_KERNEL -#define bli_zzzDOTV_KERNEL BLIS_ZDOTV_KERNEL - -#undef GENTPROT3 -#define GENTPROT3( ctype_x, ctype_y, ctype_r, chx, chy, chr, kername ) \ -\ -void PASTEMAC3(chx,chy,chr,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_r* rho \ - ); - -INSERT_GENTPROT3_BASIC( DOTV_KERNEL ) - -// dotxv kernels - -#define bli_sssDOTXV_KERNEL BLIS_SDOTXV_KERNEL -#define bli_dddDOTXV_KERNEL BLIS_DDOTXV_KERNEL -#define bli_cccDOTXV_KERNEL BLIS_CDOTXV_KERNEL -#define bli_zzzDOTXV_KERNEL BLIS_ZDOTXV_KERNEL - -#undef GENTPROT3U12 -#define GENTPROT3U12( ctype_x, ctype_y, ctype_r, ctype_xy, chx, chy, chr, chxy, kername ) \ -\ -void PASTEMAC3(chx,chy,chr,kername) \ - ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype_xy* alpha, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy, \ - ctype_r* beta, \ - ctype_r* rho \ - ); - -INSERT_GENTPROT3U12_BASIC( DOTXV_KERNEL ) - -// invertv kernels - -#define bli_sINVERTV_KERNEL BLIS_SINVERTV_KERNEL -#define bli_dINVERTV_KERNEL BLIS_DINVERTV_KERNEL -#define 
bli_cINVERTV_KERNEL BLIS_CINVERTV_KERNEL -#define bli_zINVERTV_KERNEL BLIS_ZINVERTV_KERNEL - -#undef GENTPROT -#define GENTPROT( ctype, ch, kername ) \ -\ -void PASTEMAC(ch,kername) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx \ - ); - -INSERT_GENTPROT_BASIC( INVERTV_KERNEL ) - -// scal2v kernels - -#define bli_sssSCAL2V_KERNEL BLIS_SSCAL2V_KERNEL -#define bli_dddSCAL2V_KERNEL BLIS_DSCAL2V_KERNEL -#define bli_cccSCAL2V_KERNEL BLIS_CSCAL2V_KERNEL -#define bli_zzzSCAL2V_KERNEL BLIS_ZSCAL2V_KERNEL - -#undef GENTPROT3 -#define GENTPROT3( ctype_b, ctype_x, ctype_y, chb, chx, chy, kername ) \ -\ -void PASTEMAC3(chb,chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT3_BASIC( SCAL2V_KERNEL ) - -// scalv kernels - -#define bli_ssSCALV_KERNEL BLIS_SSCALV_KERNEL -#define bli_ddSCALV_KERNEL BLIS_DSCALV_KERNEL -#define bli_ccSCALV_KERNEL BLIS_CSCALV_KERNEL -#define bli_zzSCALV_KERNEL BLIS_ZSCALV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ -\ -void PASTEMAC2(chb,chx,kername) \ - ( \ - conj_t conjbeta, \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx \ - ); - -INSERT_GENTPROT2_BASIC( SCALV_KERNEL ) - -// setv kernels - -#define bli_ssSETV_KERNEL BLIS_SSETV_KERNEL -#define bli_ddSETV_KERNEL BLIS_DSETV_KERNEL -#define bli_ccSETV_KERNEL BLIS_CSETV_KERNEL -#define bli_zzSETV_KERNEL BLIS_ZSETV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_b, ctype_x, chb, chx, kername ) \ -\ -void PASTEMAC2(chb,chx,kername) \ - ( \ - dim_t n, \ - ctype_b* beta, \ - ctype_x* x, inc_t incx \ - ); - -INSERT_GENTPROT2_BASIC( SETV_KERNEL ) - -// subv kernels - -#define bli_ssSUBV_KERNEL BLIS_SSUBV_KERNEL -#define bli_ddSUBV_KERNEL BLIS_DSUBV_KERNEL -#define bli_ccSUBV_KERNEL BLIS_CSUBV_KERNEL -#define bli_zzSUBV_KERNEL BLIS_ZSUBV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void 
PASTEMAC2(chx,chy,kername) \ - ( \ - conj_t conjx, \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( SUBV_KERNEL ) - -// swapv kernels - -#define bli_ssSWAPV_KERNEL BLIS_SSWAPV_KERNEL -#define bli_ddSWAPV_KERNEL BLIS_DSWAPV_KERNEL -#define bli_ccSWAPV_KERNEL BLIS_CSWAPV_KERNEL -#define bli_zzSWAPV_KERNEL BLIS_ZSWAPV_KERNEL - -#undef GENTPROT2 -#define GENTPROT2( ctype_x, ctype_y, chx, chy, kername ) \ -\ -void PASTEMAC2(chx,chy,kername) \ - ( \ - dim_t n, \ - ctype_x* x, inc_t incx, \ - ctype_y* y, inc_t incy \ - ); - -INSERT_GENTPROT2_BASIC( SWAPV_KERNEL ) - - - -#endif - diff --git a/frame/include/old/bli_kernel_type_defs.h b/frame/include/old/bli_kernel_type_defs.h deleted file mode 100644 index e0190fe1b..000000000 --- a/frame/include/old/bli_kernel_type_defs.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_KERNEL_TYPE_DEFS_H -#define BLIS_KERNEL_TYPE_DEFS_H - - -// -// -- BLIS kernel types -------------------------------------------------------- -// - -// Here we generate typedef statements that generate custom types for -// kernel function pointers. Note that we use the function -// prototype-generating macro since it takes the same arguments we need -// to define our types. - -// -- Level-3 kernels -- - -/* -// gemm - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemm_ukr_t ) - - -// trsm_l/u - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( trsm_ukr_t ) - - -// gemmtrsm_l/u - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* data \ - ); - -INSERT_GENTPROT_BASIC( gemmtrsm_ukr_t ) - -// -- packm kernels -- - -// 
packm_struc_cxk - -#undef GENTPROT -#define GENTPROT( ctype, ch, tname ) \ -\ -typedef void \ -(*PASTECH(ch,tname))( \ - struc_t strucc, \ - doff_t diagoffc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool_t invdiag, \ - dim_t m_panel, \ - dim_t n_panel, \ - dim_t m_panel_max, \ - dim_t n_panel_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - inc_t is_p \ - ); - -INSERT_GENTPROT_BASIC( packm_ker_t ) -*/ - - - -#endif - diff --git a/frame/ind/oapi/bli_l3_3m4m_oapi.c b/frame/ind/oapi/bli_l3_3m4m_oapi.c index 04f2259d2..40348e627 100644 --- a/frame/ind/oapi/bli_l3_3m4m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m_oapi.c @@ -34,12 +34,6 @@ #include "blis.h" -// Bring control trees into scope. -extern gemm_t* gemm_cntl; -extern trsm_t* trsm_l_cntl; -extern trsm_t* trsm_r_cntl; - - // -- gemm/her2k/syr2k --------------------------------------------------------- #undef GENFRONT @@ -81,10 +75,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, b, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( alpha, a, b, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -161,10 +154,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. 
*/ \ @@ -239,10 +231,9 @@ void PASTEMAC(opname,imeth) \ stage. */ \ if ( i > 0 ) beta_use = &BLIS_ONE; \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( alpha, a, beta_use, c, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( alpha, a, beta_use, c, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -302,10 +293,9 @@ void PASTEMAC(opname,imeth) \ /* Prepare the context for the ith stage of computation. */ \ PASTEMAC2(cname,imeth,_cntx_stage)( i, cntx_p ); \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, \ - PASTECH(cname,_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -353,11 +343,9 @@ void PASTEMAC(opname,imeth) \ /* NOTE: trsm cannot be implemented via any induced method that needs to execute in stages (e.g. 3mh, 4mh). */ \ \ - /* Invoke the operation's front end with the appropriate control + /* Invoke the operation's front end and request the default control tree. */ \ - PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, \ - PASTECH(cname,_l_cntl), \ - PASTECH(cname,_r_cntl) ); \ + PASTEMAC(opname,_front)( side, alpha, a, b, cntx_p, NULL ); \ } \ \ /* Finalize the local context if it was initialized here. */ \ @@ -373,10 +361,3 @@ GENFRONT( trsm, trsm, 3m1, 1 ) //GENFRONT( trmm, trsm, 4mb, 1 ) // Unimplementable. 
GENFRONT( trsm, trsm, 4m1, 1 ) - -// -// ----------------------------------------------------------------------------- -// ----------------------------------------------------------------------------- -// ----------------------------------------------------------------------------- -// - diff --git a/frame/ind/oapi/bli_l3_nat_oapi.c b/frame/ind/oapi/bli_l3_nat_oapi.c index 9038067c5..68b664d65 100644 --- a/frame/ind/oapi/bli_l3_nat_oapi.c +++ b/frame/ind/oapi/bli_l3_nat_oapi.c @@ -34,11 +34,6 @@ #include "blis.h" -// Bring control trees into scope. -extern gemm_t* gemm_cntl; -extern trsm_t* trsm_l_cntl; -extern trsm_t* trsm_r_cntl; - // NOTE: The function definitions in this file can be consolidated with the // definitions for the other induced methods. The only advantage of keeping // them separate is that it allows us to avoid the very small loop overhead @@ -69,8 +64,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - alpha, a, b, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + alpha, a, b, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -107,8 +101,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + side, alpha, a, b, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -143,8 +136,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - alpha, a, beta, c, cntx_p, \ - PASTECH(cname,_cntl) \ + alpha, a, beta, c, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -178,8 +170,7 @@ void PASTEMAC(opname,imeth) \ tree. */ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, cntx_p, \ - PASTECH(cname,_cntl) \ + side, alpha, a, b, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ @@ -212,9 +203,7 @@ void PASTEMAC(opname,imeth) \ tree. 
*/ \ PASTEMAC(opname,_front) \ ( \ - side, alpha, a, b, cntx_p, \ - PASTECH(cname,_l_cntl), \ - PASTECH(cname,_r_cntl) \ + side, alpha, a, b, cntx_p, NULL \ ); \ \ /* Finalize the local context if it was initialized here. */ \ diff --git a/frame/3/herk/old/bli_herk_blk_var2f.h b/frame/thread/bli_mutex.h similarity index 84% rename from frame/3/herk/old/bli_herk_blk_var2f.h rename to frame/thread/bli_mutex.h index f436a0082..5ccfebe63 100644 --- a/frame/3/herk/old/bli_herk_blk_var2f.h +++ b/frame/thread/bli_mutex.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -32,10 +33,17 @@ */ -void bli_herk_blk_var2f( obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - gemm_t* cntl, - herk_thrinfo_t* thread ); +#ifndef BLIS_MUTEX_H +#define BLIS_MUTEX_H + +// Include definitions (mostly mtx_t) specific to the method of +// multithreading. +#include "bli_mutex_single.h" +#include "bli_mutex_openmp.h" +#include "bli_mutex_pthreads.h" + +// Thread mutex prototypes. + + +#endif diff --git a/frame/thread/bli_mutex_openmp.h b/frame/thread/bli_mutex_openmp.h new file mode 100644 index 000000000..cb13df5d3 --- /dev/null +++ b/frame/thread/bli_mutex_openmp.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MUTEX_OPENMP_H +#define BLIS_MUTEX_OPENMP_H + +// Define mutex_t for situations when OpenMP multithreading is enabled. +#ifdef BLIS_ENABLE_OPENMP + +#include + +// Define mtx_t. +typedef struct mtx_s +{ + omp_lock_t mutex; +} mtx_t; + +// Define macros to operate on OpenMP-based mtx_t. 
+#define bli_mutex_init( mtx_p ) \ +{ \ + omp_init_lock( &((mtx_p)->mutex) ); \ +} +#define bli_mutex_finalize( mtx_p ) \ +{ \ + omp_destroy_lock( &((mtx_p)->mutex) ); \ +} + +#define bli_mutex_lock( mtx_p ) \ +{ \ + omp_set_lock( &((mtx_p)->mutex) ); \ +} +#define bli_mutex_unlock( mtx_p ) \ +{ \ + omp_unset_lock( &((mtx_p)->mutex) ); \ +} + +#endif + +#endif + diff --git a/frame/thread/bli_mutex_pthreads.h b/frame/thread/bli_mutex_pthreads.h new file mode 100644 index 000000000..328f9fd6b --- /dev/null +++ b/frame/thread/bli_mutex_pthreads.h @@ -0,0 +1,72 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MUTEX_PTHREADS_H +#define BLIS_MUTEX_PTHREADS_H + +// Define mutex_t for situations when POSIX multithreading is enabled. +#ifdef BLIS_ENABLE_PTHREADS + +#include + +// Define mtx_t. +typedef struct mtx_s +{ + pthread_mutex_t mutex; +} mtx_t; + +// Define macros to operate on pthread-based mtx_t. +#define bli_mutex_init( mtx_p ) \ +{ \ + pthread_mutex_init( &((mtx_p)->mutex), NULL ); \ +} +#define bli_mutex_finalize( mtx_p ) \ +{ \ + pthread_mutex_destroy( &((mtx_p)->mutex) ); \ +} + +#define bli_mutex_lock( mtx_p ) \ +{ \ + pthread_mutex_lock( &((mtx_p)->mutex) ); \ +} +#define bli_mutex_unlock( mtx_p ) \ +{ \ + pthread_mutex_unlock( &((mtx_p)->mutex) ); \ +} + +#endif + +#endif + diff --git a/frame/thread/bli_mutex_single.h b/frame/thread/bli_mutex_single.h new file mode 100644 index 000000000..26aefcc21 --- /dev/null +++ b/frame/thread/bli_mutex_single.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_MUTEX_SINGLE_H +#define BLIS_MUTEX_SINGLE_H + +// Define mtx_t for situations when multithreading is disabled. +#ifndef BLIS_ENABLE_MULTITHREADING + +// Define mtx_t. +typedef struct mtx_s +{ +} mtx_t; + +// Define macros to operate on pthread-based mtx_t. 
+#define bli_mutex_init( mtx_p ) \ +{ \ +} +#define bli_mutex_finalize( mtx_p ) \ +{ \ +} + +#define bli_mutex_lock( mtx_p ) \ +{ \ +} +#define bli_mutex_unlock( mtx_p ) \ +{ \ +} + +#endif + +#endif + diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 6b4d2de1a..593f8d7fa 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -41,6 +41,12 @@ #include "bli_thrcomm_openmp.h" #include "bli_thrcomm_pthreads.h" + +// thrcomm_t query (field only) + +#define bli_thrcomm_num_threads( comm ) ( (comm)->n_threads ) + + // Thread communicator prototypes. thrcomm_t* bli_thrcomm_create( dim_t n_threads ); void bli_thrcomm_free( thrcomm_t* communicator ); diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 04f0c34a8..0882d1659 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -199,23 +199,42 @@ void bli_thrcomm_tree_barrier( barrier_t* barack ) #endif +//#define PRINT_THRINFO + void bli_l3_thread_decorator ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl ) { + // Query the total number of threads from the context. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); + + // Allcoate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + +#ifdef PRINT_THRINFO + thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) ); +#endif + _Pragma( "omp parallel num_threads(n_threads)" ) { - dim_t omp_id = omp_get_thread_num(); + dim_t id = omp_get_thread_num(); + + cntl_t* cntl_use; + thrinfo_t* thread; + + // Create a default control tree for the operation, if needed. 
+ bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); func ( @@ -225,10 +244,31 @@ void bli_l3_thread_decorator beta, c, cntx, - cntl, - thread[omp_id] + cntl_use, + thread ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + +#ifdef PRINT_THRINFO + threads[id] = thread; +#else + // Free the current thread's thrinfo_t structure. + bli_l3_thrinfo_free( thread ); +#endif } + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). + + +#ifdef PRINT_THRINFO + bli_l3_thrinfo_print_paths( threads ); + bli_l3_thrinfo_free_paths( threads ); + exit(1); +#endif } #endif diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index 42a9c6979..230b63905 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -80,7 +80,7 @@ void bli_thrcomm_init( thrcomm_t* communicator, dim_t n_threads) communicator->n_threads = n_threads; communicator->sense = 0; communicator->threads_arrived = 0; - + #ifdef BLIS_USE_PTHREAD_MUTEX pthread_mutex_init( &communicator->mutex, NULL ); #endif @@ -123,78 +123,122 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) #endif -void* thread_decorator_helper( void* data_void ); +void* bli_l3_thread_entry( void* data_void ); +// A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { - l3_int_t func; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - void* cntx; - void* cntl; - void* thread; + l3int_t func; + obj_t* alpha; + obj_t* a; + obj_t* b; + obj_t* beta; + obj_t* c; + cntx_t* cntx; + cntl_t* cntl; + dim_t id; + thrcomm_t* gl_comm; } thread_data_t; -void* thread_decorator_helper( void* data_void ) +// Entry point for additional threads +void* bli_l3_thread_entry( void* data_void ) { - thread_data_t* data = data_void; + thread_data_t* data = data_void; + + obj_t* alpha = data->alpha; + obj_t* a = data->a; + obj_t* b = data->b; + obj_t* beta = data->beta; + obj_t* c = data->c; + cntx_t* cntx = data->cntx; + cntl_t* cntl = data->cntl; + dim_t id = data->id; + thrcomm_t* gl_comm = data->gl_comm; + + cntl_t* cntl_use; + thrinfo_t* thread; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + + // Create the root node of the current thread's thrinfo_t structure. + bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); data->func ( - data->alpha, - data->a, - data->b, - data->beta, - data->c, - data->cntx, - data->cntl, - data->thread + alpha, + a, + b, + beta, + c, + cntx, + cntl_use, + thread ); + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure. 
+ bli_l3_thrinfo_free( thread ); + return NULL; } void bli_l3_thread_decorator ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl ) { - pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); - thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + // Query the total number of threads from the context. + dim_t n_threads = bli_cntx_get_num_threads( cntx ); - for ( int i = 1; i < n_threads; i++ ) + // Allocate an array of pthread objects and auxiliary data structs to pass + // to the thread entry functions. + pthread_t* pthreads = bli_malloc_intl( sizeof( pthread_t ) * n_threads ); + thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads ); + + // Allocate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + + // NOTE: We must iterate backwards so that the chief thread (thread id 0) + // can spawn all other threads before proceeding with its own computation. + for ( dim_t id = n_threads - 1; 0 <= id; id-- ) { - //Setup the thread data - datas[i].func = func; - datas[i].alpha = alpha; - datas[i].a = a; - datas[i].b = b; - datas[i].beta = beta; - datas[i].c = c; - datas[i].cntx = cntx; - datas[i].cntl = cntl; - datas[i].thread = thread[i]; + // Set up thread data for additional threads (beyond thread 0). + datas[id].func = func; + datas[id].alpha = alpha; + datas[id].a = a; + datas[id].b = b; + datas[id].beta = beta; + datas[id].c = c; + datas[id].cntx = cntx; + datas[id].cntl = cntl; + datas[id].id = id; + datas[id].gl_comm = gl_comm; - pthread_create( &pthreads[i], NULL, &thread_decorator_helper, &datas[i] ); + // Spawn additional threads for ids greater than 1. 
+ if ( id != 0 ) + pthread_create( &pthreads[id], NULL, &bli_l3_thread_entry, &datas[id] ); + else + bli_l3_thread_entry( ( void* )(&datas[0]) ); } - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called from the thread entry function). - for ( int i = 1; i < n_threads; i++) + // Thread 0 waits for additional threads to finish. + for ( dim_t id = 1; id < n_threads; id++ ) { - pthread_join( pthreads[i], NULL ); + pthread_join( pthreads[id], NULL ); } bli_free_intl( pthreads ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index fb2bc97bb..c038f59a0 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -36,24 +36,6 @@ #ifndef BLIS_ENABLE_MULTITHREADING -void bli_l3_thread_decorator - ( - dim_t n_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread - ) -{ - func( alpha, a, b, beta, c, cntx, cntl, thread[0] ); -} - - //Constructors and destructors for constructors thrcomm_t* bli_thrcomm_create( dim_t n_threads ) { @@ -89,5 +71,57 @@ void bli_thrcomm_barrier( thrcomm_t* communicator, dim_t t_id ) return; } +void bli_l3_thread_decorator + ( + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl + ) +{ + // For sequential execution, we use only one thread. + dim_t n_threads = 1; + dim_t id = 0; + + // Allcoate a global communicator for the root thrinfo_t structures. + thrcomm_t* gl_comm = bli_thrcomm_create( n_threads ); + + cntl_t* cntl_use; + thrinfo_t* thread; + + // Create a default control tree for the operation, if needed. + bli_l3_cntl_create_if( a, b, c, cntx, cntl, &cntl_use ); + + // Create the root node of the thread's thrinfo_t structure. 
+ bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); + + func + ( + alpha, + a, + b, + beta, + c, + cntx, + cntl_use, + thread + ); + + // Free the control tree, if one was created locally. + bli_l3_cntl_free_if( a, b, c, cntx, cntl, cntl_use, thread ); + + // Free the current thread's thrinfo_t structure. + bli_l3_thrinfo_free( thread ); + + // We shouldn't free the global communicator since it was already freed + // by the global communicator's chief thread in bli_l3_thrinfo_free() + // (called above). +} + + #endif diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 28cbab2c1..30614ff73 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -38,7 +38,6 @@ static bool_t bli_thread_is_init = FALSE; thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -thrinfo_t BLIS_HERK_SINGLE_THREADED = {}; thrcomm_t BLIS_SINGLE_COMM = {}; // ----------------------------------------------------------------------------- @@ -51,7 +50,6 @@ void bli_thread_init( void ) bli_thrcomm_init( &BLIS_SINGLE_COMM, 1 ); bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( &BLIS_HERK_SINGLE_THREADED ); // Mark API as initialized. 
bli_thread_is_init = TRUE; @@ -70,7 +68,7 @@ bool_t bli_thread_is_initialized( void ) // ----------------------------------------------------------------------------- -void bli_thread_get_range +void bli_thread_get_range_sub ( thrinfo_t* thread, dim_t n, @@ -80,8 +78,8 @@ void bli_thread_get_range dim_t* end ) { - dim_t n_way = thread->n_way; - dim_t work_id = thread->work_id; + dim_t n_way = bli_thread_n_way( thread ); + dim_t work_id = bli_thread_work_id( thread ); dim_t all_start = 0; dim_t all_end = n; @@ -224,8 +222,8 @@ siz_t bli_thread_get_range_l2r dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, n, bf, - FALSE, start, end ); + bli_thread_get_range_sub( thr, n, bf, + FALSE, start, end ); return m * ( *end - *start ); } @@ -243,8 +241,8 @@ siz_t bli_thread_get_range_r2l dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, n, bf, - TRUE, start, end ); + bli_thread_get_range_sub( thr, n, bf, + TRUE, start, end ); return m * ( *end - *start ); } @@ -262,8 +260,8 @@ siz_t bli_thread_get_range_t2b dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, m, bf, - FALSE, start, end ); + bli_thread_get_range_sub( thr, m, bf, + FALSE, start, end ); return n * ( *end - *start ); } @@ -281,12 +279,14 @@ siz_t bli_thread_get_range_b2t dim_t n = bli_obj_width_after_trans( *a ); dim_t bf = bli_blksz_get_def_for_obj( a, bmult ); - bli_thread_get_range( thr, m, bf, - TRUE, start, end ); + bli_thread_get_range_sub( thr, m, bf, + TRUE, start, end ); return n * ( *end - *start ); } +// ----------------------------------------------------------------------------- + dim_t bli_thread_get_range_width_l ( doff_t diagoff_j, @@ -496,7 +496,9 @@ siz_t bli_find_area_trap_l return ( siz_t )area; } -siz_t bli_thread_get_range_weighted +// 
----------------------------------------------------------------------------- + +siz_t bli_thread_get_range_weighted_sub ( thrinfo_t* thread, doff_t diagoff, @@ -509,8 +511,8 @@ siz_t bli_thread_get_range_weighted dim_t* j_end_thr ) { - dim_t n_way = thread->n_way; - dim_t my_id = thread->work_id; + dim_t n_way = bli_thread_n_way( thread ); + dim_t my_id = bli_thread_work_id( thread ); dim_t bf_left = n % bf; @@ -570,11 +572,15 @@ siz_t bli_thread_get_range_weighted { // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. - width_j = bli_thread_get_range_width_l( diagoff_j, m, n_left, - j, n_way, - bf, bf_left, - area_per_thr, - handle_edge_low ); + width_j = + bli_thread_get_range_width_l + ( + diagoff_j, m, n_left, + j, n_way, + bf, bf_left, + area_per_thr, + handle_edge_low + ); // If the current thread belongs to caucus j, this is his // subpartition. So we compute the implied index range and @@ -611,9 +617,12 @@ siz_t bli_thread_get_range_weighted bli_toggle_bool( handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. 
- area = bli_thread_get_range_weighted( thread, diagoff, uplo, m, n, bf, - handle_edge_low, - j_start_thr, j_end_thr ); + area = bli_thread_get_range_weighted_sub + ( + thread, diagoff, uplo, m, n, bf, + handle_edge_low, + j_start_thr, j_end_thr + ); // Reverse the indexing basis for the subpartition ranges so that // the indices, relative to left-to-right iteration through the @@ -626,6 +635,124 @@ siz_t bli_thread_get_range_weighted return area; } +siz_t bli_thread_get_range_mdim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + obj_t* b, + obj_t* c, + cntl_t* cntl, + cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntx_get_family( cntx ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. + if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( *a ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + obj_t* x; + bool_t use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. + // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). 
+ if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } + else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end ); + else + return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_t2b( thr, x, bmult, start, end ); + else + return bli_thread_get_range_b2t( thr, x, bmult, start, end ); + } +} + +siz_t bli_thread_get_range_ndim + ( + dir_t direct, + thrinfo_t* thr, + obj_t* a, + obj_t* b, + obj_t* c, + cntl_t* cntl, + cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntx_get_family( cntx ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. + if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( *b ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + obj_t* x; + bool_t use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. + // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). 
+ if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } + else if ( family == BLIS_HERK ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end ); + else + return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_get_range_l2r( thr, x, bmult, start, end ); + else + return bli_thread_get_range_r2l( thr, x, bmult, start, end ); + } +} + siz_t bli_thread_get_range_weighted_l2r ( thrinfo_t* thr, @@ -656,13 +783,20 @@ siz_t bli_thread_get_range_weighted_l2r bli_reflect_about_diag( diagoff, uplo, m, n ); } - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - FALSE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + FALSE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_l2r( thr, a, bmult, - start, end ); + area = bli_thread_get_range_l2r + ( + thr, a, bmult, + start, end + ); } return area; @@ -700,13 +834,20 @@ siz_t bli_thread_get_range_weighted_r2l bli_rotate180_trapezoid( diagoff, uplo ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - TRUE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + TRUE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_r2l( thr, a, bmult, - start, end ); + area = bli_thread_get_range_r2l + ( + thr, a, bmult, + start, end + ); } return area; @@ -744,13 +885,20 @@ siz_t bli_thread_get_range_weighted_t2b bli_reflect_about_diag( diagoff, uplo, m, n ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - FALSE, start, end ); + area = + bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + FALSE, start, end + ); 
} else // if dense or zeros { - area = bli_thread_get_range_t2b( thr, a, bmult, - start, end ); + area = bli_thread_get_range_t2b + ( + thr, a, bmult, + start, end + ); } return area; @@ -790,18 +938,25 @@ siz_t bli_thread_get_range_weighted_b2t bli_rotate180_trapezoid( diagoff, uplo ); - area = bli_thread_get_range_weighted( thr, diagoff, uplo, m, n, bf, - TRUE, start, end ); + area = bli_thread_get_range_weighted_sub + ( + thr, diagoff, uplo, m, n, bf, + TRUE, start, end + ); } else // if dense or zeros { - area = bli_thread_get_range_b2t( thr, a, bmult, - start, end ); + area = bli_thread_get_range_b2t + ( + thr, a, bmult, + start, end + ); } return area; } +// ----------------------------------------------------------------------------- // Some utilities dim_t bli_env_read_nway( const char* env ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index d29f6f96a..3d32872b5 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -5,6 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016 Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -35,22 +36,6 @@ #ifndef BLIS_THREAD_H #define BLIS_THREAD_H -// Perform a sanity check to make sure the user doesn't try to enable -// both OpenMP and pthreads. -#if defined ( BLIS_ENABLE_OPENMP ) && \ - defined ( BLIS_ENABLE_PTHREADS ) - #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." -#endif - -// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP -// or pthreads are enabled. This macro is useful in situations when -// we want to detect use of either OpenMP or pthreads (as opposed -// to neither being used). 
-#if defined ( BLIS_ENABLE_OPENMP ) || \ - defined ( BLIS_ENABLE_PTHREADS ) - #define BLIS_ENABLE_MULTITHREADING -#endif - // Include thread communicator (thrcomm_t) object definitions and prototypes. #include "bli_thrcomm.h" @@ -68,7 +53,7 @@ void bli_thread_finalize( void ); bool_t bli_thread_is_initialized( void ); // Thread range-related prototypes. -void bli_thread_get_range +void bli_thread_get_range_sub ( thrinfo_t* thread, dim_t n, @@ -78,6 +63,25 @@ void bli_thread_get_range dim_t* end ); +#undef GENPROT +#define GENPROT( opname ) \ +\ +siz_t PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + thrinfo_t* thr, \ + obj_t* a, \ + obj_t* b, \ + obj_t* c, \ + cntl_t* cntl, \ + cntx_t* cntx, \ + dim_t* start, \ + dim_t* end \ + ); + +GENPROT( thread_get_range_mdim ) +GENPROT( thread_get_range_ndim ) + #undef GENPROT #define GENPROT( opname ) \ \ @@ -119,7 +123,7 @@ siz_t bli_find_area_trap_l dim_t n, doff_t diagoff ); -siz_t bli_thread_get_range_weighted +siz_t bli_thread_get_range_weighted_sub ( thrinfo_t* thread, doff_t diagoff, @@ -135,31 +139,29 @@ siz_t bli_thread_get_range_weighted // Level-3 internal function type -typedef void (*l3_int_t) +typedef void (*l3int_t) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void* thread + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread ); // Level-3 thread decorator prototype void bli_l3_thread_decorator ( - dim_t num_threads, - l3_int_t func, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - void* cntx, - void* cntl, - void** thread + l3int_t func, + obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl ); // Miscellaneous prototypes diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index e47006954..bad5c2772 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -38,13 +38,10 @@ thrinfo_t* 
bli_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comm, + thrinfo_t* sub_node ) { thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) ); @@ -53,11 +50,9 @@ thrinfo_t* bli_thrinfo_create ( thread, ocomm, ocomm_id, - icomm, icomm_id, n_way, work_id, - opackm, - ipackm, - sub_self + free_comm, + sub_node ); return thread; @@ -68,25 +63,19 @@ void bli_thrinfo_init thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comm, + thrinfo_t* sub_node ) { - thread->ocomm = ocomm; - thread->ocomm_id = ocomm_id; - thread->icomm = icomm; - thread->icomm_id = icomm_id; - thread->n_way = n_way; - thread->work_id = work_id; + thread->ocomm = ocomm; + thread->ocomm_id = ocomm_id; + thread->n_way = n_way; + thread->work_id = work_id; + thread->free_comm = free_comm; - thread->opackm = opackm; - thread->ipackm = ipackm; - thread->sub_self = sub_self; + thread->sub_node = sub_node; } void bli_thrinfo_init_single @@ -98,40 +87,185 @@ void bli_thrinfo_init_single ( thread, &BLIS_SINGLE_COMM, 0, - &BLIS_SINGLE_COMM, 0, 1, 0, - &BLIS_PACKM_SINGLE_THREADED, - &BLIS_PACKM_SINGLE_THREADED, + FALSE, thread ); } -#if 0 -void bli_thrinfo_free +// ----------------------------------------------------------------------------- + +#include "assert.h" + +#define BLIS_NUM_STATIC_COMMS 18 + +thrinfo_t* bli_thrinfo_create_for_cntl ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_chl, + thrinfo_t* thread_par + ) +{ + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; + thrcomm_t** new_comms = NULL; + + thrinfo_t* thread_chl; + + bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); + + dim_t parent_nt_in = bli_thread_num_threads( thread_par ); + dim_t parent_n_way = bli_thread_n_way( thread_par ); + dim_t 
parent_comm_id = bli_thread_ocomm_id( thread_par ); + dim_t parent_work_id = bli_thread_work_id( thread_par ); + + dim_t child_nt_in; + dim_t child_comm_id; + dim_t child_n_way; + dim_t child_work_id; + + // Sanity check: make sure the number of threads in the parent's + // communicator is divisible by the number of new sub-groups. + assert( parent_nt_in % parent_n_way == 0 ); + + // Compute: + // - the number of threads inside the new child comm, + // - the current thread's id within the new communicator, + // - the current thread's work id, given the ways of parallelism + // to be obtained within the next loop. + child_nt_in = bli_cntx_get_num_threads_in( cntx, cntl_chl ); + child_n_way = bli_cntx_way_for_bszid( bszid_chl, cntx ); + child_comm_id = parent_comm_id % child_nt_in; + child_work_id = child_comm_id / ( child_nt_in / child_n_way ); + + // The parent's chief thread creates a temporary array of thrcomm_t + // pointers. + if ( bli_thread_am_ochief( thread_par ) ) + { + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ) ); + else + new_comms = static_comms; + } + + // Broadcast the temporary array to all threads in the parent's + // communicator. + new_comms = bli_thread_obroadcast( thread_par, new_comms ); + + // Chiefs in the child communicator allocate the communicator + // object and store it in the array element corresponding to the + // parent's work id. + if ( child_comm_id == 0 ) + new_comms[ parent_work_id ] = bli_thrcomm_create( child_nt_in ); + + bli_thread_obarrier( thread_par ); + + // All threads create a new thrinfo_t node using the communicator + // that was created by their chief, as identified by parent_work_id. + thread_chl = bli_thrinfo_create + ( + new_comms[ parent_work_id ], + child_comm_id, + child_n_way, + child_work_id, + TRUE, + NULL + ); + + bli_thread_obarrier( thread_par ); + + // The parent's chief thread frees the temporary array of thrcomm_t + // pointers. 
+ if ( bli_thread_am_ochief( thread_par ) ) + { + if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) + bli_free_intl( new_comms ); + } + + return thread_chl; +} + +void bli_thrinfo_grow + ( + cntx_t* cntx, + cntl_t* cntl, thrinfo_t* thread ) { - if ( thread == NULL || - thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_HERK_SINGLE_THREADED || - thread == &BLIS_PACKM_SINGLE_THREADED - ) return; + // If the sub-node of the thrinfo_t object is non-NULL, we don't + // need to create it, and will just use the existing sub-node as-is. + if ( bli_thrinfo_sub_node( thread ) != NULL ) return; - // Free Communicators - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( thread->ocomm ); - if ( bli_thrinfo_sub_self( thread ) == NULL && bli_thread_am_ichief( thread ) ) - bli_thrcomm_free( thread->icomm ); + // Create a new node (or, if needed, multiple nodes) and return the + // pointer to the (eldest) child. + thrinfo_t* thread_child = bli_thrinfo_rgrow + ( + cntx, + cntl, + bli_cntl_sub_node( cntl ), + thread + ); - // Free thrinfo chidren - bli_packm_thrinfo_free( thread->opackm ); - bli_packm_thrinfo_free( thread->ipackm ); - bli_l3_thrinfo_free( thread->sub_self ); - bli_free_intl( thread ); - - return; + // Attach the child thrinfo_t node to its parent structure. + bli_thrinfo_set_sub_node( thread_child, thread ); +} + +thrinfo_t* bli_thrinfo_rgrow + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_cur, + thrinfo_t* thread_par + ) +{ + thrinfo_t* thread_cur; + + // We must handle two cases: those where the next node in the + // control tree is a partitioning node, and those where it is + // a non-partitioning (ie: packing) node. + if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) + { + // Create the child thrinfo_t node corresponding to cntl_cur, + // with cntl_par being the parent. 
+ thread_cur = bli_thrinfo_create_for_cntl + ( + cntx, + cntl_par, + cntl_cur, + thread_par + ); + } + else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) + { + // Recursively grow the thread structure and return the top-most + // thrinfo_t node of that segment. + thrinfo_t* thread_seg = bli_thrinfo_rgrow + ( + cntx, + cntl_par, + bli_cntl_sub_node( cntl_cur ), + thread_par + ); + + // Create a thrinfo_t node corresponding to cntl_cur. Notice that + // the free_comm field is set to FALSE, since cntl_cur is a + // non-partitioning node. The communicator used here will be + // freed when thread_seg, or one of its descendents, is freed. + thread_cur = bli_thrinfo_create + ( + bli_thrinfo_ocomm( thread_seg ), + bli_thread_ocomm_id( thread_seg ), + bli_cntx_get_num_threads_in( cntx, cntl_cur ), + bli_thread_ocomm_id( thread_seg ), + FALSE, + thread_seg + ); + + // Attach the child thrinfo_t node to its parent structure. + bli_thrinfo_set_sub_node( thread_cur, thread_par ); + } + + return thread_cur; } -#endif diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 3f8a3112b..93bf19e50 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -45,45 +45,59 @@ struct thrinfo_s // Our thread id within the ocomm thread communicator. dim_t ocomm_id; - // The thread communicator for the other threads sharing the same work - // at this level. - thrcomm_t* icomm; - - // Our thread id within the icomm thread communicator. - dim_t icomm_id; - // The number of distinct threads used to parallelize the loop. dim_t n_way; // What we're working on. dim_t work_id; - struct thrinfo_s* opackm; - struct thrinfo_s* ipackm; - struct thrinfo_s* sub_self; + // When freeing, should the communicators in this node be freed? Usually, + // this is field is true, but when nodes are created that share the same + // communicators as other nodes (such as with packm nodes), this is set + // to false. 
+ bool_t free_comm; + + struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; - -#define bli_thread_num_threads( t ) ( t->ocomm->n_threads ) - -#define bli_thread_n_way( t ) ( t->n_way ) -#define bli_thread_work_id( t ) ( t->work_id ) -#define bli_thread_am_ochief( t ) ( t->ocomm_id == 0 ) -#define bli_thread_am_ichief( t ) ( t->icomm_id == 0 ) - -#define bli_thread_obroadcast( t, ptr ) bli_thrcomm_bcast( t->ocomm, t->ocomm_id, ptr ) -#define bli_thread_ibroadcast( t, ptr ) bli_thrcomm_bcast( t->icomm, t->icomm_id, ptr ) -#define bli_thread_obarrier( t ) bli_thrcomm_barrier( t->ocomm, t->ocomm_id ) -#define bli_thread_ibarrier( t ) bli_thrcomm_barrier( t->icomm, t->icomm_id ) - // -// Generic accessor macros for all thrinfo_t objects. +// thrinfo_t macros +// NOTE: The naming of these should be made consistent at some point. +// (ie: bli_thrinfo_ vs. bli_thread_) // -#define bli_thrinfo_sub_opackm( t ) ( t->opackm ) -#define bli_thrinfo_sub_ipackm( t ) ( t->ipackm ) -#define bli_thrinfo_sub_self( t ) ( t->sub_self ) +// thrinfo_t query (field only) + +#define bli_thread_num_threads( t ) ( (t)->ocomm->n_threads ) + +#define bli_thread_n_way( t ) ( (t)->n_way ) +#define bli_thread_work_id( t ) ( (t)->work_id ) +#define bli_thread_ocomm_id( t ) ( (t)->ocomm_id ) + +#define bli_thrinfo_ocomm( t ) ( (t)->ocomm ) +#define bli_thrinfo_needs_free_comm( t ) ( (t)->free_comm ) + +#define bli_thrinfo_sub_node( t ) ( (t)->sub_node ) + +// thrinfo_t query (complex) + +#define bli_thread_am_ochief( t ) ( (t)->ocomm_id == 0 ) + +// thrinfo_t modification + +#define bli_thrinfo_set_sub_node( _sub_node, thread ) \ +{ \ + (thread)->sub_node = _sub_node; \ +} + +// other thrinfo_t-related macros + +#define bli_thread_obroadcast( t, p ) bli_thrcomm_bcast( (t)->ocomm, \ + (t)->ocomm_id, p ) +#define bli_thread_obarrier( t ) bli_thrcomm_barrier( (t)->ocomm, \ + (t)->ocomm_id ) + // // Prototypes for level-3 thrinfo functions not specific to any operation. 
@@ -93,13 +107,10 @@ thrinfo_t* bli_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comm, + thrinfo_t* sub_node ); void bli_thrinfo_init @@ -107,13 +118,10 @@ void bli_thrinfo_init thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, - thrcomm_t* icomm, - dim_t icomm_id, dim_t n_way, dim_t work_id, - thrinfo_t* opackm, - thrinfo_t* ipackm, - thrinfo_t* sub_self + bool_t free_comm, + thrinfo_t* sub_node ); void bli_thrinfo_init_single @@ -121,9 +129,29 @@ void bli_thrinfo_init_single thrinfo_t* thread ); -void bli_thrinfo_free +// ----------------------------------------------------------------------------- + +thrinfo_t* bli_thrinfo_create_for_cntl ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_chl, + thrinfo_t* thread_par + ); + +void bli_thrinfo_grow + ( + cntx_t* cntx, + cntl_t* cntl, thrinfo_t* thread ); +thrinfo_t* bli_thrinfo_rgrow + ( + cntx_t* cntx, + cntl_t* cntl_par, + cntl_t* cntl_cur, + thrinfo_t* thread_par + ); + #endif diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index 760e869b8..7a471995d 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -38,21 +38,6 @@ // Define object-based check functions. // -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* index \ - ) \ -{ \ - bli_utilv_xi_check( x, index ); \ -} - -GENFRONT( amaxv ) - - #undef GENFRONT #define GENFRONT( opname ) \ \ @@ -172,42 +157,6 @@ GENFRONT( sumsqv ) // ----------------------------------------------------------------------------- -void bli_utilv_xi_check - ( - obj_t* x, - obj_t* index - ) -{ - err_t e_val; - - // Check object datatypes. 
- - e_val = bli_check_floating_object( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_integer_object( index ); - bli_check_error_code( e_val ); - - e_val = bli_check_nonconstant_object( index ); - bli_check_error_code( e_val ); - - // Check object dimensions. - - e_val = bli_check_vector_object( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_scalar_object( index ); - bli_check_error_code( e_val ); - - // Check object buffers (for non-NULLness). - - e_val = bli_check_object_buffer( x ); - bli_check_error_code( e_val ); - - e_val = bli_check_object_buffer( index ); - bli_check_error_code( e_val ); -} - void bli_utilv_xa_check ( obj_t* x, diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index 0fb23bccd..364ab5923 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -37,18 +37,6 @@ // Prototype object-based check functions. // -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,_check) \ - ( \ - obj_t* x, \ - obj_t* index \ - ); - -GENPROT( amaxv ) - - #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index abac92b26..2942616c1 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -40,44 +40,6 @@ // Define object-based interfaces. // -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_CNTX_PARAM \ - ) \ -{ \ - BLIS_OAPI_CNTX_DECL \ -\ - num_t dt = bli_obj_datatype( *x ); \ -\ - dim_t n = bli_obj_vector_dim( *x ); \ - void* buf_x = bli_obj_buffer_at_off( *x ); \ - inc_t incx = bli_obj_vector_inc( *x ); \ -\ - void* buf_index = bli_obj_buffer_at_off( *index ); \ -\ - if ( bli_error_checking_is_enabled() ) \ - PASTEMAC(opname,_check)( x, index ); \ -\ - /* Invoke the typed function. 
*/ \ - bli_call_ft_5 \ - ( \ - dt, \ - opname, \ - n, \ - buf_x, incx, \ - buf_index, \ - cntx \ - ); \ -} - -GENFRONT( amaxv ) - - #undef GENFRONT #define GENFRONT( opname ) \ \ diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 9de0afadb..f669271fa 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -37,19 +37,6 @@ // Prototype object-based interfaces. // -#undef GENPROT -#define GENPROT( opname ) \ -\ -void PASTEMAC(opname,EX_SUF) \ - ( \ - obj_t* x, \ - obj_t* index \ - BLIS_OAPI_CNTX_PARAM \ - ); - -GENPROT( amaxv ) - - #undef GENPROT #define GENPROT( opname ) \ \ diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index 8fa89d9ae..ad2bb0b40 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -38,50 +38,6 @@ // Define BLAS-like interfaces with typed operands. // -#undef GENTFUNCI -#define GENTFUNCI( ctype, ctype_i, ch, chi, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* index, \ - cntx_t* cntx \ - ) \ -{ \ - cntx_t* cntx_p = cntx; \ -\ - /* If the vector length is zero, set the index to zero and return - early. This directly emulatess the behavior of netlib LAPACK's - i?amax() routines. */ \ - if ( bli_zero_dim1( n ) ) \ - { \ - ctype_i* zero_i = PASTEMAC(chi,0); \ -\ - PASTEMAC(chi,copys)( *zero_i, *index ); \ - return; \ - } \ -\ - /* Initialize a local context if the given context is NULL. */ \ - /*bli_cntx_init_local_if( opname, cntx, cntx_p );*/ \ -\ - /* Invoke the helper variant, which loops over the appropriate kernel - to implement the current operation. */ \ - PASTEMAC2(ch,opname,_unb_var1) \ - ( \ - n, \ - x, incx, \ - index, \ - cntx_p \ - ); \ -\ - /* Finalize the context if it was initialized locally. 
*/ \ - /*bli_cntx_finalize_local_if( opname, cntx );*/ \ -} - -INSERT_GENTFUNCI_BASIC0( amaxv ) - - #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index e7dbc73e7..1f3d48a7c 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -37,20 +37,6 @@ // Prototype BLAS-like interfaces with typed operands. // -#undef GENTPROTI -#define GENTPROTI( ctype, ctype_i, ch, chi, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* index, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTI_BASIC( amaxv ) - - #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 1ed142a7c..73f17ba29 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -38,71 +38,6 @@ // Define BLAS-like interfaces with typed operands. // -#undef GENTFUNCRI -#define GENTFUNCRI( ctype, ctype_r, ctype_i, ch, chr, chi, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* abmax_i, \ - cntx_t* cntx \ - ) \ -{ \ - ctype_r* minus_one = PASTEMAC(chr,m1); \ - ctype_i* zero_i = PASTEMAC(chi,0); \ -\ - ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r abs_chi1; \ - ctype_r abs_chi1_max; \ - ctype_i i_max; \ - dim_t i; \ -\ - /* Initialize the index of the maximum absolute value to zero. */ \ - PASTEMAC(chi,copys)( *zero_i, i_max ); \ -\ - /* Initialize the maximum absolute value search candidate with - -1, which is guaranteed to be less than all values we will - compute. */ \ - PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ -\ - for ( i = 0; i < n; ++i ) \ - { \ - chi1 = x + (i )*incx; \ -\ - /* Get the real and imaginary components of chi1. */ \ - PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ -\ - /* Replace chi1_r and chi1_i with their absolute values. 
*/ \ - PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ - PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ -\ - /* Add the real and imaginary absolute values together. */ \ - PASTEMAC(chr,set0s)( abs_chi1 ); \ - PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ - PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ -\ - /* If the absolute value of the current element exceeds that of - the previous largest, save it and its index. If NaN is - encountered, then treat it the same as if it were a valid - value that was smaller than any previously seen. This - behavior mimics that of LAPACK's ?lange(). */ \ - if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ - { \ - PASTEMAC(chr,copys)( abs_chi1, abs_chi1_max ); \ - PASTEMAC(chi,copys)( i, i_max ); \ - } \ - } \ -\ - /* Store final index to output variable. */ \ - PASTEMAC(chi,copys)( i_max, *abmax_i ); \ -} - -INSERT_GENTFUNCRI_BASIC0( amaxv_unb_var1 ) - - #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index 369f5f650..09ca31d76 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -37,20 +37,6 @@ // Prototype BLAS-like interfaces with typed operands. 
// -#undef GENTPROTRI -#define GENTPROTRI( ctype, ctype_r, ctype_i, ch, chr, chi, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_i* abmax_i, \ - cntx_t* cntx \ - ); - -INSERT_GENTPROTRI_BASIC( amaxv_unb_var1 ) - - #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ diff --git a/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c b/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c index 4aad807d2..fc7f750b4 100644 --- a/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c +++ b/kernels/x86_64/bulldozer/3/bli_gemm_asm_d4x6_fma4.c @@ -763,7 +763,7 @@ void bli_sgemm_asm_8x8_fma4 #undef KERNEL4x6_4 #define KERNEL4x6_1(xx) \ - ".align 4 \n\t"\ + ".p2align 2 \n\t"\ "vmovddup -8 * 8(%%rax), %%xmm0 \n\t"\ "vfmaddpd %%xmm4, %%xmm1, %%xmm0, %%xmm4 \n\t"\ "vfmaddpd %%xmm5, %%xmm2, %%xmm0, %%xmm5 \n\t"\ @@ -888,7 +888,7 @@ void bli_dgemm_asm_4x6_fma4 "testq %%rsi, %%rsi \n\t" "je .CONSIDERKLEFT \n\t" " \n\t" - ".align 32 \n\t" + ".p2align 5 \n\t" ".LOOPKITER: \n\t" // MAIN LOOP " \n\t" KERNEL4x6_1(xx) diff --git a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c index bc06c819b..bee1df996 100644 --- a/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c +++ b/kernels/x86_64/haswell/3/bli_gemm_asm_d6x8.c @@ -249,7 +249,7 @@ void bli_sgemm_asm_6x16 " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" + "prefetcht0 64 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" @@ -329,7 +329,7 @@ void bli_sgemm_asm_6x16 "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. "jz .SROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" @@ -474,8 +474,8 @@ void bli_sgemm_asm_6x16 " \n\t" " \n\t" ".SBETAZERO: \n\t" - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
+ " \n\t" + "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. "jz .SROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" @@ -622,6 +622,8 @@ void bli_sgemm_asm_6x16 } + + #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ @@ -910,7 +912,7 @@ void bli_dgemm_asm_6x8 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. "jz .DROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" @@ -1053,8 +1055,8 @@ void bli_dgemm_asm_6x8 " \n\t" " \n\t" ".DBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. "jz .DROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" @@ -1197,9 +1199,41 @@ void bli_dgemm_asm_6x8 ); } -#if 0 -void bli_cgemm_asm_ + + +// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
+// outputs to ymm0 +#define CGEMM_INPUT_SCALE_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm3, %%xmm3 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm3, %%xmm3 \n\t" \ + "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + +// assumes values to output are in ymm0 +#define CGEMM_OUTPUT_GS \ + "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi,1) \n\t" \ + "vmovlpd %%xmm3, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm3, (%%rcx,%%r13 ) \n\t" + +#define CGEMM_INPUT_SCALE_RS_BETA_NZ \ + "vmovups (%%rcx), %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" + +#define CGEMM_OUTPUT_RS \ + "vmovups %%ymm0, (%%rcx) \n\t" \ + +void bli_cgemm_asm_3x8 ( dim_t k, scomplex* restrict alpha, @@ -1214,14 +1248,515 @@ void bli_cgemm_asm_ //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - //dim_t k_iter = k / 4; - //dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. 
+ " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(scomplex) + " \n\t" + "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; + "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; + " \n\t" + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".CLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 7 * 
4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 38 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + 
"vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 3 * 8, %%rax \n\t" // a += 4*3 (unroll x mr) + "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. 
+ " \n\t" + " \n\t" + ".CLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" + "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" + "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 3 * 8, %%rax \n\t" // a += 1*3 (unroll x mr) + "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".CPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" // permute even and odd elements + " \n\t" // of ymm6/7, ymm10/11, ymm14/15 + "vpermilps $0xb1, %%ymm6, %%ymm6 \n\t" + "vpermilps $0xb1, %%ymm7, %%ymm7 \n\t" + "vpermilps $0xb1, %%ymm10, %%ymm10 \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1, %%ymm14, %%ymm14 \n\t" + "vpermilps $0xb1, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" // subtract/add even/odd elements + "vaddsubps %%ymm6, %%ymm4, %%ymm4 \n\t" + "vaddsubps %%ymm7, %%ymm5, %%ymm5 \n\t" + " \n\t" + "vaddsubps %%ymm10, %%ymm8, %%ymm8 \n\t" + "vaddsubps %%ymm11, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vaddsubps %%ymm14, %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm15, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate + "vbroadcastss 4(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm4, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm4, %%ymm4 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm5, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm5, %%ymm5 \n\t" + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm8, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm9, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm9, %%ymm9 \n\t" + " \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm3, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm3 \n\t" + "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3
\n\t" + "vaddsubps %%ymm3, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastss (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate + "vbroadcastss 4(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(scomplex) + "leaq (,%%rsi,4), %%rdx \n\t" // rdx = 4*cs_c; + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomiss %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. + "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); + "vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. + "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); + "andb %%r8b, %%r9b \n\t" // clear ZF if r8b & r9b == 1. + "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case + " \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
+ "jz .CROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORED: \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".CROWSTORED: \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + CGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" + CGEMM_OUTPUT_RS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".CBETAZERO: \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .CROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".CGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 4*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + CGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .CDONE \n\t" // jump to end.
+ " \n\t" + " \n\t" + " \n\t" + ".CROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm8, (%%r11) \n\t" + "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm12, (%%r12) \n\t" + "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".CDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } -void bli_zgemm_asm_ + +// assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
+// outputs to ymm0 +#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ + "vmovupd (%%rcx), %%xmm0 \n\t" \ + "vmovupd (%%rcx,%%rsi), %%xmm3 \n\t" \ + "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + +// assumes values to output are in ymm0 +#define ZGEMM_OUTPUT_GS \ + "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ + "vmovupd %%xmm0, (%%rcx) \n\t" \ + "vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \ + +#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \ + "vmovups (%%rcx), %%ymm0 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" + +#define ZGEMM_OUTPUT_RS \ + "vmovupd %%ymm0, (%%rcx) \n\t" \ + +void bli_zgemm_asm_3x4 ( dim_t k, dcomplex* restrict alpha, @@ -1236,9 +1771,484 @@ void bli_zgemm_asm_ //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); - //dim_t k_iter = k / 4; - //dim_t k_left = k % 4; + uint64_t k_iter = k / 4; + uint64_t k_left = k % 4; + //uint64_t alpha_is_unit = bli_zeq1( *alpha ); + + + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. 
+ " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(dcomplex) + "leaq (,%%rdi,2), %%rdi \n\t" + " \n\t" + "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c; + "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c; + " \n\t" + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".ZLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 32 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastsd 6 * 8(%%rax), 
%%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 38 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd 
%%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 3 * 16, %%rax \n\t" // a += 4*3 (unroll x mr) + "addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .ZLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".ZCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. 
+ " \n\t" + " \n\t" + ".ZLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 32 * 16(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 3 * 16, %%rax \n\t" // a += 1*3 (unroll x mr) + "addq $1 * 4 * 16, %%rbx \n\t" // b += 1*4 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".ZPOSTACCUM: \n\t" + " \n\t" + " \n\t" // permute even and odd elements + " \n\t" // of ymm6/7, ymm10/11, ymm14/15 + "vpermilpd $0x5, %%ymm6, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm7, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5, %%ymm11, %%ymm11 \n\t" + "vpermilpd $0x5, %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x5, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" // subtract/add even/odd elements + "vaddsubpd %%ymm6, %%ymm4, %%ymm4 \n\t" + "vaddsubpd %%ymm7, %%ymm5, %%ymm5 \n\t" + " \n\t" + "vaddsubpd %%ymm10, %%ymm8, %%ymm8 \n\t" + "vaddsubpd %%ymm11, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vaddsubpd %%ymm14, %%ymm12, %%ymm12 \n\t" + "vaddsubpd %%ymm15, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate + "vbroadcastsd 8(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm4, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm4, %%ymm4 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm5, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm5, %%ymm5 \n\t" + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm8, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm9, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" + " \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm12, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm3, %%ymm12, %%ymm12 \n\t" + " \n\t" + "vpermilpd $0x5, %%ymm13, %%ymm3 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vaddsubpd
%%ymm3, %%ymm13, %%ymm13 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate + "vbroadcastsd 8(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(dcomplex) + "leaq (,%%rsi,2), %%rsi \n\t" + "leaq (,%%rsi,2), %%rdx \n\t" // rdx = 2*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. + "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); + "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. + "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); + "andb %%r8b, %%r9b \n\t" // clear ZF if r8b & r9b == 1. + "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case + " \n\t" + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16.
+ "jz .ZROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".ZGENSTORED: \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_GS_BETA_NZ + "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".ZROWSTORED: \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + ZGEMM_INPUT_SCALE_RS_BETA_NZ + "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" + ZGEMM_OUTPUT_RS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".ZBETAZERO: \n\t" + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16. + "jz .ZROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".ZGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c + " \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + "addq %%rdx, %%rcx \n\t" // c += 2*cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + ZGEMM_OUTPUT_GS + " \n\t" + " \n\t" + " \n\t" + "jmp .ZDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".ZROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm8, (%%r11) \n\t" + "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" + " \n\t" + "vmovups %%ymm12, (%%r12) \n\t" + "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".ZDONE: \n\t" + " \n\t" + + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } -#endif diff --git a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c index 0b017fbcd..f8db398ca 100644 --- a/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c +++ b/kernels/x86_64/sandybridge/3/bli_gemm_asm_d8x4.c @@ -414,23 +414,6 @@ void bli_sgemm_asm_8x8 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 4*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (4*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 
1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -438,10 +421,8 @@ void bli_sgemm_asm_8x8 "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SCOLSTORED \n\t" // jump to column storage case + "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. + "jz .SCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -740,52 +721,52 @@ void bli_sgemm_asm_8x8 ".SCOLSTORED: \n\t" " \n\t" " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70, + "vmovups (%%rcx), %%ymm0 \n\t" // load c00:c70, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c01:c71, + "vmovups (%%rcx), %%ymm1 \n\t" // load c01:c71, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72, + "vmovups (%%rcx), %%ymm0 \n\t" // load c02:c72, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c03:c73, + "vmovups (%%rcx), %%ymm1 \n\t" // load c03:c73, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c04:c74, + "vmovups (%%rcx), %%ymm0 \n\t" // load c04:c74, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c05:c75, + "vmovups (%%rcx), %%ymm1 \n\t" // load c05:c75, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c06:c76, + "vmovups (%%rcx), %%ymm0 \n\t" // load c06:c76, "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps (%%rcx), %%ymm1 \n\t" // load c07:c77, + "vmovups (%%rcx), %%ymm1 \n\t" // load c07:c77, "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta, "vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result, - "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm1, (%%rcx) \n\t" // and store back to memory. " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. 
@@ -794,17 +775,16 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" ".SBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .SCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. + "jz .SCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".SGENSTORBZ: \n\t" " \n\t" " \n\t" // update c00:c70 - "vmovapd %%ymm15, %%ymm0 \n\t" + "vmovups %%ymm15, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -826,7 +806,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c01:c71 - "vmovapd %%ymm14, %%ymm0 \n\t" + "vmovups %%ymm14, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -848,7 +828,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c02:c72 - "vmovapd %%ymm13, %%ymm0 \n\t" + "vmovups %%ymm13, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -870,7 +850,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c03:c73 - "vmovapd %%ymm12, %%ymm0 \n\t" + "vmovups %%ymm12, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -892,7 +872,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c04:c74 - "vmovapd %%ymm11, %%ymm0 \n\t" + "vmovups %%ymm11, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -914,7 +894,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c05:c75 - "vmovapd %%ymm10, %%ymm0 \n\t" + "vmovups %%ymm10, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -936,7 +916,7 @@ void bli_sgemm_asm_8x8 " \n\t"
" \n\t" " \n\t" // update c06:c76 - "vmovapd %%ymm9, %%ymm0 \n\t" + "vmovups %%ymm9, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -958,7 +938,7 @@ void bli_sgemm_asm_8x8 " \n\t" " \n\t" " \n\t" // update c07:c77 - "vmovapd %%ymm8, %%ymm0 \n\t" + "vmovups %%ymm8, %%ymm0 \n\t" "vextractf128 $1, %%ymm0, %%xmm2 \n\t" "vmovss %%xmm0, (%%rcx) \n\t" "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" @@ -983,28 +963,28 @@ void bli_sgemm_asm_8x8 ".SCOLSTORBZ: \n\t" " \n\t" " \n\t" - "vmovaps %%ymm15, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm15, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm14, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm14, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm13, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm13, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm12, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm12, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm11, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm11, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm10, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm10, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm9, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm9, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm8, (%%rcx) \n\t" // and store back to memory. + "vmovups %%ymm8, (%%rcx) \n\t" // and store back to memory. 
" \n\t" " \n\t" " \n\t" @@ -1378,23 +1358,6 @@ void bli_dgemm_asm_8x4 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -1402,10 +1365,8 @@ void bli_dgemm_asm_8x4 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DCOLSTORED \n\t" // jump to column storage case + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .DCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -1540,53 +1501,53 @@ void bli_dgemm_asm_8x4 ".DCOLSTORED: \n\t" " \n\t" // update c00:c33 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c00:c30, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c00:c30, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. 
"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c01:c31, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c01:c31, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c02:c32, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c02:c32, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c03:c33, + "vmovupd (%%rcx), %%ymm0 \n\t" // load c03:c33, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rcx) \n\t" // and store back to memory. " \n\t" " \n\t" // update c40:c73 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c40:c70, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c40:c70, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c41:c71, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c41:c71, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. 
"addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c42:c72, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c42:c72, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c43:c73, + "vmovupd (%%rdx), %%ymm0 \n\t" // load c43:c73, "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta, "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result, - "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory. + "vmovupd %%ymm0, (%%rdx) \n\t" // and store back to memory. " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. @@ -1595,10 +1556,9 @@ void bli_dgemm_asm_8x4 " \n\t" " \n\t" ".DBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
+ "jz .DCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -1669,29 +1629,29 @@ void bli_dgemm_asm_8x4 ".DCOLSTORBZ: \n\t" " \n\t" // update c00:c33 " \n\t" - "vmovapd %%ymm9, (%%rcx) \n\t" // store c00:c30 + "vmovupd %%ymm9, (%%rcx) \n\t" // store c00:c30 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm11, (%%rcx) \n\t" // store c01:c31 + "vmovupd %%ymm11, (%%rcx) \n\t" // store c01:c31 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" // store c02:c32 + "vmovupd %%ymm13, (%%rcx) \n\t" // store c02:c32 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm15, (%%rcx) \n\t" // store c03:c33 + "vmovupd %%ymm15, (%%rcx) \n\t" // store c03:c33 " \n\t" " \n\t" // update c40:c73 " \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" // store c40:c70 + "vmovupd %%ymm8, (%%rdx) \n\t" // store c40:c70 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm10, (%%rdx) \n\t" // store c41:c71 + "vmovupd %%ymm10, (%%rdx) \n\t" // store c41:c71 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm12, (%%rdx) \n\t" // store c42:c72 + "vmovupd %%ymm12, (%%rdx) \n\t" // store c42:c72 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm14, (%%rdx) \n\t" // store c43:c73 + "vmovupd %%ymm14, (%%rdx) \n\t" // store c43:c73 " \n\t" " \n\t" " \n\t" @@ -2260,23 +2220,6 @@ void bli_cgemm_asm_8x4 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 
1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -2288,10 +2231,8 @@ void bli_cgemm_asm_8x4 "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .CCOLSTORED \n\t" // jump to column storage case + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. + "jz .CCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -2459,90 +2400,90 @@ void bli_cgemm_asm_8x4 " \n\t" " \n\t" // update c00:c70 " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c00:c70 + "vmovups %%ymm0, (%%rcx) \n\t" // store c00:c70 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c80:cf0 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c80:cf0 + "vmovups %%ymm0, (%%rdx) \n\t" // store c80:cf0 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c00:c70 " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" //
scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c01:c71 + "vmovups %%ymm0, (%%rcx) \n\t" // store c01:c71 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c81:cf1 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c81:cf1 + "vmovups %%ymm0, (%%rdx) \n\t" // store c81:cf1 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c02:c72 " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c02:c72 + "vmovups %%ymm0, (%%rcx) \n\t" // store c02:c72 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c82:cf2 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c82:cf2 + "vmovups %%ymm0, (%%rdx) \n\t" // store c82:cf2 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c03:c73 
" \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0 + "vmovups (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rcx) \n\t" // store c03:c73 + "vmovups %%ymm0, (%%rcx) \n\t" // store c03:c73 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c83:cf3 " \n\t" - "vmovaps (%%rdx), %%ymm0 \n\t" // load c83:f3 into ymm0 + "vmovups (%%rdx), %%ymm0 \n\t" // load c83:f3 into ymm0 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovaps %%ymm0, (%%rdx) \n\t" // store c83:cf3 + "vmovups %%ymm0, (%%rdx) \n\t" // store c83:cf3 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" @@ -2552,11 +2493,9 @@ void bli_cgemm_asm_8x4 " \n\t" " \n\t" ".CBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .CCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
+ "jz .CCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -2643,28 +2582,28 @@ void bli_cgemm_asm_8x4 ".CCOLSTORBZ: \n\t" " \n\t" " \n\t" - "vmovaps %%ymm15, (%%rcx) \n\t" // store c00:c70 + "vmovups %%ymm15, (%%rcx) \n\t" // store c00:c70 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm14, (%%rdx) \n\t" // store c80:cf0 + "vmovups %%ymm14, (%%rdx) \n\t" // store c80:cf0 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm13, (%%rcx) \n\t" // store c01:c71 + "vmovups %%ymm13, (%%rcx) \n\t" // store c01:c71 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm12, (%%rdx) \n\t" // store c81:cf1 + "vmovups %%ymm12, (%%rdx) \n\t" // store c81:cf1 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm11, (%%rcx) \n\t" // store c02:c72 + "vmovups %%ymm11, (%%rcx) \n\t" // store c02:c72 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm10, (%%rdx) \n\t" // store c82:cf2 + "vmovups %%ymm10, (%%rdx) \n\t" // store c82:cf2 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm9, (%%rcx) \n\t" // store c03:c73 + "vmovups %%ymm9, (%%rcx) \n\t" // store c03:c73 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovaps %%ymm8, (%%rdx) \n\t" // store c83:cf3 + "vmovups %%ymm8, (%%rdx) \n\t" // store c83:cf3 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" @@ -3178,26 +3117,6 @@ void bli_zgemm_asm_4x4 "leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c; " \n\t" " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 16*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (16*cs_c) & 32 is zero. 
- "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. @@ -3209,10 +3128,8 @@ void bli_zgemm_asm_4x4 "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case " \n\t" " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .ZCOLSTORED \n\t" // jump to column storage case + "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. + "jz .ZCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -3345,90 +3262,90 @@ void bli_zgemm_asm_4x4 ".ZCOLSTORED: \n\t" " \n\t" // update c00:c30 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c00:c30 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c00:c30 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c00:c30 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c00:c30 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c40:c70 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c40:c70 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c40:c70 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c40:c70 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c40:c70 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c01:c31 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c01:c31 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c01:c31
into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c01:c31 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c01:c31 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c41:c71 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c41:c71 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c41:c71 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c41:c71 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c41:c71 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" " \n\t" // update c02:c32 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c02:c32 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c02:c32 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c02:c32 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c02:c32 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c42:c72 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c42:c72 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c42:c72 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c42:c72 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c42:c72 "addq %%rdi, %%rdx 
\n\t" // c += cs_c; " \n\t" " \n\t" // update c03:c33 " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" // load c03:c33 into ymm0 + "vmovupd (%%rcx), %%ymm0 \n\t" // load c03:c33 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rcx) \n\t" // store c03:c33 + "vmovupd %%ymm0, (%%rcx) \n\t" // store c03:c33 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" // update c43:c73 " \n\t" - "vmovapd (%%rdx), %%ymm0 \n\t" // load c43:c73 into ymm0 + "vmovupd (%%rdx), %%ymm0 \n\t" // load c43:c73 into ymm0 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t" "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t" "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0 - "vmovapd %%ymm0, (%%rdx) \n\t" // store c43:c73 + "vmovupd %%ymm0, (%%rdx) \n\t" // store c43:c73 " \n\t" " \n\t" " \n\t" @@ -3437,11 +3354,9 @@ void bli_zgemm_asm_4x4 " \n\t" " \n\t" ".ZBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .ZCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16.
+ "jz .ZCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" @@ -3510,28 +3425,28 @@ void bli_zgemm_asm_4x4 ".ZCOLSTORBZ: \n\t" " \n\t" " \n\t" - "vmovapd %%ymm15, (%%rcx) \n\t" // store c00:c30 + "vmovupd %%ymm15, (%%rcx) \n\t" // store c00:c30 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm14, (%%rdx) \n\t" // store c40:c70 + "vmovupd %%ymm14, (%%rdx) \n\t" // store c40:c70 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" // store c01:c31 + "vmovupd %%ymm13, (%%rcx) \n\t" // store c01:c31 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm12, (%%rdx) \n\t" // store c41:c71 + "vmovupd %%ymm12, (%%rdx) \n\t" // store c41:c71 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm11, (%%rcx) \n\t" // store c02:c32 + "vmovupd %%ymm11, (%%rcx) \n\t" // store c02:c32 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm10, (%%rdx) \n\t" // store c42:c72 + "vmovupd %%ymm10, (%%rdx) \n\t" // store c42:c72 "addq %%rdi, %%rdx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm9, (%%rcx) \n\t" // store c03:c33 + "vmovupd %%ymm9, (%%rcx) \n\t" // store c03:c33 "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" // store c43:c73 + "vmovupd %%ymm8, (%%rdx) \n\t" // store c43:c73 " \n\t" " \n\t" " \n\t" diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index 0a45266d9..c8e9ec5d5 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -79,14 +79,15 @@ int main( int argc, char** argv ) k_input = -1; #if 0 - extern blksz_t* gemm_kc; + num_t dt_real = bli_datatype_proj_to_real( DT ); + cntx_t cntx; - num_t dt_real = bli_datatype_proj_to_real( DT ); + bli_gemm_cntx_init( &cntx ); // Extract the kc blocksize for the requested datatype and its // real analogue. 
- dim_t kc = bli_blksz_get_def( dt, gemm_kc ); - dim_t kc_real = bli_blksz_get_def( dt_real, gemm_kc ); + dim_t kc = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); + dim_t kc_real = bli_cntx_get_blksz_def_dt( dt_real, BLIS_KC, &cntx ); // Assign the k dimension depending on which implementation is // being tested. Note that the BLIS_NAT case handles the real @@ -163,7 +164,7 @@ int main( int argc, char** argv ) bli_ind_enable_dt( IND, dt ); #endif - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_gemm.c b/test/test_gemm.c index 7d5ac6a9c..dd46b5237 100644 --- a/test/test_gemm.c +++ b/test/test_gemm.c @@ -127,7 +127,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_gemv.c b/test/test_gemv.c index 14aa3b87b..cab20d0de 100644 --- a/test/test_gemv.c +++ b/test/test_gemv.c @@ -107,7 +107,7 @@ int main( int argc, char** argv ) bli_copym( &y, &y_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_ger.c b/test/test_ger.c index a56da98d1..8564f1cfd 100644 --- a/test/test_ger.c +++ b/test/test_ger.c @@ -105,7 +105,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_hemm.c b/test/test_hemm.c index 3844f5623..03969893e 100644 --- a/test/test_hemm.c +++ b/test/test_hemm.c @@ -146,7 +146,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_hemv.c b/test/test_hemv.c index 81593e9f0..ab7ae941e 100644 --- a/test/test_hemv.c +++ b/test/test_hemv.c @@ -114,7 +114,7 @@ int main( int argc, char** argv ) bli_copym( &y, &y_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git 
a/test/test_her.c b/test/test_her.c index a3df5faa1..44cb91b6e 100644 --- a/test/test_her.c +++ b/test/test_her.c @@ -111,7 +111,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_her2.c b/test/test_her2.c index 1ee954c07..7bb27b106 100644 --- a/test/test_her2.c +++ b/test/test_her2.c @@ -112,7 +112,7 @@ int main( int argc, char** argv ) bli_copym( &a, &a_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_her2k.c b/test/test_her2k.c index 3f1de8bbf..0204051a1 100644 --- a/test/test_her2k.c +++ b/test/test_her2k.c @@ -137,7 +137,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_herk.c b/test/test_herk.c index bbad4e5d4..a3ac93adb 100644 --- a/test/test_herk.c +++ b/test/test_herk.c @@ -129,7 +129,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_trmm.c b/test/test_trmm.c index f75855923..e72028153 100644 --- a/test/test_trmm.c +++ b/test/test_trmm.c @@ -144,7 +144,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_trmv.c b/test/test_trmv.c index 2fdb49fdc..6eb089f2a 100644 --- a/test/test_trmv.c +++ b/test/test_trmv.c @@ -108,7 +108,7 @@ int main( int argc, char** argv ) bli_copym( &x, &x_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/test/test_trsm.c b/test/test_trsm.c index ba8cf3bb6..f23e4de12 100644 --- a/test/test_trsm.c +++ b/test/test_trsm.c @@ -144,7 +144,7 @@ int main( int argc, char** argv ) bli_copym( &c, &c_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 
0; r < n_repeats; ++r ) { diff --git a/test/test_trsv.c b/test/test_trsv.c index 1a8777aca..c61edaf03 100644 --- a/test/test_trsv.c +++ b/test/test_trsv.c @@ -107,7 +107,7 @@ int main( int argc, char** argv ) bli_copym( &x, &x_save ); - dtime_save = 1.0e9; + dtime_save = DBL_MAX; for ( r = 0; r < n_repeats; ++r ) { diff --git a/testsuite/input.general b/testsuite/input.general index b9940dac3..0bf9053bd 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -36,6 +36,7 @@ sdcz # Datatype(s) to test: 1 # 4mh ('1' = enable; '0' = disable) 1 # 4m1b ('1' = enable; '0' = disable) 1 # 4m1a ('1' = enable; '0' = disable) +1 # native ('1' = enable; '0' = disable) 1 # Error-checking level: # '0' = disable error checking; '1' = full error checking i # Reaction to test failure: diff --git a/testsuite/input.operations b/testsuite/input.operations index 058721632..ac9298f8b 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -107,6 +107,10 @@ -1 # dimensions: m ? 
# parameters: conjx +1 # amaxv +1 # test sequential front-end +-1 # dimensions: m + 1 # axpbyv 1 # test sequential front-end -1 # dimensions: m diff --git a/testsuite/src/test_addm.c b/testsuite/src/test_addm.c index 92dbca677..fe0f3172a 100644 --- a/testsuite/src/test_addm.c +++ b/testsuite/src/test_addm.c @@ -142,7 +142,7 @@ void libblis_test_addm_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_addv.c b/testsuite/src/test_addv.c index 3e8225892..36067b7fc 100644 --- a/testsuite/src/test_addv.c +++ b/testsuite/src/test_addv.c @@ -141,7 +141,7 @@ void libblis_test_addv_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_amaxv.c b/testsuite/src/test_amaxv.c new file mode 100644 index 000000000..9323ecbba --- /dev/null +++ b/testsuite/src/test_amaxv.c @@ -0,0 +1,400 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "test_libblis.h" + + +// Static variables. +static char* op_str = "amaxv"; +static char* o_types = "v"; // x +static char* p_types = ""; // (no parameters) +static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s + { 1e-04, 1e-05 }, // warn, pass for c + { 1e-13, 1e-14 }, // warn, pass for d + { 1e-13, 1e-14 } }; // warn, pass for z + +// Local prototypes. 
+void libblis_test_amaxv_deps + ( + test_params_t* params, + test_op_t* op + ); + +void libblis_test_amaxv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + num_t datatype, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ); + +void libblis_test_amaxv_impl + ( + iface_t iface, + obj_t* x, + obj_t* index + ); + +void libblis_test_amaxv_check + ( + test_params_t* params, + obj_t* x, + obj_t* index, + double* resid + ); + +void bli_amaxv_test + ( + obj_t* x, + obj_t* index + ); + + + +void libblis_test_amaxv_deps + ( + test_params_t* params, + test_op_t* op + ) +{ + libblis_test_randv( params, &(op->ops->randv) ); +} + + + +void libblis_test_amaxv + ( + test_params_t* params, + test_op_t* op + ) +{ + + // Return early if this test has already been done. + if ( op->test_done == TRUE ) return; + + // Return early if operation is disabled. + if ( op->op_switch == DISABLE_ALL || + op->ops->l1v_over == DISABLE_ALL ) return; + + // Call dependencies first. + if ( TRUE ) libblis_test_amaxv_deps( params, op ); + + // Execute the test driver for each implementation requested. + if ( op->front_seq == ENABLE ) + { + libblis_test_op_driver( params, + op, + BLIS_TEST_SEQ_FRONT_END, + op_str, + p_types, + o_types, + thresh, + libblis_test_amaxv_experiment ); + } +} + + + +void libblis_test_amaxv_experiment + ( + test_params_t* params, + test_op_t* op, + iface_t iface, + num_t datatype, + char* pc_str, + char* sc_str, + unsigned int p_cur, + double* perf, + double* resid + ) +{ + unsigned int n_repeats = params->n_repeats; + unsigned int i; + + double time_min = DBL_MAX; + double time; + + dim_t m; + + obj_t x; + obj_t index; + + + // Map the dimension specifier to an actual dimension. + m = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); + + // Map parameter characters to BLIS constants. + + + // Create test scalars. 
+ bli_obj_scalar_init_detached( BLIS_INT, &index ); + + // Create test operands (vectors and/or matrices). + libblis_test_vobj_create( params, datatype, sc_str[0], m, &x ); + + // Randomize x. + libblis_test_vobj_randomize( params, FALSE, &x ); + + // Repeat the experiment n_repeats times and record results. + for ( i = 0; i < n_repeats; ++i ) + { + time = bli_clock(); + + libblis_test_amaxv_impl( iface, &x, &index ); + + time_min = bli_clock_min_diff( time_min, time ); + } + + // Estimate the performance of the best experiment repeat. + *perf = ( 1.0 * m ) / time_min / FLOPS_PER_UNIT_PERF; + if ( bli_obj_is_complex( x ) ) *perf *= 2.0; + + // Perform checks. + libblis_test_amaxv_check( params, &x, &index, resid ); + + // Zero out performance and residual if input vector is empty. + libblis_test_check_empty_problem( &x, perf, resid ); + + // Free the test objects. + bli_obj_free( &x ); +} + + + +void libblis_test_amaxv_impl + ( + iface_t iface, + obj_t* x, + obj_t* index + ) +{ + switch ( iface ) + { + case BLIS_TEST_SEQ_FRONT_END: + bli_amaxv( x, index ); + break; + + default: + libblis_test_printf_error( "Invalid interface type.\n" ); + } +} + + + +void libblis_test_amaxv_check + ( + test_params_t* params, + obj_t* x, + obj_t* index, + double* resid + ) +{ + obj_t index_test; + obj_t chi_i; + obj_t chi_i_test; + dim_t i; + dim_t i_test; + + double i_d, junk; + double i_d_test; + + // + // Pre-conditions: + // - x is randomized. + // + // Under these conditions, we assume that the implementation for + // + // index := amaxv( x ) + // + // is functioning correctly if + // + // x[ index ] = max( x ) + // + // where max() is implemented via the bli_?amaxv_test() function. + // + + // The following two calls have already been made by the caller. That + // is, the index object has already been created and the library's + // amaxv implementation has already been tested. 
+ //bli_obj_scalar_init_detached( BLIS_INT, &index ); + //bli_amaxv( x, &index ); + bli_getsc( index, &i_d, &junk ); i = i_d; + bli_acquire_vi( i, x, &chi_i ); + + bli_obj_scalar_init_detached( BLIS_INT, &index_test ); + bli_amaxv_test( x, &index_test ); + bli_getsc( &index_test, &i_d_test, &junk ); i_test = i_d_test; + bli_acquire_vi( i_test, x, &chi_i_test ); + + // Verify that the values referenced by index and index_test are equal. + if ( bli_obj_equals( &chi_i, &chi_i_test ) ) *resid = 0.0; + else *resid = 1.0; +} + +// ----------------------------------------------------------------------------- + +// +// Prototype BLAS-like interfaces with typed operands for a local amaxv test +// operation +// + +#undef GENTPROT +#define GENTPROT( ctype, ch, opname ) \ +\ +void PASTEMAC(ch,opname) \ + ( \ + dim_t n, \ + ctype* restrict x, inc_t incx, \ + dim_t* restrict index, \ + cntx_t* cntx \ + ); \ + +INSERT_GENTPROT_BASIC( amaxv_test ) + +// +// Define object-based interface for a local amaxv test operation. +// + +#undef GENFRONT +#define GENFRONT( opname ) \ +\ +void PASTEMAC0(opname) \ + ( \ + obj_t* x, \ + obj_t* index \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *x ); \ +\ + dim_t n = bli_obj_vector_dim( *x ); \ + void* buf_x = bli_obj_buffer_at_off( *x ); \ + inc_t incx = bli_obj_vector_inc( *x ); \ +\ + void* buf_index = bli_obj_buffer_at_off( *index ); \ +\ + if ( bli_error_checking_is_enabled() ) \ + bli_amaxv_check( x, index ); \ +\ + /* Invoke the bli_?amaxv_test() function. */ \ + bli_call_ft_5 \ + ( \ + dt, \ + amaxv_test, \ + n, \ + buf_x, incx, \ + buf_index, \ + NULL \ + ); \ +} + +GENFRONT( amaxv_test ) + +// +// Define BLAS-like interfaces with typed operands for a local amaxv test +// operation. +// NOTE: This is based on a simplified version of the bli_?amaxv_ref() +// reference kernel. 
+// + +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t n, \ + ctype* x, inc_t incx, \ + dim_t* i_max, \ + cntx_t* cntx \ + ) \ +{ \ + ctype_r* minus_one = PASTEMAC(chr,m1); \ + dim_t* zero_i = PASTEMAC(i,0); \ +\ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ +\ + /* Initialize the index of the maximum absolute value to zero. */ \ + PASTEMAC(i,copys)( zero_i, *i_max ); \ +\ + /* If the vector length is zero, return early. This directly emulates + the behavior of netlib BLAS's i?amax() routines. */ \ + if ( bli_zero_dim1( n ) ) return; \ +\ + /* Initialize the maximum absolute value search candidate with + -1, which is guaranteed to be less than all values we will + compute. */ \ + PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ +\ + { \ + for ( i = 0; i < n; ++i ) \ + { \ + ctype* chi1 = x + (i )*incx; \ +\ + /* Get the real and imaginary components of chi1. */ \ + PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ +\ + /* Replace chi1_r and chi1_i with their absolute values. */ \ + PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ + PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ +\ + /* Add the real and imaginary absolute values together. */ \ + PASTEMAC(chr,set0s)( abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ + PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ +\ + /* If the absolute value of the current element exceeds that of + the previous largest, save it and its index. If NaN is + encountered, then treat it the same as if it were a valid + value that was smaller than any previously seen. This + behavior mimics that of LAPACK's ?lange(). 
*/ \ + if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + { \ + abs_chi1_max = abs_chi1; \ + *i_max = i; \ + } \ + } \ + } \ +} + +INSERT_GENTFUNCR_BASIC0( amaxv_test ) + diff --git a/testsuite/src/test_amaxv.h b/testsuite/src/test_amaxv.h new file mode 100644 index 000000000..364b27963 --- /dev/null +++ b/testsuite/src/test_amaxv.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +void libblis_test_amaxv + ( + test_params_t* params, + test_op_t* op + ); + diff --git a/testsuite/src/test_axpbyv.c b/testsuite/src/test_axpbyv.c index a3b030784..ff05a0b42 100644 --- a/testsuite/src/test_axpbyv.c +++ b/testsuite/src/test_axpbyv.c @@ -155,7 +155,7 @@ void libblis_test_axpbyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index 7a67c71a8..6f5515127 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -155,7 +155,7 @@ void libblis_test_axpy2v_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 3a67f18b2..706359ca4 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -153,7 +153,7 @@ void libblis_test_axpyf_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, b_n; diff --git a/testsuite/src/test_axpym.c b/testsuite/src/test_axpym.c index ae8903fab..896373ed1 100644 --- a/testsuite/src/test_axpym.c +++ b/testsuite/src/test_axpym.c @@ -150,7 +150,7 @@ void libblis_test_axpym_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_axpyv.c b/testsuite/src/test_axpyv.c index 779c40ac5..472798b85 100644 --- a/testsuite/src/test_axpyv.c +++ b/testsuite/src/test_axpyv.c @@ -150,7 +150,7 @@ void libblis_test_axpyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_copym.c b/testsuite/src/test_copym.c index a2c023a4c..6993fd302 
100644 --- a/testsuite/src/test_copym.c +++ b/testsuite/src/test_copym.c @@ -141,7 +141,7 @@ void libblis_test_copym_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_copyv.c b/testsuite/src/test_copyv.c index 13fd6c01c..5029227d6 100644 --- a/testsuite/src/test_copyv.c +++ b/testsuite/src/test_copyv.c @@ -141,7 +141,7 @@ void libblis_test_copyv_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index bf573f71a..36b88cc2f 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -157,7 +157,7 @@ void libblis_test_dotaxpyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_dotv.c b/testsuite/src/test_dotv.c index 1fa11fcfb..ece73cdb2 100644 --- a/testsuite/src/test_dotv.c +++ b/testsuite/src/test_dotv.c @@ -146,7 +146,7 @@ void libblis_test_dotv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index 81f78d94b..dd83dc49e 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -163,7 +163,7 @@ void libblis_test_dotxaxpyf_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, b_n; diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 95009c4ee..3a29b41b7 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -155,7 +155,7 @@ void libblis_test_dotxf_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; 
double time; dim_t m, b_n; diff --git a/testsuite/src/test_dotxv.c b/testsuite/src/test_dotxv.c index c0ed77b55..e394cf0ac 100644 --- a/testsuite/src/test_dotxv.c +++ b/testsuite/src/test_dotxv.c @@ -151,7 +151,7 @@ void libblis_test_dotxv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index aaa9bc408..222dca395 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -157,7 +157,7 @@ void libblis_test_gemm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n, k; diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index d1877d6d6..514fdf66a 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -158,7 +158,7 @@ void libblis_test_gemm_ukr_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n, k; @@ -220,30 +220,28 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_KR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_KR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); - - // Pack the contents of a and b to ap and bp, respectively. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. 
+ cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_KR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_KR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) @@ -269,9 +267,10 @@ void libblis_test_gemm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 5f0babc07..afd436d7f 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -177,7 +177,7 @@ void libblis_test_gemmtrsm_ukr_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n, k; @@ -260,39 +260,34 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); - - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_MR, - BLIS_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); - - // Pack the contents of a to ap. 
- bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - - // Pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. + cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_MR, + BLIS_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _make_subparts() routine needs this information // to know how to initialize the subpartitions. bli_obj_set_uplo( uploa, ap ); - // Create subpartitions from the a and b panels. bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &a1xp, &a11p, &bx1p, &b11p ); @@ -302,14 +297,13 @@ void libblis_test_gemmtrsm_ukr_experiment // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, a11p ); - // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copym( &c11_save, &c11 ); // Re-pack the contents of b to bp. - bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); time = bli_clock(); @@ -331,9 +325,10 @@ void libblis_test_gemmtrsm_ukr_experiment // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c11, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. 
bli_obj_free( &a_big ); diff --git a/testsuite/src/test_gemv.c b/testsuite/src/test_gemv.c index 71427932c..b254a861c 100644 --- a/testsuite/src/test_gemv.c +++ b/testsuite/src/test_gemv.c @@ -154,7 +154,7 @@ void libblis_test_gemv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_ger.c b/testsuite/src/test_ger.c index 4a23a02c0..fc7944f52 100644 --- a/testsuite/src/test_ger.c +++ b/testsuite/src/test_ger.c @@ -152,7 +152,7 @@ void libblis_test_ger_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_hemm.c b/testsuite/src/test_hemm.c index 5f291a9c1..1b4231ba8 100644 --- a/testsuite/src/test_hemm.c +++ b/testsuite/src/test_hemm.c @@ -160,7 +160,7 @@ void libblis_test_hemm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_hemv.c b/testsuite/src/test_hemv.c index f786e32e4..6ab6fa11f 100644 --- a/testsuite/src/test_hemv.c +++ b/testsuite/src/test_hemv.c @@ -155,7 +155,7 @@ void libblis_test_hemv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_her.c b/testsuite/src/test_her.c index 5199b5715..37ec26c1d 100644 --- a/testsuite/src/test_her.c +++ b/testsuite/src/test_her.c @@ -152,7 +152,7 @@ void libblis_test_her_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_her2.c b/testsuite/src/test_her2.c index 0778ce514..d3660d7c2 100644 --- a/testsuite/src/test_her2.c +++ b/testsuite/src/test_her2.c @@ -154,7 +154,7 @@ void 
libblis_test_her2_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_her2k.c b/testsuite/src/test_her2k.c index 1a1759f2c..95d0dbf72 100644 --- a/testsuite/src/test_her2k.c +++ b/testsuite/src/test_her2k.c @@ -158,7 +158,7 @@ void libblis_test_her2k_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_herk.c b/testsuite/src/test_herk.c index 39d16fbf3..37853efb7 100644 --- a/testsuite/src/test_herk.c +++ b/testsuite/src/test_herk.c @@ -156,7 +156,7 @@ void libblis_test_herk_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index b86772361..bd14d13b4 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -113,6 +113,7 @@ void libblis_test_utility_ops( test_params_t* params, test_ops_t* ops ) void libblis_test_level1v_ops( test_params_t* params, test_ops_t* ops ) { libblis_test_addv( params, &(ops->addv) ); + libblis_test_amaxv( params, &(ops->amaxv) ); libblis_test_axpbyv( params, &(ops->axpbyv) ); libblis_test_axpyv( params, &(ops->axpyv) ); libblis_test_copyv( params, &(ops->copyv) ); @@ -222,6 +223,7 @@ void libblis_test_read_ops_file( char* input_filename, test_ops_t* ops ) // Level-1v libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->addv) ); + libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 0, &(ops->amaxv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->axpbyv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, BLIS_TEST_DIMS_M, 1, &(ops->axpyv) ); libblis_test_read_op_info( ops, input_stream, BLIS_NOID, 
BLIS_TEST_DIMS_M, 1, &(ops->copyv) ); @@ -425,7 +427,9 @@ void libblis_test_read_params_file( char* input_filename, test_params_t* params libblis_test_read_next_line( buffer, input_stream ); sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_4M1A ]) ); - params->ind_enable[ BLIS_NAT ] = 1; + // Read whether to native (complex) execution. + libblis_test_read_next_line( buffer, input_stream ); + sscanf( buffer, "%u ", &(params->ind_enable[ BLIS_NAT ]) ); // Read the requested error-checking level. libblis_test_read_next_line( buffer, input_stream ); @@ -943,7 +947,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "problem size: first to test %u\n", params->p_first ); libblis_test_fprintf_c( os, "problem size: max to test %u\n", params->p_max ); libblis_test_fprintf_c( os, "problem size increment %u\n", params->p_inc ); - libblis_test_fprintf_c( os, "test induced complex \n" ); + libblis_test_fprintf_c( os, "complex implementations \n" ); libblis_test_fprintf_c( os, " 3mh? %u\n", params->ind_enable[ BLIS_3MH ] ); libblis_test_fprintf_c( os, " 3m3? %u\n", params->ind_enable[ BLIS_3M3 ] ); libblis_test_fprintf_c( os, " 3m2? %u\n", params->ind_enable[ BLIS_3M2 ] ); @@ -951,7 +955,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, " 4mh? %u\n", params->ind_enable[ BLIS_4MH ] ); libblis_test_fprintf_c( os, " 4m1b (4mb)? %u\n", params->ind_enable[ BLIS_4M1B ] ); libblis_test_fprintf_c( os, " 4m1a (4m1)? %u\n", params->ind_enable[ BLIS_4M1A ] ); - libblis_test_fprintf_c( os, "test native complex? %u\n", params->ind_enable[ BLIS_NAT ] ); + libblis_test_fprintf_c( os, " native? %u\n", params->ind_enable[ BLIS_NAT ] ); libblis_test_fprintf_c( os, "error-checking level %u\n", params->error_checking_level ); libblis_test_fprintf_c( os, "reaction to failure %c\n", params->reaction_to_failure ); libblis_test_fprintf_c( os, "output in matlab format? 
%u\n", params->output_matlab_format ); @@ -1503,12 +1507,12 @@ void libblis_test_op_driver( test_params_t* params, // Loop over induced methods (or just BLIS_NAT). for ( indi = ind_first; indi <= ind_last; ++indi ) { - // If the current induced method is native execution, OR - // if the current induced method is implemented (for the - // operation being tested) AND it was requested, then we - // enable ONLY that method and proceed. Otherwise, we - // skip the current method and go to the next method. - if ( indi == BLIS_NAT ) { ; } + // If the current datatype is real, OR if the current + // induced method is implemented (for the operation + // being tested) AND it was requested, then we enable + // ONLY that method and proceed. Otherwise, we skip the + // current method and go to the next method. + if ( bli_is_real( datatype ) ) { ; } else if ( bli_ind_oper_is_impl( op->opid, indi ) && params->ind_enable[ indi ] == 1 ) { ; } else { continue; } @@ -1875,22 +1879,34 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c -void libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) { - // Start with making p and alias to a. - bli_obj_alias_to( *a, *p ); + bool_t does_inv_diag; - // Then initialize p appropriately for packing. - bli_packm_init_pack( inv_diag, - pack_schema, - BLIS_PACK_FWD_IF_UPPER, - BLIS_PACK_FWD_IF_LOWER, - pack_buf, - bmult_id_m, - bmult_id_n, - a, - p, - cntx ); + if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE; + else does_inv_diag = TRUE; + + // Create a control tree node for the packing operation. + cntl_t* cntl = bli_packm_cntl_obj_create + ( + NULL, // func ptr is not referenced b/c we don't call via l3 _int(). 
+ bli_packm_blk_var1, + bmult_id_m, + bmult_id_n, + does_inv_diag, + FALSE, + FALSE, + pack_schema, + pack_buf, + NULL // no child node needed + ); + + // Pack the contents of A to P. + bli_l3_packm( a, p, cntx, cntl, &BLIS_PACKM_SINGLE_THREADED ); + + // Return the control tree pointer so the caller can free the cntl_t and its + // mem_t entry later on. + return cntl; } @@ -1932,8 +1948,8 @@ void libblis_test_vobj_randomize( test_params_t* params, bool_t normalize, obj_t bli_obj_scalar_init_detached( dt, &kappa ); bli_obj_scalar_init_detached( dt_r, &kappa_r ); - // Normalize vector elements. - //bli_setsc( 1.0/( double )bli_obj_vector_dim( *x ), 0.0, &kappa ); + // Normalize vector elements. The following code ensures that we + // always invert-scale by whole power of two. bli_normfv( x, &kappa_r ); libblis_test_ceil_pow2( &kappa_r ); bli_copysc( &kappa_r, &kappa ); diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index fab7c1a05..6ecc72d56 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -215,6 +215,7 @@ typedef struct test_ops_s // level-1v test_op_t addv; + test_op_t amaxv; test_op_t axpbyv; test_op_t axpyv; test_op_t copyv; @@ -382,7 +383,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -void libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); +cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- @@ -435,6 +436,7 @@ void libblis_test_check_empty_problem( obj_t* c, double* perf, 
double* resid ); // Level-1v #include "test_addv.h" +#include "test_amaxv.h" #include "test_axpbyv.h" #include "test_axpyv.h" #include "test_copyv.h" diff --git a/testsuite/src/test_normfm.c b/testsuite/src/test_normfm.c index 5e6b76c39..b0b4735ca 100644 --- a/testsuite/src/test_normfm.c +++ b/testsuite/src/test_normfm.c @@ -145,7 +145,7 @@ void libblis_test_normfm_experiment num_t dt_real = bli_datatype_proj_to_real( datatype ); - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_normfv.c b/testsuite/src/test_normfv.c index c10135516..a4de1f882 100644 --- a/testsuite/src/test_normfv.c +++ b/testsuite/src/test_normfv.c @@ -145,7 +145,7 @@ void libblis_test_normfv_experiment num_t dt_real = bli_datatype_proj_to_real( datatype ); - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_randm.c b/testsuite/src/test_randm.c index e3defdad7..55e3920be 100644 --- a/testsuite/src/test_randm.c +++ b/testsuite/src/test_randm.c @@ -140,7 +140,7 @@ void libblis_test_randm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_randv.c b/testsuite/src/test_randv.c index e18a2ec13..776d4c647 100644 --- a/testsuite/src/test_randv.c +++ b/testsuite/src/test_randv.c @@ -140,7 +140,7 @@ void libblis_test_randv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_scal2m.c b/testsuite/src/test_scal2m.c index 144f2e03c..8e1257f25 100644 --- a/testsuite/src/test_scal2m.c +++ b/testsuite/src/test_scal2m.c @@ -149,7 +149,7 @@ void libblis_test_scal2m_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git 
a/testsuite/src/test_scal2v.c b/testsuite/src/test_scal2v.c index 82f749230..9620754f2 100644 --- a/testsuite/src/test_scal2v.c +++ b/testsuite/src/test_scal2v.c @@ -149,7 +149,7 @@ void libblis_test_scal2v_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_scalm.c b/testsuite/src/test_scalm.c index 76688ee9a..3d59e3bd0 100644 --- a/testsuite/src/test_scalm.c +++ b/testsuite/src/test_scalm.c @@ -145,7 +145,7 @@ void libblis_test_scalm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_scalv.c b/testsuite/src/test_scalv.c index 90e9daf68..df10e33a9 100644 --- a/testsuite/src/test_scalv.c +++ b/testsuite/src/test_scalv.c @@ -146,7 +146,7 @@ void libblis_test_scalv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_setm.c b/testsuite/src/test_setm.c index 214c43fdf..a077baee3 100644 --- a/testsuite/src/test_setm.c +++ b/testsuite/src/test_setm.c @@ -142,7 +142,7 @@ void libblis_test_setm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_setv.c b/testsuite/src/test_setv.c index b587d7fb0..459eea6aa 100644 --- a/testsuite/src/test_setv.c +++ b/testsuite/src/test_setv.c @@ -142,7 +142,7 @@ void libblis_test_setv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_subm.c b/testsuite/src/test_subm.c index 3cae1c0a2..8e98e7e6c 100644 --- a/testsuite/src/test_subm.c +++ b/testsuite/src/test_subm.c @@ -142,7 +142,7 @@ void 
libblis_test_subm_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_subv.c b/testsuite/src/test_subv.c index 7cafd2a4b..c9732ad94 100644 --- a/testsuite/src/test_subv.c +++ b/testsuite/src/test_subv.c @@ -142,7 +142,7 @@ void libblis_test_subv_experiment double* resid ) { - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_symm.c b/testsuite/src/test_symm.c index 992968568..13396d849 100644 --- a/testsuite/src/test_symm.c +++ b/testsuite/src/test_symm.c @@ -160,7 +160,7 @@ void libblis_test_symm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_symv.c b/testsuite/src/test_symv.c index 89ecad953..6a6165a8d 100644 --- a/testsuite/src/test_symv.c +++ b/testsuite/src/test_symv.c @@ -155,7 +155,7 @@ void libblis_test_symv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_syr.c b/testsuite/src/test_syr.c index d3e51b261..525460f91 100644 --- a/testsuite/src/test_syr.c +++ b/testsuite/src/test_syr.c @@ -152,7 +152,7 @@ void libblis_test_syr_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_syr2.c b/testsuite/src/test_syr2.c index 4ece061b0..33bf6b536 100644 --- a/testsuite/src/test_syr2.c +++ b/testsuite/src/test_syr2.c @@ -154,7 +154,7 @@ void libblis_test_syr2_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_syr2k.c b/testsuite/src/test_syr2k.c index 6bae557e3..cdb4a185e 100644 --- a/testsuite/src/test_syr2k.c +++ 
b/testsuite/src/test_syr2k.c @@ -158,7 +158,7 @@ void libblis_test_syr2k_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_syrk.c b/testsuite/src/test_syrk.c index afd73e7fa..e13da6543 100644 --- a/testsuite/src/test_syrk.c +++ b/testsuite/src/test_syrk.c @@ -156,7 +156,7 @@ void libblis_test_syrk_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, k; diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index 81114caa2..4099806d3 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -156,7 +156,7 @@ void libblis_test_trmm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_trmm3.c b/testsuite/src/test_trmm3.c index 96645e87c..7ce850282 100644 --- a/testsuite/src/test_trmm3.c +++ b/testsuite/src/test_trmm3.c @@ -160,7 +160,7 @@ void libblis_test_trmm3_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; diff --git a/testsuite/src/test_trmv.c b/testsuite/src/test_trmv.c index 38fc02b3b..d69224a4f 100644 --- a/testsuite/src/test_trmv.c +++ b/testsuite/src/test_trmv.c @@ -151,7 +151,7 @@ void libblis_test_trmv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_trsm.c b/testsuite/src/test_trsm.c index cd0acac16..0fbc26860 100644 --- a/testsuite/src/test_trsm.c +++ b/testsuite/src/test_trsm.c @@ -156,7 +156,7 @@ void libblis_test_trsm_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double 
time; dim_t m, n; diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 1592fc847..bf5f2d6bd 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -160,7 +160,7 @@ void libblis_test_trsm_ukr_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m, n; @@ -221,40 +221,39 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - // Initialize pack objects. - bli_obj_init_pack( &ap ); - bli_obj_init_pack( &bp ); - - // Create pack objects for a and b. - libblis_test_pobj_create( BLIS_MR, - BLIS_MR, - BLIS_INVERT_DIAG, - BLIS_PACKED_ROW_PANELS, - BLIS_BUFFER_FOR_A_BLOCK, - &a, &ap, - &cntx ); - libblis_test_pobj_create( BLIS_MR, - BLIS_NR, - BLIS_NO_INVERT_DIAG, - BLIS_PACKED_COL_PANELS, - BLIS_BUFFER_FOR_B_PANEL, - &b, &bp, - &cntx ); + // Create pack objects for a and b, and pack them to ap and bp, + // respectively. + cntl_t* cntl_a = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_MR, + BLIS_INVERT_DIAG, + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + &a, &ap, + &cntx + ); + cntl_t* cntl_b = libblis_test_pobj_create + ( + BLIS_MR, + BLIS_NR, + BLIS_NO_INVERT_DIAG, + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + &b, &bp, + &cntx + ); // Set the uplo field of ap since the default for packed objects is // BLIS_DENSE, and the _ukernel() wrapper needs this information to // know which set of micro-kernels (lower or upper) to choose from. bli_obj_set_uplo( uploa, ap ); - // Pack the contents of a to ap. - bli_packm_blk_var1( &a, &ap, &cntx, &BLIS_PACKM_SINGLE_THREADED ); - - // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { // Re-pack the contents of b to bp. 
- bli_packm_blk_var1( &b, &bp, &cntx, &BLIS_PACKM_SINGLE_THREADED ); + bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED ); bli_copym( &c_save, &c ); @@ -277,9 +276,10 @@ void libblis_test_trsm_ukr_experiment // Zero out performance and residual if output matrix is empty. libblis_test_check_empty_problem( &c, perf, resid ); - // Release packing buffers within pack objects. - bli_obj_release_pack( &ap ); - bli_obj_release_pack( &bp ); + // Free the control tree nodes and release their cached mem_t entries + // back to the memory broker. + bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED ); + bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); // Free the test objects. bli_obj_free( &a ); diff --git a/testsuite/src/test_trsv.c b/testsuite/src/test_trsv.c index dc5e118bd..a9f243103 100644 --- a/testsuite/src/test_trsv.c +++ b/testsuite/src/test_trsv.c @@ -151,7 +151,7 @@ void libblis_test_trsv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/testsuite/src/test_xpbyv.c b/testsuite/src/test_xpbyv.c index d490a565a..46f79c3ea 100644 --- a/testsuite/src/test_xpbyv.c +++ b/testsuite/src/test_xpbyv.c @@ -149,7 +149,7 @@ void libblis_test_xpbyv_experiment unsigned int n_repeats = params->n_repeats; unsigned int i; - double time_min = 1e9; + double time_min = DBL_MAX; double time; dim_t m; diff --git a/version b/version index 2bfe0beaa..0c62199f1 100644 --- a/version +++ b/version @@ -1 +1 @@ -0.2.0-37 +0.2.1