diff --git a/CHANGELOG b/CHANGELOG index 80150e185..fa6fc8dd3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,10 +1,1780 @@ -commit 18c876b989fd0dcaa27becd14e4f16bdac7e89b3 (HEAD -> master, tag: 0.6.0) +commit 68b88aca6692c75a9f686187e6c4a4e196ae60a9 (HEAD -> master, tag: 0.7.0) +Author: Field G. Van Zee +Date: Tue Apr 7 14:41:44 2020 -0500 + + Version file update (0.7.0) + +commit b04de636c1702e4cb8e7ad82bab3cf43d2dbdfc6 +Author: Field G. Van Zee +Date: Tue Apr 7 14:37:43 2020 -0500 + + ReleaseNotes.md update in advance of next version. + + Details: + - Updated docs/ReleaseNotes.md in preparation for next version. + +commit 2cb604ba472049ad498df72d4a2dc47a161d4c3c (origin/master, origin/dev, origin/amd, origin/HEAD, dev, amd) +Author: Field G. Van Zee +Date: Mon Apr 6 16:42:14 2020 -0500 + + Rename more bli_thread_obarrier(), _obroadcast(). + + Details: + - Renamed instances of bli_thread_obarrier() and bli_thread_obroadcast() + that were made in the supmt-specific code committed to the 'amd' + branch, which has now been merged with 'master'. Prior to the merge, + 'master' received commit c01d249, which applied these renamings to + the existing, non-sup codebase. + +commit efb12bc895de451067649d5dceb059b7827a025f +Author: Field G. Van Zee +Date: Mon Apr 6 15:01:53 2020 -0500 + + Minor updates/elaborations to RELEASING file. + +commit 2e3b3782cfb7a2fd0d1a325844983639756def7d +Merge: 9f3a8d4d da0c086f +Author: Field G. 
Van Zee +Date: Mon Apr 6 14:55:35 2020 -0500 + + Merge branch 'master' into amd + +commit da0c086f4643772e111318f95a712831b0f981a8 +Author: Satish Balay +Date: Tue Mar 31 17:09:41 2020 -0500 + + OSX: specify the full path to the location of libblis.dylib (#390) + + * OSX: specify the full path to the location of libblis.dylib so that it can be found at runtime + + Before this change: + + Application gives runtime error [when linked with blis] + dyld: Library not loaded: libblis.3.dylib + + balay@kpro lib % otool -L libblis.dylib + libblis.dylib: + libblis.3.dylib (compatibility version 0.0.0, current version 0.0.0) + /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1281.0.0) + + After this change: + balay@kpro lib % otool -L libblis.dylib + libblis.dylib: + /Users/balay/petsc/arch-darwin-c-debug/lib/libblis.3.dylib (compatibility version 0.0.0, current version 0.0.0) + /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1281.0.0) + + * INSTALL_LIBDIR -> libdir as INSTALL_LIBDIR has DESTDIR + + Co-Authored-By: Jed Brown + + * CREDITS file update. + + Co-authored-by: Jed Brown + Co-authored-by: Field G. Van Zee + +commit 2bca03ea9d87c0da829031a5332545d05e352211 +Author: Field G. Van Zee +Date: Sat Mar 28 22:10:00 2020 +0000 + + Updates, tweaks to runme.sh in test/1m4m. + + Details: + - Made several updates to test/1m4m/runme.sh, including: + - Added missing handling for 1m and 4m1a implementations when setting + the BLIS_??_NT environment variables. + - Added support for using numactl to run the test executables. + - Several other cleanups. + +commit c40a33190b94af5d5c201be63366594859b1233f +Author: Field G. Van Zee +Date: Thu Mar 26 16:55:00 2020 -0500 + + Warn user when auto-detection returns 'generic'. + + Details: + - Added logic to configure that causes the script to output a warning + to the user if/when "./configure auto" is run and the underlying + hardware feature detection code is unable to identify the hardware. 
+ In these cases, the auto-detect code will return 'generic', which + is likely not what the user expected, and a flag will be set so that + a message is printed at the end of the configure output. (Thankfully, + we don't expect this scenario to play out very often.) Thanks to + Devin Matthews for suggesting this fix #384. + +commit 492a736fab5b9c882996ca024b64646877f22a89 +Author: Devin Matthews +Date: Tue Mar 24 17:28:47 2020 -0500 + + Fix vectorized version of bli_amaxv (#382) + + * Fix vectorized version of bli_amaxv + + To match Netlib, i?amax should return: + - the lowest index among equal values + - the first NaN if one is encountered + + * Fix typos. + + * And another one... + + * Update ref. amaxv kernel too. + + * Re-enabled optimized amaxv kernels. + + Details: + - Re-enabled the optimized, intrinsics-based amaxv kernels in the 'zen' + kernel set for use in haswell, zen, zen2, knl, and skx subconfigs. + These two kernels (for s and d datatypes) were temporarily disabled in + e186d71 as part of issue #380. However, the key missing semantic + properties that prompted the disabling of these kernels--returning the + index of the *first* rather than of the last element with largest + absolute value, and returning the index of the first NaN if one is + encountered--were added as part of #382 thanks to Devin Matthews. + Thus, now that the kernels are working as expected once more, this + commit causes these kernels to once again be registered for the + affected subconfigs, which effectively reverts all code changes + included in e186d71. + - Whitespace/formatting updates to new macros in bli_amaxv_zen_int.c. + + Co-authored-by: Field G. Van Zee + +commit e186d7141a51f2d7196c580e24e7b7db8f209db9 +Author: Field G. Van Zee +Date: Sat Mar 21 18:40:36 2020 -0500 + + Disabled optimized amaxv kernels. + + Details: + - Disabled use of optimized amaxv kernels, which use vector intrinsics + for both 's' and 'd' datatypes. 
We disable these kernels because the + current implementations fail to observe a semantic property of the + BLAS i?amax_() subroutine, which is to return the index of the + *first* element containing the maximum absolute value (that is, the + first element if there exist two or more elements that contain the + same value). With the optimized kernels disabled, the affected + subconfigurations (haswell, zen, zen2, knl, and skx) will use the + default reference implementations. Thanks to Mat Cross for reporting + this issue via #380. + - CREDITS file update. + +commit 9f3a8d4d851725436b617297231a417aa9ce8c6a +Author: Field G. Van Zee +Date: Sat Mar 14 17:48:43 2020 -0500 + + Added missing return to bli_thread_partition_2x2(). + + Details: + - Added a missing return statement to the body of an early case handling + branch in bli_thread_partition_2x2(). This bug only affected cases + where n_threads < 4, and even then, the code meant to handle cases + where n_threads >= 4 executes and does the right thing, albeit using + more CPU cycles than needed. Nonetheless, thanks to Kiran Varaganti + for reporting this bug via issue #377. + - Whitespace changes to bli_thread.c (spaces -> tabs). + +commit 8c3d9b9eeb6f816ec8c32a944f632a5ad3637593 +Merge: 71249fe8 0f9e0399 +Author: Field G. Van Zee +Date: Tue Mar 10 14:03:33 2020 -0500 + + Merge branch 'amd' of github.com:flame/blis into amd + +commit 71249fe8ddaa772616698f1e3814d40e012909ea +Author: Field G. Van Zee +Date: Tue Mar 10 13:55:29 2020 -0500 + + Merged test/sup, test/supmt into test/sup. + + Details: + - Updated the Makefile, test_gemm.c, and runme.sh in test/sup to be able + to compile and run both single-threaded and multithreaded experiments. + This should help with maintenance going forward. + - Created a test/sup/octave_st directory of scripts (based on the + previous test/sup/octave scripts) as well as a test/sup/octave_mt + directory (based on the previous test/supmt/octave scripts). 
The + octave scripts are slightly different and not easily mergeable, and + thus for now I'll maintain them separately. + - Preserved the previous test/sup directory as test/sup/old/supst and + the previous test/supmt directory as test/sup/old/supmt. + +commit 0f9e0399e16e96da2620faf2c0c3c21274bb2ebd +Author: Field G. Van Zee +Date: Thu Mar 5 17:03:21 2020 -0600 + + Updated sup performance graphs; added mt results. + + Details: + - Reran all existing single-threaded performance experiments comparing + BLIS sup to other implementations (including the conventional code + path within BLIS), using the latest versions (where appropriate). + - Added multithreaded results for the three existing hardware types + showcased in docs/PerformanceSmall.md: Kaby Lake, Haswell, and Epyc + (Zen1). + - Various minor updates to the text in docs/PerformanceSmall.md. + - Updates to the octave scripts in test/sup/octave, test/supmt/octave. + +commit 90db88e5729732628c1f3acc96eeefab49f2da41 +Author: Field G. Van Zee +Date: Mon Mar 2 15:06:48 2020 -0600 + + Updated sup[mt] Makefiles for variable dim ranges. + + Details: + - Updated test/sup/Makefile and test/supmt/Makefile to allow specifying + different problem size ranges for the drivers where one, two, or three + matrix dimensions is large. This will facilitate the generation of + more meaningful graphs, particularly when two dimensions are tiny. + +commit 31f11a06ea9501724feec0d2fc5e4644d7dd34fc +Author: Field G. Van Zee +Date: Thu Feb 27 14:33:20 2020 -0600 + + Updates to octave scripts in test/sup[mt]/octave. + + Details: + - Optimized scripts in test/sup/octave and test/supmt/octave for use + with octave 5.2.0 on Ubuntu 18.04. + - Fixed stray 'end' keywords in gen_opsupnames.m and plot_l3sup_perf.m, + which were not only unnecessary but also causing issues with versions + 5.x. + +commit c01d249d7c546fe2e3cee3fe071cd4c4c88b9115 +Author: Field G. 
Van Zee +Date: Tue Feb 25 14:50:53 2020 -0600 + + Renamed bli_thread_obarrier(), _obroadcast(). + + Details: + - Renamed two bli_thread_*() APIs: + bli_thread_obarrier() -> bli_thread_barrier() + bli_thread_obroadcast() -> bli_thread_broadcast() + The 'o' was a leftover from when thrcomm_t objects tracked both + "inner" and "outer" communicators. They have long since been + simplified to only support the latter, and thus the 'o' is + superfluous. + +commit f6e6bf73e695226c8b23fe7900da0e0ef37030c1 +Author: Field G. Van Zee +Date: Mon Feb 24 17:52:23 2020 -0600 + + List Gentoo under supported external packages. + + Details: + - Add mention of Gentoo Linux under the list of external packages in + the README.md file. Thanks to M. Zhou for maintaining this package. + +commit 9e5f7296ccf9b3f7b7041fe1df20b927cd0e914b +Author: Field G. Van Zee +Date: Tue Feb 18 15:16:03 2020 -0600 + + Skip building thrinfo_t tree when mt is disabled. + + Details: + - Return early from bli_thrinfo_sup_grow() if the thrinfo_t object + address is equal to either &BLIS_GEMM_SINGLE_THREADED or + &BLIS_PACKM_SINGLE_THREADED. + - Added preprocessor logic to bli_l3_sup_thread_decorator() in + bli_l3_sup_decor_single.c that (by default) disables code that + creates and frees the thrinfo_t tree and instead passes + &BLIS_GEMM_SINGLE_THREADED as the thrinfo_t pointer into the + sup implementation. + - The net effect of the above changes is that a small amount of + thrinfo_t overhead is avoided when running small/skinny dgemm + problems when BLIS is compiled with multithreading disabled. + +commit 90081e6a64b5ccea9211bdef193c2d332c68492f +Author: Field G. Van Zee +Date: Mon Feb 17 14:57:25 2020 -0600 + + Fixed bug(s) in mt sup when single-threaded. + + Details: + - Fixed a syntax bug in bli_l3_sup_decor_single.c as a result of + changing function interface for the thread entry point function + (of type l3supint_t). 
+ - Unfortunately, fixing the interface was not enough, as it caused + a memory leak in the sba at bli_finalize() time. It turns out that, + due to the new multithreading-capable variant code using thrinfo_t + objects--specifically, their calling of bli_thrinfo_grow()--we + have to pass in a real thrinfo_t object rather than the global + objects &BLIS_PACKM_SINGLE_THREADED or &BLIS_GEMM_SINGLE_THREADED. + Thus, I inserted the appropriate logic from the OpenMP and pthreads + versions so that single-threaded execution would work as intended + with the newly upgraded variants. + +commit c0558fde4511557c8f08867b035ee57dd2669dc6 +Author: Field G. Van Zee +Date: Mon Feb 17 14:08:08 2020 -0600 + + Support multithreading within the sup framework. + + Details: + - Added multithreading support to the sup framework (via either OpenMP + or pthreads). Both variants 1n and 2m now have the appropriate + threading infrastructure, including data partitioning logic, to + parallelize computation. This support handles all four combinations + of packing on matrices A and B (neither, A only, B only, or both). + This implementation tries to be a little smarter when automatic + threading is requested (e.g. via BLIS_NUM_THREADS) in that it will + recalculate the factorization in units of micropanels (rather than + using the raw dimensions) in bli_l3_sup_int.c, when the final + problem shape is known and after threads have already been spawned. + - Implemented bli_?packm_sup_var2(), which packs to conventional row- + or column-stored matrices. (This is used for the rrc and crc storage + cases.) Previously, copym was used, but that would no longer suffice + because it could not be parallelized. + - Minor reorganization of packing-related sup functions. Specifically, + bli_packm_sup_init_mem_[ab]() are called from within packm_sup_[ab]() + instead of from the variant functions. This has the effect of making + the variant functions more readable. 
+ - Added additional bli_thrinfo_set_*() static functions to bli_thrinfo.h + and inserted usage of these functions within bli_thrinfo_init(), which + previously was accessing thrinfo_t fields via the -> operator. + - Renamed bli_partition_2x2() to bli_thread_partition_2x2(). + - Added an auto_factor field to the rntm_t struct in order to track + whether automatic thread factorization was originally requested. + - Added new test drivers in test/supmt that perform multithreaded sup + tests, as well as appropriate octave/matlab scripts to plot the + resulting output files. + - Added additional language to docs/Multithreading.md to make it clear + that specifying any BLIS_*_NT variable, even if it is set to 1, will + be considered manual specification for the purposes of determining + whether to auto-factorize via BLIS_NUM_THREADS. + - Minor comment updates. + +commit d7a7679182d72a7eaecef4cd9b9a103ee0a7b42b +Author: Field G. Van Zee +Date: Fri Feb 7 17:37:03 2020 -0600 + + Fixed int-to-packbuf_t conversion error (C++ only). + + Details: + - Fixed an error that manifests only when using C++ (specifically, + modern versions of g++) to compile drivers in 'test' (and likely most + other application code that #includes blis.h). Thanks to Ajay Panyala + for reporting this issue (#374). + +commit d626112b8d5302f9585fb37a8e37849747a2a317 +Author: Field G. Van Zee +Date: Wed Jan 15 13:27:02 2020 -0600 + + Removed sorting on LDFLAGS in common.mk (#373). + + Details: + - Removed a line of code in common.mk that passed LDFLAGS through the + sort function. The purpose was not to sort the contents, but rather + to remove duplicates. However, there is valid syntax in a string of + linker flags that, when sorted, yields different/broken behavior. + So I've removed the line in common.mk that sorts LDFLAGS. Also, for + future use, I've added a new function, rm-dupls, that removes + duplicates without sorting. 
(This function was based on code from a + stackoverflow thread that is linked to in the comments for that + code.) Thanks to Isuru Fernando for reporting this issue (#373). + +commit e67deb22aaeab5ed6794364520190936748ef272 +Author: Field G. Van Zee +Date: Tue Jan 14 16:01:34 2020 -0600 + + CHANGELOG update (0.6.1) + +commit 10949f528c5ffc5c3a2cad47fe16a802afb021be (tag: 0.6.1) +Author: Field G. Van Zee +Date: Tue Jan 14 16:01:33 2020 -0600 + + Version file update (0.6.1) + +commit 5db8e710a2baff121cba9c63b61ca254a2ec097a +Author: Field G. Van Zee +Date: Tue Jan 14 15:59:59 2020 -0600 + + ReleaseNotes.md update in advance of next version. + + Details: + - Updated ReleaseNotes.md in preparation for next version. + +commit cde4d9d7a26eb51dcc5a59943361dfb8fda45dea +Author: Field G. Van Zee +Date: Tue Jan 14 15:19:25 2020 -0600 + + Removed 'attic/windows' (to prevent confusion). + + Details: + - Finally removed 'attic/windows' and its contents. This directory once + contained "proto" Windows support for BLIS, but we've since moved on + to (thanks to Isuru Fernando) providing Windows DLL support via + AppVeyor's build artifacts. Furthermore, since 'windows' was the only + subdirectory within 'attic', the directory path would show up in + GitHub's listing at https://github.com/flame/blis, which probably led + to someone being confused about how BLIS provides Windows support. I + assume (but don't know for sure) that nobody is using these files, so + this is admittedly a case of shoot first and ask questions later. + +commit 7d3407d4681c6449f4bbb8ec681983700ab968f3 +Author: Field G. Van Zee +Date: Tue Jan 14 15:17:53 2020 -0600 + + CREDITS file update. 
+ +commit f391b3e2e7d11a37300d4c8d3f6a584022a599f5 +Author: Dave Love +Date: Mon Jan 6 20:15:48 2020 +0000 + + Fix parsing in vpu_count on workstation SKX (#351) + + * Fix parsing in vpu_count on workstation SKX + + * Document Skylake-X as Haswell for single FMA + + * Update vpu_count for Skylake and Cascade Lake models + + * Support printing the configuration selected, controlled by the environment + + Intended particularly for diagnosing mis-selection of SKX through + unknown, or incorrect, number of VPUs. + + * Move bli_log outside the cpp condition, and use it where intended + + * Add Fixme comment (Skylake D) + + * Mostly superficial edits to commits towards #351. + + Details: + - Moved architecture/sub-config logging-related code from bli_cpuid.c + to bli_arch.c, tweaked names, and added more set/get layering. + - Tweaked log messages output from bli_cpuid_is_skx() in bli_cpuid.c. + - Content, whitespace changes to new bullet in HardwareSupport.md that + relates to single-VPU Skylake-Xs. + + * Fix comment typos + + Co-authored-by: Field G. Van Zee + +commit 5ca1a3cfc1c1cc4dd9da6a67aa072ed90f07e867 +Author: Field G. Van Zee +Date: Mon Jan 6 12:29:12 2020 -0600 + + Fixed 'configure' breakage introduced in 6433831. + + Details: + - Added a missing 'fi' (endif) keyword to a conditional block added in + the configure script in commit 6433831. + +commit e7431b4a834ef4f165c143f288585ce8e2272a23 +Author: Field G. Van Zee +Date: Mon Jan 6 12:01:41 2020 -0600 + + Updated 1m draft article link in README.md. 
+ +commit 6433831cc3988ad205637ebdebcd6d8f7cfcf148 +Author: Jeff Hammond +Date: Fri Jan 3 17:52:49 2020 -0800 + + blacklist ICC 18 for knl/skx due to test failures + + Signed-off-by: Jeff Hammond + +commit af3589f1f98781e3a94a8f9cea8d5ea6f155f7d2 +Author: Jeff Hammond +Date: Fri Jan 3 13:23:24 2020 -0800 + + blacklist Intel 19+ + + Signed-off-by: Jeff Hammond + +commit 60de939debafb233e57fd4e804ef21b6de198caf +Author: Jeff Hammond +Date: Wed Jan 1 21:30:38 2020 -0800 + + fix link to docs + + the comment contains an incorrect link, which is trivially fixed here. + + @fgvanzee I hope you don't mind that I committed directly to master but this cannot break anything. + +commit 52711073789b6b84eb99bb0d6883f457ed3fcf80 +Author: Field G. Van Zee +Date: Mon Dec 16 16:30:26 2019 -0600 + + Fixed bugs in cblas_sdsdot(), sdsdot_(). + + Details: + - Fixed a bug in sdsdot_sub() that redundantly added the "alpha" scalar, + named 'sb'. This value was already being added by the underlying + sdsdot_() function. Thus, we no longer add 'sb' within sdsdot_sub(). + Thanks to Simon Lukas Märtens for reporting this bug via #367. + - Fixed a second bug in order of typecasting intermediate products in + sdsdot_(). Previously, the "alpha" scalar was being added after the + "outer" typecast to float. However, the operation is supposed to first + add the dot product to the (promoted) scalar and THEN downcast the sum + to float. Thanks to Devin Matthews for catching this bug. + +commit fe2560a4b1d8ef8d0a446df6002b1e7decc826e9 +Author: Field G. Van Zee +Date: Fri Dec 6 17:12:44 2019 -0600 + + Annoted missing thread-related symbols for export. + + Details: + - Added BLIS_EXPORT_BLIS annotation to function prototypes for + + bli_thrcomm_bcast() + bli_thrcomm_barrier() + bli_thread_range_sub() + + so that these functions are exported to shared libraries by default. + This (hopefully) fixes issue #366. Thanks to Kyungmin Lee for + reporting this bug. + - CREDITS file update. 
+ +commit 2853825234001af8f175ad47cef5d6ff9b7a5982 +Merge: efa61a6c 61b1f0b0 +Author: Field G. Van Zee +Date: Fri Dec 6 16:06:46 2019 -0600 + + Merge branch 'master' into amd + +commit 61b1f0b0602faa978d9912fe58c6c952a33af0ac +Author: Nicholai Tukanov +Date: Wed Dec 4 14:18:47 2019 -0600 + + Add prototypes for POWER9 reference kernels (#365) + + Updates and fixes to power9 subconfig. + + Details: + - Register s,c,z reference gemm and trsm ukernels that assume elements + of B have been broadcast. + - Added prototypes for level-3 ukernels that assume elements of B have + been broadcast. Also added prototype for an spackm function that + employs a duplication/broadcast factor of 4. + - Register virtual gemmtrsm ukernels that work with broadcasting of B. + - Disable right-side hemm, symm, trmm, and trmm3 in bli_family_power9.h. + - Thanks to Nicholai Tukanov for providing these updates. + +commit efa61a6c8b1cfa48781fc2e4799ff32e1b7f8f77 +Author: Field G. Van Zee +Date: Fri Nov 29 16:17:04 2019 -0600 + + Added missing bli_l3_sup_thread_decorator() symbol. + + Details: + - Defined dummy versions of bli_l3_sup_thread_decorator() for Openmp + and pthreads so that those builds don't fail when performing shared + library linking (especially for Windows DLLs via AppVeyor). For now, + these dummy implementations of bli_l3_sup_thread_decorator() are + merely carbon-copies of the implementation provided for single- + threaded execution (ie: the one found in bli_l3_sup_decor_single.c). + Thus, an OpenMP or pthreads build will be able to use the gemmsup + code (including the new selective packing functionality), as it did + before 39fa7136, even though it will not actually employ any + multithreaded parallelism. + +commit 39fa7136f4a4e55ccd9796fb79ad5f121b872ad9 +Author: Field G. Van Zee +Date: Fri Nov 29 15:27:07 2019 -0600 + + Added support for selective packing to gemmsup. 
+ + Details: + - Implemented optional packing for A or B (or both) within the sup + framework (which currently only supports gemm). The request for + packing either matrix A or matrix B can be made via setting + environment variables BLIS_PACK_A or BLIS_PACK_B (to any + non-zero value; if set, zero means "disable packing"). It can also + be made globally at runtime via bli_pack_set_pack_a() and + bli_pack_set_pack_b() or with individual rntm_t objects via + bli_rntm_set_pack_a() and bli_rntm_set_pack_b() if using the expert + interface of either the BLIS typed or object APIs. (If using the + BLAS API, environment variables are the only way to communicate the + packing request.) + - One caveat (for now) with the current implementation of selective + packing is that any blocksize extension registered in the _cntx_init + function (such as is currently used by haswell and zen subconfigs) + will be ignored if the affected matrix is packed. The reason is + simply that I didn't get around to implementing the necessary logic + to pack a larger edge-case micropanel, though this is entirely + possible and should be done in the future. + - Spun off the variant-choosing portion of bli_gemmsup_ref() into + bli_gemmsup_int(), in bli_l3_sup_int.c. + - Added new files, bli_l3_sup_packm_a.c, bli_l3_sup_packm_b.c, along + with corresponding headers, in which higher-level packm-related + functions are defined for use within the sup framework. The actual + packm variant code resides in bli_l3_sup_packm_var.c. + - Pass the following new parameters into var1n and var2m: packa, packb + bool_t's, pointer to a rntm_t, pointer to a cntl_t (which is for now + always NULL), and pointer to a thrinfo_t* (which for now is the address + of the global single-threaded packm thread control node). + - Added panel strides ps_a and ps_b to the auxinfo_t structure so that + the millikernel can query the panel stride of the packed matrix and + step through it accordingly. 
If the matrix isn't packed, the panel + stride of interest for the given millikernel will be set to the + appropriate value so that the mkernel may step through the unpacked + matrix as it normally would. + - Modified the rv_6x8m and rv_6x8n millikernels to read the appropriate + panel strides (ps_a and ps_b, respectively) instead of computing them + on the fly. + - Spun off the environment variable getting and setting functions into + a new file, bli_env.c (with a corresponding prototype header). These + functions are now used by the threading infrastructure (e.g. + BLIS_NUM_THREADS, BLIS_JC_NT, etc.) as well as the selective packing + infrastructure (e.g. BLIS_PACK_A, BLIS_PACK_B). + - Added a static initializer for mem_t objects, BLIS_MEM_INITIALIZER. + - Added a static initializer for pblk_t objects, BLIS_PBLK_INITIALIZER, + for use within the definition of BLIS_MEM_INITIALIZER. + - Moved the global_rntm object to bli_rntm.c and extern it where needed. + This means that the function bli_thread_init_rntm() was renamed to + bli_rntm_init_from_global() and relocated accordingly. + - Added a new bli_pack.c function, which serves as the home for + functions that manage the pack_a and pack_b fields of the global + rntm_t, including from environment variables, just as we have + functions to manage the threading fields of the global rntm_t in + bli_thread.c. + - Reorganized naming for files in frame/thread, which mostly involved + spinning off the bli_l3_thread_decorator() functions into their own + files. This change makes more sense when considering the further + addition of bli_l3_sup_thread_decorator() functions (for now limited + only to the single-threaded form found in the _single.c file). + - Explicitly initialize the reference sup handlers in both + bli_cntx_init_haswell.c and bli_cntx_init_zen.c so that it's more + obvious how to customize to a different handler, if desired. + - Removed various snippets of disabled code. + - Various comment updates. 
+ +commit bbb21fd0a9be8c5644bec37c75f9396eeeb69e48 +Author: Field G. Van Zee +Date: Thu Nov 21 18:15:16 2019 -0600 + + Tweaked SIAM/SC Best Prize language in README.md. + +commit 043366f92d5f5f651d5e3371ac3adb36baf4adce +Author: Field G. Van Zee +Date: Thu Nov 21 18:13:51 2019 -0600 + + Fixed typo in previous commit (SIAM/SC prize). + +commit 05a4d583e65a46ff2a1100ab4433975d905d91f9 +Author: Field G. Van Zee +Date: Thu Nov 21 18:12:24 2019 -0600 + + Added SIAM/SC prize to "What's New" in README.md. + +commit 881b05ecd40c7bc0422d3479a02a28b1cb48383f +Author: Field G. Van Zee +Date: Thu Nov 21 16:34:27 2019 -0600 + + Fixed blastest failure for 'generic' subconfig. + + Details: + - Fixed a subtle and complicated bug that only manifested via the BLAS + test drivers in the generic subconfiguration, and possibly any other + subconfiguration that did not register complex-domain gemm ukernels, + or registered ONLY real-domain ukernels as row-preferential. This is + a long story, but it boils down to an exception to the "transpose the + operation to bring storage of C into agreement with ukernel pref" + optimization in bli_hemm_front.c and bli_symm_front.c sabotaging the + proper functioning of the 1m method, but only when the imaginary + component of beta is zero. See the comments in issue #342 for more + details. Thanks to Dave Love for identifying the commit in which this + bug was introduced, and other feedback related to this bug. + +commit 0c7165fb01cdebbc31ec00124d446161b289942f +Author: Field G. Van Zee +Date: Thu Nov 14 16:48:14 2019 -0600 + + Fixed obscure bug in bli_acquire_mpart_[mn]dim(). + + Details: + - Fixed a bug in bli_acquire_mpart_mdim(), bli_acquire_mpart_ndim(), + and bli_acquire_mpart_mndim() that allowed the use of a blocksize b + that is too large given the current row/column index (i.e., the i/j + argument) and the size of the dimension being partitioned (i.e., the + m/n argument). 
This bug only affected backwards partitioning/motion + through the dimension and was the result of a misplaced conditional + check-and-redirect to the backwards code path. It should be noted + that this bug was discovered not because it manifested the way it + could (thanks to the callers in BLIS making sure to always pass in + the "correct" blocksize b), but could have manifested if the + functions were used by 3rd party callers. Thanks to Minh Quan Ho for + reporting the bug via issue #363. + +commit fb8bef9982171ee0f60bc39e41a33c4d31fd59a9 +Author: Field G. Van Zee +Date: Thu Nov 14 13:05:28 2019 -0600 + + Fixed copy-paste bug in bli_spackm_6xk_bb4_ref(). + + Details: + - Fixed a copy-paste bug in the new bli_spackm_6xk_bb4_ref() that + manifested as failures in single-precision real level-3 operations. + Also replaced the duplication factor constants with a const-qualified + variable, dfac, so that this won't happen again. + - Changed NC for single-precision real from 4080 to 8160 so that the + packed matrix B will have the same byte footprint in both single + and double real. + +commit 8f399c89403d5824ba767df1426706cf2d19d0a7 +Author: Field G. Van Zee +Date: Tue Nov 12 15:32:57 2019 -0600 + + Tweaked/added notes to docs/Multithreading.md. + + Details: + - Added language to docs/Multithreading.md cautioning the reader about + the nuances of setting multithreading parameters via the manual and + automatic ways simultaneously, and also about how these parameters + behave when multithreading is disabled at configure-time. These + changes are an attempt to address the issues that arose in issue #362. + Thanks to Jérémie du Boisberranger for his feedback on this topic. + - CREDITS file update. + +commit bdc7ee3394500d8e5b626af6ff37c048398bb27e +Author: Field G. Van Zee +Date: Mon Nov 11 15:47:17 2019 -0600 + + Various fixes to support packing duplication in B. 
+ + Details: + - Added cpp macros to trmm and trmm3 front-ends to optionally force + those operations to be cast so the structured matrix is on the left. + symm and hemm already had such macros, but these too were renamed so + that the macros were individual to the operation. We now have four + such macros: + #define BLIS_DISABLE_HEMM_RIGHT + #define BLIS_DISABLE_SYMM_RIGHT + #define BLIS_DISABLE_TRMM_RIGHT + #define BLIS_DISABLE_TRMM3_RIGHT + Also, updated the comments in the symm and hemm front-ends related to + the first two macro guards, and added corresponding comments to the + trmm and trmm3 front-ends for the latter two guards. (They all + functionally do the same thing, just for their specific operations.) + Thanks to Jeff Hammond for reporting the bugs that led me to this + change (via #359). + - Updated config/old/haswellbb subconfiguration (used to debug issues + related to duplicating B during packing) to register: a packing + kernel for single-precision real; gemmbb ukernels for s, c, and z; + trsmbb ukernels for s, c, and z; gemmtrsmbb virtual ukernels for s, c + and z; and to use non-default cache and register blocksizes for s, c, + and z datatypes. Also declared prototypes for all of the gemmbb, + trsmbb, and gemmtrsmbb ukernel functions within the + bli_cntx_init_haswellbb() function. This should, once applied to the + power9 configuration, fix the remaining issues in #359. + - Defined bli_spackm_6xk_bb4_ref(), which packs single reals with a + duplication factor of 4. This function is defined in the same file as + bli_dpackm_6xk_bb2_ref() (bli_packm_cxk_bb_ref.c). + +commit 0eb79ca8503bd7b237994335b9687457227d3290 +Author: Field G. Van Zee +Date: Fri Nov 8 14:48:48 2019 -0600 + + Avoid unused variable warning in lread.c (#356). + + Details: + - Replaced the line + + f = f; + + with + + ( void )f; + + for the unused variable 'f' in blastest/f2c/lread.c. (Hopefully) + addresses issue #356, but since we don't use xlc who knows. 
Thanks + to Jeff Hammond for reporting this. + +commit f377bb448512f0b578263387eed7eaf8f2b72bb7 +Author: Jérôme Duval +Date: Thu Nov 7 23:39:29 2019 +0100 + + Add Haiku to the known OS list (#361) + +commit e29b1f9706b6d9ed798b7f6325f275df4e6be973 +Author: Field G. Van Zee +Date: Tue Nov 5 17:15:19 2019 -0600 + + Fixed failing testsuite gemmtrsm_ukr for power9. + + Details: + - Added code that fixes false failures in the gemmtrsm_ukr module of the + testsuite. The tests were failing because the computation (bli_gemv()) + that performs the numerical check was not able to properly traverse + the matrix operands bx1 and b11 that are views into the micropanel of + B, which has duplicated/broadcast elements under the power9 subconfig. + (For example, a micropanel of B with duplication factor of 2 needs to + use a column stride of 2; previously, the column stride was being + interpreted as 1.) + - Defined separate bli_obj_set_row_stride() and bli_obj_set_col_stride() + static functions in bli_obj_macro_defs.h. (Previously, only the + function bli_obj_set_strides() was defined. Amazing to think that we + got this far without these former functions.) + - Updated/expounded upon comments. + +commit 49177a6b9afcccca5b39a21c6fd8e243525e1505 +Author: Field G. Van Zee +Date: Mon Nov 4 18:09:37 2019 -0600 + + Fixed latent testsuite ukr module bugs for power9. + + Details: + - Fixed a latent bug in the testsuite ukernel modules (gemm, trsm, and + gemmtrsm) that only manifested once we began running with parameters + that mimic those of power9. The problem was rooted in the way those + modules were creating objects (and thus allocating memory) for the + micropanel operands to the microkernel being tested. Since power9 + duplicates/broadcasts elements of B in memory, we needed an easy way + of asking for more than one storage element per logical element in + the matrix. 
I incorrectly expressed this as: + + bli_obj_create( datatype, k, n, ldbp, 1, &bp ); + + The problem here is that bli_obj_create() is exceedingly efficient + at calculating the size it passes to malloc() and doesn't allocate a + full leading dimension's worth of elements for the last column (or + row, in this example). This would normally not bother anyone since + you're not supposed to access that memory anyway. But here, my + attempted "hack" for getting extra elements was insufficient, and + needed to be changed to: + + bli_obj_create( datatype, k, ldbp, ldbp, 1, &bp ); + + That is, the extra elements needed to be baked into the dimensions of + the matrix object in order to have the intended effect on the number + of elements actually allocated. Thanks to Jeff Hammond for reporting + this bug. + - Fixed a typically harmless memory leak in the aforementioned test + modules (the objects for the packed micropanels were not being freed). + - Updated/expanded a common comment across all three ukr test modules. + +commit c84391314d4f1b3f73d868f72105324e649f2a72 +Author: Field G. Van Zee +Date: Mon Nov 4 13:57:12 2019 -0600 + + Reverted minor temp/wspace changes from b426f9e. + + Details: + - Added missing license header to bli_pwr9_asm_macros_12x6.h. + - Reverted temporary changes to various files in 'test' and 'testsuite' + directories. + - Moved testsuite/jobscripts into testsuite/old. + - Minor whitespace/comment changes across various files. + +commit 4870260f6b8c06d2cc01b7147d7433ddee213f7f +Author: Jeff Hammond +Date: Mon Nov 4 11:55:47 2019 -0800 + + blacklist GCC 5 and older for POWER9 (#360) + +commit b426f9e04e5499c6f9c752e49c33800bfaadda4c +Author: Nicholai Tukanov +Date: Fri Nov 1 17:57:03 2019 -0500 + + POWER9 DGEMM (#355) + + Implemented and registered power9 dgemm ukernel. + + Details: + - Implemented 12x6 dgemm microkernel for power9. This microkernel + assumes that elements of B have been duplicated/broadcast during the + packing step. 
The microkernel uses a column orientation for its + microtile vector registers and thus implements column storage and + general stride IO cases. (A row storage IO case via in-register + transposition may be added at a future date.) It should be noted that + we recommend using this microkernel with gcc and *not* xlc, as issues + with the latter cropped up during development, including but not + limited to slightly incompatible vector register mnemonics in the GNU + extended inline assembly clobber list. + +commit 58102aeaa282dc79554ed045e1b17a6eda292e15 +Merge: 52059506 b9bc222b +Author: Field G. Van Zee +Date: Mon Oct 28 17:58:31 2019 -0500 + + Merge branch 'amd' + +commit 52059506b2d5fd4c3738165195abeb356a134bd4 +Author: Field G. Van Zee +Date: Wed Oct 23 15:26:42 2019 -0500 + + Added "How to Download BLIS" section to README.md. + + Details: + - Added a new section to the README.md, just prior to the "Getting + Started" section, titled "How to Download BLIS". This section details + the user's options for obtaining BLIS and lays out four common ways + of downloading the library. Thanks to Jeff Diamond for his feedback + on this topic. + +commit e6f0a96cc59aef728470f6850947ba856148c38a +Author: Field G. Van Zee +Date: Mon Oct 14 17:05:39 2019 -0500 + + Updated README.md to ack Facebook as funder. + +commit b9bc222bfc3db4f9ae5d7b3321346eed70c2c3fb +Author: Field G. Van Zee +Date: Mon Oct 14 16:38:15 2019 -0500 + + Call bli_syrk_small() before error checking. + + Details: + - In bli_syrk_front(), moved the conditional call to bli_syrk_check() + (if error checking is enabled) and the conditional scaling of C by + beta (if alpha is zero) so that they occur after, instead of before, + the call to bli_syrk_small(). This sequencing now matches that of + bli_gemm_small() in bli_gemm_front() and bli_trsm_small() in + bli_trsm_front(). + +commit f0959a81dbcf30d8a1076d0a6348a9835079d31a +Author: Field G. 
Van Zee +Date: Mon Oct 14 15:46:28 2019 -0500 + + When manual config is blacklisted, output error. + + Details: + - Fixed and adjusted the logic in configure so that a more informative + error message is output when a user runs './configure ... <config>' and + <config> is present in the configuration blacklist. Previously, this + particular set of conditions would result in the message: + + 'user-specified configuration '' is NOT registered! + + That is, the error message mis-identified the targeted configuration + as the empty string, and (more importantly) mis-identified the + problem. Thanks to Tze Meng Low for reporting this issue. + - Fixed a nearby error message somewhat unrelated to the issue above. + Specifically, the wrong string was being printed when the error + message was identifying an auto-detected configuration that did not + appear to be registered. + +commit 6218ac95a525eefa8921baf8d0d7057dfacebe9c +Merge: 0016d541 a617301f +Author: Field G. Van Zee +Date: Fri Oct 11 11:53:51 2019 -0500 + + Merge branch 'master' into amd + +commit 0016d541e6b0da617b1fae6612d2b314901b7a75 +Author: Field G. Van Zee +Date: Fri Oct 11 11:09:44 2019 -0500 + + Changed -march=znver2 to =znver1 for clang on zen2. + + Details: + - In config/zen2/make_defs.mk, changed the -march= flag so that + -march=znver1 is used instead of -march=znver2 when CC_VENDOR is + clang. (The gcc branch attempts to differentiate between various + versions, but the equivalent version cutoffs for clang are not + yet known by us, so we have to use a single flag for all versions + of clang. Hopefully -march=znver1 is new enough. If not, we'll + fall back to -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp.) + This issue was discovered thanks to AppVeyor. + +commit e94a0530e5ac4c78a18f09105f40003be2b517f7 +Author: Field G. Van Zee +Date: Fri Oct 11 10:48:27 2019 -0500 + + Corrected zen NC that was non-multiple of NR.
+ + Details: + - Updated an incorrectly set cache blocksize NC for single real within + config/zen/bli_cntx_init_zen.c that was not a multiple of the + corresponding value of NR. This issue, which was caught by Travis CI, + was introduced in 29b0e1e. + +commit a2ffac752076bf55eb8c1fe2c5da8d9104f1f85b +Merge: 1cfe8e25 29b0e1ef +Author: Field G. Van Zee +Date: Fri Oct 11 10:31:18 2019 -0500 + + Merge branch 'amd-master' into amd + +commit 29b0e1ef4e8b84ce76888d73c090009b361f1306 +Merge: 1cfe8e25 fdce1a56 +Author: Field G. Van Zee +Date: Fri Oct 11 10:24:24 2019 -0500 + + Code review + tweaks to AMD's AOCL 2.0 PR (#349). + + Details: + - NOTE: This is a merge commit of 'master' of git://github.com/amd/blis + into 'amd-master' of flame/blis. + - Fixed a bug in the downstream value of BLIS_NUM_ARCHS, which was + inadvertently not incremented when the Zen2 subconfiguration was + added. + - In bli_gemm_front(), added a missing conditional constraint around the + call to bli_gemm_small() that ensures that the computation precision + of C matches the storage precision of C. + - In bli_syrk_front(), reorganized and relocated the notrans/trans logic + that existed around the call to bli_syrk_small() into bli_syrk_small() + to minimize the calling code footprint and also to bring that code + into stylistic harmony with similar code in bli_gemm_front() and + bli_trsm_front(). Also, replaced direct accessing of obj_t fields with + proper accessor static functions (e.g. 'a->dim[0]' becomes + 'bli_obj_length( a )'). + - Added #ifdef BLIS_ENABLE_SMALL_MATRIX guard around prototypes for + bli_gemm_small(), bli_syrk_small(), and bli_trsm_small(). This is + strictly speaking unnecessary, but it serves as a useful visual cue to + those who may be reading the files. + - Removed cpp macro-protected small matrix debugging code from + bli_trsm_front.c.
+ - Added a GCC_OT_9_1_0 variable to build/config.mk.in to facilitate gcc + version check for availability of -march=znver2, and added appropriate + support to configure script. + - Cleanups to compiler flags common to recent AMD microarchitectures in + config/zen/amd_config.mk, including: removal of -march=znver1 et al. + from CKVECFLAGS (since the -march flag is added within make_defs.mk); + setting CRVECFLAGS similarly to CKVECFLAGS. + - Cleanups to config/zen/bli_cntx_init_zen.c. + - Cleanups, added comments to config/zen/make_defs.mk. + - Cleanups to config/zen2/make_defs.mk, including making use of newly- + added GCC_OT_9_1_0 and existing GCC_OT_6_1_0 to choose the correct + set of compiler flags based on the version of gcc being used. + - Reverted downstream changes to test/test_gemm.c. + - Various whitespace/comment changes. + +commit a617301f9365ac720ff286514105d1b78951368b +Author: Field G. Van Zee +Date: Tue Oct 8 17:14:05 2019 -0500 + + Updates to docs/CodingConventions.md. + +commit 171f10069199f0cd280f18aac184546bd877c4fe +Merge: 702486b1 05d58edf +Author: Field G. Van Zee +Date: Fri Oct 4 11:18:23 2019 -0500 + + Merge remote-tracking branch 'loveshack/emacs' + +commit 702486b12560b5c696ba06de9a73fc0d5107ca44 +Author: Field G. Van Zee +Date: Wed Oct 2 16:35:41 2019 -0500 + + Removed stray FAQ section introduced in 1907000. + +commit 1907000ad6ea396970c010f07ae42980b7b14fa0 +Author: Field G. Van Zee +Date: Wed Oct 2 16:31:54 2019 -0500 + + Updated to FAQ (AMD-related questions). + + Details: + - Added a couple potential frequently-asked questions/answers related + to AMD's fork of BLIS. + - Updated existing answers to other questions. + +commit 834f30a0dad808931c9d80bd5831b636ed0e1098 +Author: Field G. Van Zee +Date: Wed Oct 2 12:45:56 2019 -0500 + + Mention mixeddt paper in docs/MixedDatatypes.md.
+ +commit 05d58edfe0ea9279971d74f17a5f7a69c4672ed5 +Author: Dave Love +Date: Wed Oct 2 10:33:44 2019 +0100 + + Note .dir-locals.el in docs + +commit 531110c339f199a4d165d707c988d89ab4f5bfe8 +Author: Dave Love +Date: Wed Oct 2 10:16:22 2019 +0100 + + Modify Emacs config + Confine it to cc-mode and add comment-start/end. + +commit 4bab365cab98202259c70feba6ec87408cba28d8 +Author: Dave Love +Date: Tue Oct 1 19:22:47 2019 +0000 + + Add .dir-locals.el for Emacs (#348) + + A minimal version that could probably do with extending, but at least + gets the indentation roughly right. + +commit 4ec8dad66b3d37b0a2b47d19b7144bb62d332622 +Author: Dave Love +Date: Thu Sep 26 16:27:53 2019 +0100 + + Add .dir-locals.el for Emacs + + A minimal version that could probably do with extending, but at least + gets the indentation roughly right. + +commit bc16ec7d1e2a30ce4a751255b70c9cbe87409e4f +Author: Field G. Van Zee +Date: Mon Sep 23 15:37:33 2019 -0500 + + Set execute bits of shared library at install-time. + + Details: + - Modified the 0644 octal code used during installation of shared + libraries to 0755 (for Linux/OSX only). Thanks to Adam J. Stewart + for reporting this issue via #343. + - CREDITS file update. + +commit c60db26aee9e7b4e5d0b031b0881e58d23666b53 +Author: Field G. Van Zee +Date: Tue Sep 17 18:04:17 2019 -0500 + + Fixed bad loop counter in bli_[cz]scal2bbs_mxn(). + + Details: + - Fixed a typo in the loop counter for the 'd' (duplication) dimension + in the complex macros of frame/include/level0/bb/bli_scal2bbs_mxn.h. + They shouldn't be used by anyone yet, but thankfully clang via + AppVeyor spit out warnings that alerted me to the issue. + +commit c766c81d628f0451d8255bf5e4b8be0a4ef91978 +Author: Field G. Van Zee +Date: Tue Sep 17 18:00:29 2019 -0500 + + Added missing schema arg to knl packm kernels. + + Details: + - Added the pack_t schema argument to the knl packm kernel functions. + This change was intended for inclusion in 31c8657. (Thank you SDE + + Travis CI.) 
+ +commit 31c8657f1d6d8f6efd8a73fd1995e995fc56748b +Author: Field G. Van Zee +Date: Tue Sep 17 17:42:10 2019 -0500 + + Added support for pre-broadcast when packing B. + + Details: + - Added support for being able to duplicate (broadcast) elements in + memory when packing matrix B (ie: the right-hand operand) in level-3 + operations. This turns out advantageous for some architectures that + can afford the cost of the extra bandwidth and somehow benefit from + the pre-broadcast elements (and thus being able to avoid using + broadcast-style load instructions on micro-rows of B in the gemm + microkernel). + - Support optionally disabling right-side hemm and symm. If this occurs, + hemm_r is implemented in terms of hemm_l (and symm_r in terms of + symm_l). This is needed when broadcasting during packing because the + alternative--supporting the broadcast of B while also allowing matrix + B to be Hermitian/symmetric--would be an absolute mess. + - Support alignment factors for packed blocks of A, B, and C separately + (as well as for general-purpose buffers). In addition, we support + byte offsets from those alignment values (which is different from + aligning by align+offset bytes to begin with). The default alignment + values are BLIS_PAGE_SIZE in all four cases, with the offset values + defaulting to zero. + - Pass pack_t schema into bli_?packm_cxk() so that it can be then passed + into the packm kernel, where it will be needed by packm kernels that + perform broadcasts of B, since the idea is that we *only* want to + broadcast when packing micropanels of B and not A. + - Added definition for variadic bli_cntx_set_l3_vir_ukrs(), which can be + used to set custom virtual level-3 microkernels in the cntx_t, which + would typically be done in the bli_cntx_init_*() function defined in + the subconfiguration of interest. + - Added a "broadcast B" kernel function for use with NP/NR = 12/6, + defined in ref_kernels/1m/bli_packm_cxk_bb_ref.c.
+ - Added a gemm, gemmtrsm, and trsm "broadcast B" reference kernels + defined in ref_kernels/3/bb. (These kernels have been tested with + double real with NP/NR = 12/6.) + - Added #ifndef ... #endif guards around several macro constants defined + in frame/include/bli_kernel_macro_defs.h. + - Defined a few "broadcast B" static functions in + frame/include/level0/bb for use by "broadcast B"-style packm reference + kernels. For now, only the real domain kernels are tested and fully + defined. + - Output the alignment and offset values for packed blocks of A and B + in the testsuite's "BLIS configuration info" section. + - Comment updates to various files. + - Bumped so_version to 3.0.0. + +commit fd9bf497cd4ff73ccdfc030ba037b3cb2f1c2fad +Author: Field G. Van Zee +Date: Tue Sep 17 15:45:24 2019 -0500 + + CREDITS file update. + +commit 6c8f2d1486ce31ad3c2083e5c2035acfd4409a43 +Author: ShmuelLevine +Date: Tue Sep 17 16:43:46 2019 -0400 + + Fix description for function bli_*pxby2v (#340) + + Fix typo in BLISTypedAPI.md for bli_?axpy2v() description. + +commit b5679c1520f8ae7637b3cc2313133461f62398dc +Author: Field G. Van Zee +Date: Tue Sep 17 14:00:37 2019 -0500 + + Inserted Multithreading links into BuildSystem.md. + + Details: + - Inserted brief disclaimers about default disabled multithreading + and default single-threadedness to BuildSystem.md along with links to + the Multithreading.md document. Thanks to Jeff Diamond for suggesting + these additions. + - Trivial reword of sentence regarding automatically-detected + architectures. + +commit f4f5170f8482c94132832eb3033bc8796da5420b +Author: Isuru Fernando +Date: Wed Sep 11 07:34:48 2019 -0500 + + Update README.md (#338) + +commit 1cfe8e2562e5e50769468382626ce36b734741c1 +Author: Field G. Van Zee +Date: Thu Sep 5 16:08:30 2019 -0500 + + Reimplemented bli_cpuid_query() for ARM. + + Details: + - Rewrote bli_cpuid_query() for ARM architectures to use stdio-based + functions such as fopen() and fgets() instead of popen(). 
The new code + does more or less the same thing as before--searches /proc/cpuinfo for + various strings, which are then parsed in order to determine the + model, part number, and features. Thanks to Dave Love for suggesting + this change in issue #335. + +commit 7c7819145740e96929466a248d6375d40e397e19 +Author: Devin Matthews +Date: Fri Aug 30 16:52:09 2019 -0500 + + Always use sqsumv to compute normfv. (#334) + + * Always use sqsumv to compute normfv on MacOS. + + * Unconditionally disable the "dot trick" in normfv. + + * Added explanatory comment to normfv definition. + + Details: + - Added a comment above the unconditional disabling of the dotv-based + implementation to normfv. Thanks to Roman Yurchak, Devin Matthews, + and Isuru Fernando in helping with this improvement. + - CREDITS file update. + +commit 80e6c10b72d50863b4b64d79f784df7befedfcd1 +Author: Field G. Van Zee +Date: Thu Aug 29 12:12:08 2019 -0500 + + Added reproduction section to Performance docs. + + Details: + - Added section titled "Reproduction" to both Performance.md and + PerformanceSmall.md that briefly nudges the motivated reader in the + right direction if he/she wishes to run the same performance + benchmarks used to produce the graphs shown in those documents. + Thanks to Dave Love for making this suggestion. + +commit 14cb426414856024b9ae0f84ac21efcc1d329467 +Author: Field G. Van Zee +Date: Wed Aug 28 17:04:33 2019 -0500 + + Updated OpenBLAS, Eigen sup results. + + Details: + - Updated the results shown in docs/PerformanceSmall.md for OpenBLAS and + Eigen. + +commit b02e0aae8ce2705e91023b98ed416cd05430a78e +Author: Field G. Van Zee +Date: Tue Aug 27 14:37:46 2019 -0500 + + Updated test drivers to iterate backwards. + + Details: + - Updated test driver source in test, test/3, test/1m4m, and + test/mixeddt to iterate through the problem space backwards. This + can help avoid certain situations where the CPU frequency does not + immediately throttle up to its maximum. 
Thanks to Robert van de + Geijn for recommending this fix (originally made to test/sup drivers + in 57e422a). + - Applied off-by-one matlab output bugfix from b6017e5 to test drivers + in test, test/3, test/1m4m, and test/mixeddt directories. + +commit b6017e53f4b26c99b14cdaa408351f11322b1e80 +Author: Field G. Van Zee +Date: Tue Aug 27 14:18:14 2019 -0500 + + Bugfix of output text + tweaks to test/sup driver. + + Details: + - Fixed an off-by-one bug in the output of matlab row indices in + test/sup/test_gemm.c that only manifested when the problem size + increment was equal to 1. + - Disabled the building of rrc, rcr, rcc, crr, crc, and ccr storage + combinations for blissup drivers in test/sup. This helps make the + building of drivers complete sooner. + - Trivial changes to test/sup/runme.sh. + +commit 138d403b6bb15e687a3fe26d3d967b8ccd1ed97b +Author: Devin Matthews +Date: Mon Aug 26 18:11:27 2019 -0500 + + Use -funsafe-math-optimizations and -ffp-contract=fast for all reference kernels when using gcc or clang. (#331) + +commit d5a05a15a7fcc38fb2519031dcc62de8ea4a530c +Author: Field G. Van Zee +Date: Mon Aug 26 16:54:31 2019 -0500 + + Cropped whitespace from new sup graphs. + + Details: + - Previously forgot to crop whitespace from the new .png graphs + added/updated in docs/graphs/sup. + +commit a6c80171a353db709e43f9e6e7a3da87ce4d17ed +Author: Field G. Van Zee +Date: Mon Aug 26 16:51:31 2019 -0500 + + Fixed contents links in docs/PerformanceSmall.md. + + Details: + - Corrected links in contents section of docs/PerformanceSmall.md, + which were erroneously directing readers to the corresponding + sections of docs/Performance.md. + +commit 40781774df56a912144ef19cc191ed626a89f0de +Author: Field G. Van Zee +Date: Mon Aug 26 16:47:37 2019 -0500 + + Updated sup performance graphs with libxsmm. + + Details: + - Added libxsmm to column-stored sup graphs presented in + docs/PerformanceSmall.md. + - Updated sup results for BLASFEO.
+ - Added sup results for Lonestar5 (Haswell). + - Addresses issue #326. + +commit bfddf671328e7e372ac7228f72ff2d9d8e03ae18 +Author: figual +Date: Mon Aug 26 12:01:33 2019 +0200 + + Fixed context registration for Cortex A53 (#329). + +commit 4a0a6e89c568246d14de4cc30e3ff35aac23d774 +Author: Field G. Van Zee +Date: Sat Aug 24 15:25:16 2019 -0500 + + Changed test/sup alpha to 1; test libxsmm+netlib. + + Details: + - Changed the value of alpha to 1.0 in test/sup/test_gemm.c. This is + needed because libxsmm currently only optimizes gemm operations where + alpha is unit (and beta is unit or zero). + - Adjusted the test/sup/Makefile to test libxsmm with netlib BLAS as its + fallback library. This is the library that will be called when the + problem dimensions are deemed too large, or any other criteria for + optimization are not met. (This was done not because it is realistic, + but rather so that it would be very clear when libxsmm ceased handling + gemm calls internally when the data are graphed.) + +commit 7aa52b57832176c5c13a48e30a282e09ecdabf73 +Author: Field G. Van Zee +Date: Fri Aug 23 16:12:50 2019 -0500 + + Use libxsmm API in test/sup; add missing -ldl. + + Details: + - Switch the driver source in test/sup so that libxsmm_?gemm() is called + instead of ?gemm_() when compiling for / linking against libxsmm. + libxsmm's documentation isn't clear on whether it is even *trying* to + provide BLAS API compatibility, and I got tired of trying to figure it + out. + - Added missing -ldl in LDFLAGS when linking against libxsmm. + +commit 57e422aa168bee7416965265c93fcd4934cd7041 +Author: Field G. Van Zee +Date: Fri Aug 23 14:17:52 2019 -0500 + + Added libxsmm support to test/sup drivers. + + Details: + - Modified test/sup/Makefile to build drivers that test the performance + of skinny/small problems via libxsmm. + - Modified test/sup/runme.sh to run aforementioned drivers.
+ - Modified test/sup/test_gemm.c so that problem sizes are tested in + reverse order (from largest to smallest). This can help avoid certain + situations where the CPU frequency does not immediately throttle up + to its maximum. Thanks to Robert van de Geijn for recommending this + fix. + +commit 661681fe33978acce370255815c76348f83632bc +Merge: 2f387e32 ef0a1a0f +Author: Field G. Van Zee +Date: Thu Aug 22 14:29:50 2019 -0500 + + Merge branch 'master' of github.com:flame/blis + +commit 2f387e32ef5f9a17bafb5076dc9f66c38b52b32d +Author: Field G. Van Zee +Date: Thu Aug 22 14:27:30 2019 -0500 + + Added Eigen -march=native hack to perf docs. + + Details: + - Spell out the hack given to me by Sameer Agarwal in order to get Eigen + to build with -march=native (which is critically important for Eigen) + in docs/Performance.md and docs/PerformanceSmall.md. + +commit ef0a1a0faf683fe205f85308a54a77ffd68a9a6c +Author: Devin Matthews +Date: Wed Aug 21 17:40:24 2019 -0500 + + Update do_sde.sh (#330) + + * Update do_sde.sh + + Automatically accept SDE license and download directly from Intel + + * Update .travis.yml + + [ci skip] + + * Update .travis.yml + + Enable SDE testing for PRs. + +commit 0cd383d53a8c4a6871892a0395591ef5630d4ac0 +Author: Field G. Van Zee +Date: Wed Aug 21 13:39:05 2019 -0500 + + Corrected variable type and comment update. + + Details: + - Forgot to save all changes from bli_gemmtrsm4m1_ref.c before commit + in 8122f59. Fixed type mismatch and referenced github issue in + comment. + +commit 8122f59745db780987da6aa1e851e9e76aa985e0 +Author: Field G. Van Zee +Date: Wed Aug 21 13:22:12 2019 -0500 + + Pacify 'restrict' warning in gemmtrsm4m1 ref ukr. + + Details: + - Previously, some versions of gcc would complain that the same + pointer, one_r, is being passed in for both alpha and beta in the + fourth call to the real gemm ukernel in bli_gemmtrsm4m1_ref.c. 
This + is understandable since the compiler knows that the real gemm ukernel + qualifies all of its floating-point arguments (including alpha and + beta) with restrict. A small hack has been inserted into the file + that defines a new variable to store the value 1.0, which is now used + in lieu of one_r for beta in the fourth call to the real gemm ukernel, + which should pacify the compiler now. Thanks to Dave Love for + reporting this issue (#328) and to Devin Matthews for offering his + 'restrict' expertise. + +commit e8c6281f139bdfc9bd68c3b36e5e89059b0ead2e +Author: Field G. Van Zee +Date: Wed Aug 21 12:38:53 2019 -0500 + + Add -march support for specific gcc version ranges. + + Details: + - Added logic to configure that checks the version of the compiler + against known version ranges that could cause problems later in the + build process. For example, versions of gcc older than 4.9.0 use + different -march labels than version 4.9.0 or later + ('-march=corei7-avx' vs '-march=sandybridge', respectively). + Similarly, before 6.1, compilation on Zen was possible, but you + need to start with -march=bdver4 and then disable instruction sets + that were discarded during the transition from Excavator to Zen. So + now, configure substitutes 'yes'/'no' values into anchors in + config.mk.in, which sets various make variables (e.g. GCC_OT_4_9_0), + which can be accessed and branched upon by the various + configurations' make_defs.mk files when setting their compiler flags. + - Updated config/haswell/make_defs.mk to branch on GCC_OT_4_9_0. + - Updated config/sandybridge/make_defs.mk to branch on GCC_OT_4_9_0. + - Updated config/zen/make_defs.mk to branch on GCC_OT_6_1_0. + +commit e6ac4ebcb6e6a372820e7f509c0af3342966b84a +Author: Field G. Van Zee +Date: Tue Aug 20 13:49:47 2019 -0500 + + Added page size, source location to perf docs.
+ + Details: + - Added the page size, as returned via 'getconf -a | grep PAGE_SIZE', + and the location of the performance drivers to docs/Performance.md + (test/3) and docs/PerformanceSmall.md (test/sup). Thanks to Dave + Love for suggesting these additions in #325. + +commit fdce1a5648d69034fab39943100289323011c36f +Author: Meghana +Date: Wed Jul 24 15:04:41 2019 +0530 + + changed gcc version check condition from 'ifeq' to 'if greater or equal' + + Change-Id: Ie4c461867829bcc113210791bbefb9517e52c226 + +commit c9486e0c4f82cd9f58f5ceb71c0df039e9970a20 +Author: Meghana +Date: Wed Jul 24 09:45:17 2019 +0530 + + code to detect version of gcc and set flags accordingly for zen2 + + Change-Id: I29b0311d0000dee1a2533ee29941acf53f9e9f34 + +commit 54afe3dfe6828a1aff65baabbf14c98d92e50692 +Author: Field G. Van Zee +Date: Tue Jul 23 16:54:28 2019 -0500 + + Added "Education and Learning" ToC entry to README. + +commit 9f53b1ce7ac702e84e71801fe96986f6aa16040e +Author: Field G. Van Zee +Date: Tue Jul 23 16:50:35 2019 -0500 + + Added "Education and Learning" section to README. + + Details: + - Added a short section after the Intro of the README.md file titled + "Education and Learning" that directs interested readers to the + "LAFF-On Programming for High-Performance" massive open online course + (MOOC) hosted via edX. + +commit deda4ca8a094ee18d7c7c45e040e8ef180f33a48 +Author: Field G. Van Zee +Date: Mon Jul 22 13:59:05 2019 -0500 + + Added test/1m4m driver directory. + + Details: + - Added a new standalone test driver directory named '1m4m' that can + build and run performance experiments for BLIS 1m, 4m1a, assembly, + OpenBLAS, and the vendor library (MKL). This new driver directory + was used to regenerate performance results for the 1m paper. + - Added alternate (commented-out) cache blocksizes to + config/haswell/bli_cntx_init_haswell.c. These blocksizes tend to + work well on a 12-core Intel Xeon E5-2650 v3.
+ +commit dcc0ce12fde4c6dca2b4764a1922a2ab19725867 +Author: Meghana +Date: Mon Jul 22 17:12:01 2019 +0530 + + Added a global Makefile for AMD architectures in config/zen folder + This Makefile(amd_config.mk) has all the flags that are common to EPYC series + + Change-Id: Ic02c60a8293ccdd37f0f292e631acd198e6895de + +commit af17bca26a8bd3dcbee8ca81c18d7b25de09c483 +Author: Field G. Van Zee +Date: Fri Jul 19 14:46:23 2019 -0500 + + Updated haswell MC cache blocksizes. + + Details: + - Updated the default MC cache blocksizes used by the haswell subconfig + for both row-preferential (the default) and column-preferential + microkernels. + +commit b5e9bce4dde5bf014dd9771ae741048e1f6c7748 +Author: Field G. Van Zee +Date: Fri Jul 19 14:42:37 2019 -0500 + + Updated -march flags for sandybridge, haswell. + + Details: + - Updated the '-march=corei7-avx' flag in the sandybridge subconfig + to '-march=sandybridge' and the '-march=core-avx2' flag in the + haswell subconfig to '-march=haswell'. The older flags were used + by older versions of gcc and should have been updated to the newer + forms a long time ago. (The older flags were clearly working, even + though they are no longer documented in the gcc man page.) + +commit c22b9dba5859a9fc94c8431eccc9e4eb9be02be1 +Author: Field G. Van Zee +Date: Tue Jul 16 13:14:47 2019 -0500 + + More updates to comments in testsuite modules. + + Details: + - Updated most comments in testsuite modules that describe how the + correctness test is performed so that it is clear whether the vector + (normfv) or matrix (normfm) form of Frobenius norm is used. + +commit c4cc6fa702f444a05963db01db51bc7d6669e979 +Author: Field G. Van Zee +Date: Tue Jul 16 13:00:35 2019 -0500 + + New cntx_t blksz "set" functions + misc tweaks. + + Details: + - Defined two new static functions in bli_cntx.h: + bli_cntx_set_blksz_def_dt() + bli_cntx_set_blksz_max_dt() + which developers may find convenient when experimenting with different + values of cache blocksizes. 
+ - Updated one- and two-socket multithreaded problem size range and + increment values in test/3/Makefile. + - Changed default to column storage in test/3/test_gemm.c. + - Fixed typo in comment in testsuite/src/test_subm.c. + +commit b84cee29f42855dc1f263e42b83b1a46ac8def87 +Merge: 1f80858a c7dd6e6c +Author: Meghana Vankadari +Date: Mon Jul 8 02:03:07 2019 -0400 + + Merge "Added compiler flags for vanilla clang" into amd-staging-rome2.0 + +commit 1f80858abf5ca220b2998fbe6f9b06c32d3864c3 +Author: kdevraje +Date: Fri Jul 5 16:05:11 2019 +0530 + + This checkin solves the dgemm performance issue jira ticket CPUPL 458, as #else was missed during integration, it was always following else path to get the block sizes + + Change-Id: I0084b5856c2513ab1066c08c15b5086db6532717 + +commit c7dd6e6cd2f910cbefcdc1e04a5adeb919a23de0 +Author: Meghana +Date: Thu Jul 4 09:32:51 2019 +0530 + + Added compiler flags for vanilla clang + + Change-Id: I13c00b4c0d65bbda4c929848fd48b0ab611952ab + +commit 2acd49b76457635625a01e31c2abc8902b23cf51 +Author: Meghana +Date: Mon Jul 1 15:42:38 2019 +0530 + + fix for test failures using AOCC 2.0 + + Change-Id: If44eaccc64bbe96bbbe1d32279b1b5773aba08d1 + +commit ceee2f973ebe115beca55ca77f9e3ce36b14c28a +Author: Field G. Van Zee +Date: Mon Jun 24 17:47:40 2019 -0500 + + Fixed thrinfo_t printing bug for small problems. + + Details: + - Fixed a bug in bli_l3_thrinfo_print_gemm_paths() and + bli_l3_thrinfo_print_trsm_paths(), defined in bli_l3_thrinfo.c, + whereby subnodes of the thrinfo_t tree are "dereferenced" near the + beginning of the functions, which may lead to segfaults in certain + situations where the thread tree was not fully formed because the + matrix problem was too small for the level of parallelism specified. + (That is, too small because some problems were assigned no work due + to the smallest units in the m and n dimensions being defined by the + register blocksizes mr and nr.) 
The fix requires several nested levels + of if statements, and this is one of those few instances where use of + goto statements results in (mostly) prettier code, especially in the + case of _gemm_paths(). And while it wasn't necessary, I ported this + goto usage to the loop body that prints the thrinfo_t work_id and + comm_id values for each thread. Thanks to Nicholai Tukanov for helping + to find this bug. + +commit cac127182dd88ed0394ad81e6b91b897198e168a +Merge: 565fa385 3a45ecb1 +Author: kdevraje +Date: Mon Jun 24 13:01:27 2019 +0530 + + Merge branch 'amd-staging-rome2.0' of ssh://git.amd.com:29418/cpulibraries/er/blis + with public repo commit id 565fa3853b381051ac92cff764625909d105644d. + + Change-Id: I68b9824b110cf14df248217a24a6191b3df79d42 + +commit c152109e9a3b1cd74760e8a3215a676d25c18d2e +Author: Field G. Van Zee +Date: Wed Jun 19 13:23:24 2019 -0500 + + Updated BLASFEO results in PerformanceSmall.md. + + Details: + - Updated the BLASFEO performance graphs shown in PerformanceSmall.md + using a new commit of BLASFEO (2c9f312); updated PerformanceSmall.md + accordingly. + - Updated test/sup/octave/plot_l3sup_perf.m so that the .m files + containing the mpnpkp results do not need to be preprocessed in order + to plot half the problem size range (ie: up to 400 instead of the + 800 range of the other shape cases). + - Trivial updates to runme.m. + +commit 4d19c98110691d33ecef09d7e1b97bd1ccf4c420 +Author: Field G. Van Zee +Date: Sat Jun 8 11:02:03 2019 -0500 + + Trivial change to MixedDatatypes.md link text. + +commit 24965beabe83e19acf62008366097a7f198d4841 +Author: Field G. Van Zee +Date: Sat Jun 8 11:00:22 2019 -0500 + + Fixed typo in README.md's MixedDatatypes.md link. + +commit 50dc5d95760f41c5117c46f754245edc642b2179 +Author: Field G. Van Zee +Date: Fri Jun 7 13:10:16 2019 -0500 + + Adjust -fopenmp-simd for icc's preferred syntax. + + Details: + - Use -qopenmp-simd instead of -fopenmp-simd when compiling with Intel + icc. 
Recall that this option is used for SIMD auto-vectorization in + reference kernels only. Support for the -f option has been completely + deprecated and removed in newer versions of icc in favor of -q. Thanks + to Victor Eijkhout for reporting this issue and suggesting the fix. + +commit ad937db9507786874c801b41a4992aef42d924a1 +Author: Field G. Van Zee +Date: Fri Jun 7 11:34:08 2019 -0500 + + Added missing #include "bli_family_thunderx2.h". + + Details: + - Added a cpp-conditional directive block to bli_arch_config.h that + #includes "bli_family_thunderx2.h". The code has been missing since + adf5c17f. However, this never manifested as an error because the file + is virtually empty and not needed for thunderx2 (or most subconfigs). + Thanks to Jeff Diamond for helping to spot this. + +commit ce671917b2bc24895289247feef46f6fdd5020e7 +Author: Field G. Van Zee +Date: Thu Jun 6 14:17:21 2019 -0500 + + Fixed formatting/typo in docs/PerformanceSmall.md. + +commit 86c33a4eb284e2cf3282a1809be377785cdb3703 +Author: Field G. Van Zee +Date: Wed Jun 5 11:43:55 2019 -0500 + + Tweaked language in README.md related to sup/AMD. + +commit cbaa22e1ca368d36a8510f2b4ecd6f1523d1e1f3 +Author: Field G. Van Zee +Date: Tue Jun 4 16:06:58 2019 -0500 + + Added BLASFEO results to docs/PerformanceSmall.md. + + Details: + - Updated the graphs linked in PerformanceSmall.md with BLASFEO results, + and added documenting language accordingly. + - Updated scripts in test/sup/octave to plot BLASFEO data. + - Minor tweak to language re: how OpenBLAS was configured for + docs/Performance.md. + +commit 763fa39c3088c0e2c0155675a3ca868a58bffb30 +Author: Field G. Van Zee +Date: Tue Jun 4 14:46:45 2019 -0500 + + Minor tweaks to test/sup. + + Details: + - Changed starting problem and increment from 16 to 4. + - Added 'lll' (square problems) to list of problem size shapes to + compile and run with. + - Define BLASFEO location and added BLASFEO-related definitions. 
+ +commit 5e1e696003c9151b1879b910a1957b7bdd7b0deb +Author: Field G. Van Zee +Date: Mon Jun 3 18:37:20 2019 -0500 + + CHANGELOG update (0.6.0) + +commit 18c876b989fd0dcaa27becd14e4f16bdac7e89b3 (tag: 0.6.0) Author: Field G. Van Zee Date: Mon Jun 3 18:37:19 2019 -0500 Version file update (0.6.0) -commit 0f1b3bf49eb593ca7bb08b68a7209f7cd550f912 (origin/master, origin/HEAD) +commit 0f1b3bf49eb593ca7bb08b68a7209f7cd550f912 Author: Field G. Van Zee Date: Mon Jun 3 18:35:19 2019 -0500 @@ -50,7 +1820,7 @@ Date: Fri May 31 17:42:40 2019 -0500 Merge branch 'amd' -commit a4e8801d08d81fa42ebea6a05a990de8dcedc803 (origin/amd, amd) +commit a4e8801d08d81fa42ebea6a05a990de8dcedc803 Author: Field G. Van Zee Date: Fri May 31 17:30:51 2019 -0500 @@ -63,6 +1833,29 @@ Date: Fri May 31 17:30:51 2019 -0500 - Updated octave scripts in test/sup/octave to include a seventh column to display performance for m = n = k. +commit 3a45ecb15456249c30ccccd60e42152f355615c1 +Merge: 3f867c96 b69fb0b7 +Author: Kiran Devrajegowda +Date: Fri May 31 06:47:02 2019 -0400 + + Merge "Added back BLIS_ENABLE_ZEN_BLOCK_SIZES macro to zen configuration, this is same as release 1.3. This was added before to improve DGEMM Multithreaded scalability on Naples for when number of threads is greater than 16. By mistake this got deleted in many changes done for 2.0 release, now we are adding this change back., in bli_gemm_front.c - code cleanup" into amd-staging-rome2.0 + +commit b69fb0b74a4756168de270fc9b18f7cf7aa57f17 +Author: Kiran Varaganti +Date: Fri May 31 15:14:22 2019 +0530 + + Added back BLIS_ENABLE_ZEN_BLOCK_SIZES macro to zen configuration, this is same as release 1.3. This was added before to improve DGEMM Multithreaded scalability on Naples for when number of threads is greater than 16. 
By mistake this got deleted in many changes done for 2.0 release, now we are adding this change back., in bli_gemm_front.c - code cleanup + + Change-Id: I9f5d8225254676a99c6f2b09a0825e545206d0fc + +commit 3f867c96caea3bbbbeeff1995d90f6cf8c9895fb +Author: kdevraje +Date: Fri May 31 12:22:44 2019 +0530 + + When running HPL with pure MPI without DGEMM Threading (Single Threaded BLIS ), making this macro 1 gives best performance. + + Change-Id: I24fd0bf99216f315e49f1c74c44c3feaffd7078d + commit abd8a9fa7df4569aa2711964c19888b8e248901f (origin/pfhp) Author: Field G. Van Zee Date: Tue May 28 12:49:44 2019 -0500 @@ -92,6 +1885,31 @@ Date: Tue May 28 12:49:44 2019 -0500 - Fixed typo in entry for --export-shared flag in 'configure --help' text. +commit 13806ba3b01ca0dd341f4720fb930f97e46710b0 +Author: kdevraje +Date: Mon May 27 16:24:43 2019 +0530 + + This check in has changes w.r.t Copyright information, which is changed to (start year) - 2019 + + Change-Id: Ide3c8f7172210b8d3538d3c36e88634ab1ba9041 + +commit ee123f535872510f77100d3d55a43d4ca56047d5 +Author: Meghana +Date: Mon May 27 15:36:44 2019 +0530 + + Defined small matrix thresholds for TRSM for various cases for NAPLES and ROME + Updated copyright information for kernels/zen/bli_trsm_small.c file + Removed separate kernels for zen2 architecture + Instead added threshold conditions in zen kernels both for ROME and NAPLES + + Change-Id: Ifd715731741d649b6ad16b123a86dbd6665d97e5 + +commit 9d93a4caa21402d3a90aac45d7a1603736c9fd63 +Author: prangana +Date: Fri May 24 17:59:13 2019 +0530 + + update version 2.0 + commit 755730608d923538273a90c48bfdf77571f86519 Author: Field G. Van Zee Date: Thu May 23 17:34:36 2019 -0500 @@ -139,6 +1957,117 @@ Date: Thu May 23 12:51:17 2019 -0500 - Whitespace and inconsequential quoting change to configure. - Moved top-level 'windows' directory into a new 'attic' directory.
+commit e05171118c377f356f89c4daf8a0d5ddc5a4e4f7 +Author: Meghana +Date: Thu May 23 16:15:27 2019 +0530 + + Implemented TRSM for small matrices for cases where A is on the right + + Added separate kernels for zen and zen2 + + Change-Id: I6318ddc250cf82516c1aa4732718a35eae0c9134 + +commit 02920f5c480c42706b487e37b5ecc96c3555b851 +Author: kdevraje +Date: Thu May 23 15:29:59 2019 +0530 + + make checkblis fails for matrix dimension check at the beginning hence reverting it + + Change-Id: Ibd2ee8c2d4914598b72003fbfc5845be9c9c1e87 + +commit 84215022f29fb3bfedd254d041635308d177e6c0 +Author: kdevraje +Date: Thu May 23 11:08:41 2019 +0530 + + Adding threshold condition to dgemm small matrix kernels, defining the constants in zen2 configuration + + Change-Id: I53a58b5d734925a6fcb8d8bea5a02ddb8971fcd5 + +commit a3554eb1dcc1b5b94d81c60761b2f01c3d827ffa +Merge: ea082f83 17b878b6 +Author: kdevraje +Date: Thu May 23 11:51:07 2019 +0530 + + Merge branch 'amd-staging-rome2.0' of ssh://git.amd.com:29418/cpulibraries/er/blis to configure zen2 + + Change-Id: I97e17bca9716b80b862925f97bb513c07b4b0cae + +commit ea082f839071dd9ec555062dc3851c31d12f00e4 +Author: kdevraje +Date: Thu May 23 10:38:29 2019 +0530 + + adding empty zen2 directory with .gitignore file + + Change-Id: Ifa37cf54b2578aa19ad335372b44bca17043fe4b + +commit b80bd5bcb2be8551a9a21fafc8e6c8b6336c99b5 +Author: Kiran Varaganti +Date: Tue May 21 15:11:47 2019 +0530 + + config/zen/bli_cntx_init_zen.c: removed BLIS_ENBLE_ZEN_BLOCK_SIZES macro. We have different configurations for both zen and zen2 + config/zen/bli_family_zen.h: deleted macro BLIS_ENBLE_ZEN_BLOCK_SIZES + config/zen/make_defs.mk: removed compiler flag -mno-avx256-split-unaligned-store + frame/base/bli_cpuid.c: ROME family is 17H but model # is from 0x30H.
+ test/test_gemm.c - commented out #define FILE_IN_OUT (some compilation error when BLIS is configured as amd64) + Now we can use single configuration has ./configure amd64 - this will work both for ROME & Naples + + Change-Id: I91b4fc35380f8a35b4f4c345da040c6b5910b4a2 + +commit a042db011df9a1c3e7c7ac546541f4746b176ea5 +Author: Kiran Varaganti +Date: Mon May 20 14:17:32 2019 +0530 + + Modified make_defs.mk for zen2 to get compiled by gcc version less than gcc9.0 + + Change-Id: I8fcac30538ee39534c296932639053b47b9a2d43 + +commit a23f92594cf3d530e5794307fe97afc877d853b7 +Author: Kiran Varaganti +Date: Mon May 20 10:48:06 2019 +0530 + + config_registry: New AMD zen2 architecture configuration added. + frame/base/bli_arch.c: #ifdef BLIS_FAMILY_ZEN2 id = BLIS_ARCH_ZEN2; #endif added. zen2 is added in config_name[BLIS_NUM_ARCHS] + frame/base/bli_cpuid.c : #ifdef BLIS_CONFIG_ZEN2 if ( bli_cpuid_is_zen2( family, model, features ) ) return BLIS_ARCH_ZEN2; #endif, defined new function bool bli_cpuid_is_zen2(...). + frame/base/bli_cpuid.h : declared bli_cpuid_is_zen2(..). + frame/base/bli_gks.c : #ifdef BLIS_CONFIG_ZEN2 bli_gks_register_cntx(BLIS_ARCH_ZEN2, bli_cntx_init_zen2, bli_cntx_init_zen2_ref, bli_cntx_init_zen2_ind); #endif + frame/include/bli_arch_config.h : #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS(zen2) #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" #endif + frame/include/bli_type_defs.h : added BLIS_ARCH_ZEN2 in arch_t enum. 
BLIS_NUM_ARCHS 20 + + Change-Id: I2a2d9b7266673e78a4f8543b1bfb5425b0aa7866 + +commit 17b878b66d917d50b6fe23721d8579e826cb3e8c +Author: kdevraje +Date: Wed May 22 14:02:53 2019 +0530 + + adding license same as in ut-austin-amd-branch + + Change-Id: I6790768d2bf5d42369d304ef93e34701f95fbaff + +commit df755848b8a271323e007c7a628c64af63deab00 +Merge: ca4b33c0 c72ae27a +Author: kdevraje +Date: Wed May 22 13:30:07 2019 +0530 + + Merge branch 'amd-staging-rome2.0' of ssh://git.amd.com:29418/cpulibraries/er/blis into rome2.0 + + Change-Id: Ie8aad1ab810f0f3c0b90ec67f9dd3dfb8dcc74cc + +commit c72ae27adee4726679ee004d02c972582b5285b4 +Author: Nisanth M P +Date: Mon Mar 19 12:49:26 2018 +0530 + + Re-enabling the small matrix gemm optimization for target zen + + Change-Id: I13872784586984634d728cd99a00f71c3f904395 + +commit ab0818af80f7f683080873f3fa24734b65267df2 +Author: sraut +Date: Wed Oct 3 15:30:33 2018 +0530 + + Review comments incorporated for small TRSM. + + Change-Id: Ia64b7b2c0375cc501c2cb0be8a1af93111808cd9 + commit 32392cfc72af7f42da817a129748349fb1951346 Author: Jeff Hammond Date: Tue May 14 15:52:30 2019 -0400 @@ -333,6 +2262,14 @@ Date: Sat Apr 27 22:56:02 2019 +0000 make unix friendly archives on appveyor (#310) +commit ca4b33c001f9e959c43b95a9a23f9df5adec7adf +Author: Kiran Varaganti +Date: Wed Apr 24 15:02:39 2019 +0530 + + Added compiler option (-mno-avx256-split-unaligned-store) in the file config/zen/make_defs.mk to improve performance of intrinsic codes, this flag ensures compiler generates 256-bit stores for the equivalent intrinsics code. + + Change-Id: I8f8cd81a3604869df18d38bc42097a04f178d324 + commit 945928c650051c04d6900c7f4e9e29cd0e5b299f Merge: 663f6629 74e513eb Author: Field G. Van Zee @@ -340,7 +2277,7 @@ Date: Wed Apr 17 15:58:56 2019 -0500 Merge branch 'amd' of github.com:flame/blis into amd -commit 74e513eb6a6787a925d43cd1500277d54d86ab8f (origin/dev) +commit 74e513eb6a6787a925d43cd1500277d54d86ab8f Author: Field G. 
Van Zee Date: Wed Apr 17 13:34:44 2019 -0500 @@ -417,6 +2354,14 @@ Date: Thu Apr 11 18:33:08 2019 -0500 32812ff. - CREDITS file update. +commit 9d76688ad90014a11ddc0c2f27253d62806216b1 +Author: kdevraje +Date: Thu Apr 11 10:22:48 2019 +0530 + + Fix for single rank crash with HPL application. When computing offset of C buffer, as integer variables are used for a row and column index, the intermediate result value overflows and a negative value gets added to the buffer, when the negative value is too large it would index the buffer out of the range resulting in segmentation fault. Although the crash is a result of dgemm kernel, added similar code in sgemm kernel also. + + Change-Id: I171119b0ec0dfbd8e63f1fcd6609a94384aabd27 + commit 32812ff5aba05d34c421fe1024a61f3e2d5e7052 Author: Field G. Van Zee Date: Tue Apr 9 12:20:19 2019 -0500 @@ -520,7 +2465,7 @@ Date: Wed Mar 27 17:58:19 2019 -0500 Merge branch 'dev' -commit 2c85e1dd9d5d84da7228ea4ae6deec56a89b3a8f (dev) +commit 2c85e1dd9d5d84da7228ea4ae6deec56a89b3a8f Author: Field G. Van Zee Date: Wed Mar 27 16:29:51 2019 -0500 @@ -634,6 +2579,22 @@ Date: Mon Mar 25 13:03:44 2019 -0500 clang 3.8 can't build knl as it doesn't recognize zmm0 +commit 53842c7e7d530cb2d5609d6d124ae350fc345c32 +Author: Kiran Varaganti +Date: Fri Mar 22 13:57:14 2019 +0530 + + Removed printing alpha and beta values + + Change-Id: I49102db510311a30f6a936f9d843f35838f50d23 + +commit 6805db45e343d83d1adaf9157cf0b841653e9ede +Author: Kiran Varaganti +Date: Fri Mar 22 12:55:35 2019 +0530 + + Corrected setting alpha & beta values- alpha = -1 and beta = 1 - bli_setc(-1.0, 0, &alpha) should be used rather than bli_setc(0.0, -1.0, &alpha). This corrected now + + Change-Id: Ic1102dfd6b50ccf212386a1211c6f31e8d987ef9 + commit feefcab4427a75b0b55af215486b85abcda314f7 Author: Field G. 
Van Zee Date: Thu Mar 21 18:11:20 2019 -0500 @@ -652,6 +2613,23 @@ Date: Thu Mar 21 18:11:20 2019 -0500 - Updated docs/BuildSystem.md to document the feature above, and related text. +commit 20153cd4b594bc34f860c381ec18de3a6cc743c7 +Author: Kiran Varaganti +Date: Thu Mar 21 16:23:53 2019 +0530 + + Modified test_gemm.c file in test folder + A Macro 'FILE_IN_OUT" is defined to read input parameters from a csv file. + Format for input file: + Each line defines a gemm problem with following parameters: m k n cs_a cs_b cs_c + The operation always implemented is C = C - A*B and column-major format. + When macro is disabled - it reverts back to original implementation. + Usage: ./test_gemm_.x input.csv output.csv + GEMM is called through BLAS interface + For BLIS - the test application also prints either 'S' indicating small gemm routine or 'N' - conventional BLIS gemm + for MKL/OpenBLAS - ignore this character + + Change-Id: I0924ef2c1f7bdea48d4cdb230b888e2af2c86a36 + commit 288843b06d91e1b4fade337959aef773090bd1c9 Author: Field G. Van Zee Date: Wed Mar 20 17:52:23 2019 -0500 @@ -780,6 +2758,14 @@ Date: Mon Mar 18 13:22:55 2019 -0500 user wishes to use the local runtime API (specify multithreading on a per-call basis), one of the native BLIS APIs must be used. +commit 3a929a3d0ba0353159a6d4cd188f01b7a390ccfc +Author: Kiran Varaganti +Date: Mon Mar 18 10:51:41 2019 +0530 + + Fixed code merging: bli_gemm_small.c - missed conditional checks for L!=0 && K!=0. Now they are added. This fix is done to pass blastest + + Change-Id: Idc9c9a04d2015a68a19553c437ecaf8f1584026c + commit 663f662932c3f182fefc3c77daa1bf8c3394bb8b Merge: 938c05ef 6bfe3812 Author: Field G. Van Zee @@ -936,6 +2922,14 @@ Date: Thu Mar 7 01:04:05 2019 +0000 - Very minor updates to the newly revamped test/3m4m drivers when used on a Xeon Platinum (SkylakeX). 
+commit 7fe44748383071f1cbbc77d904f4ae5538e13065 +Author: Kiran Varaganti +Date: Wed Mar 6 16:23:31 2019 +0530 + + Disabled BLIS_ENABLE_ZEN_BLOCK_SIZES in bli_family_zen.h for ROME tuning + + Change-Id: Iec47fcf51f4d4396afef1ce3958e58cf02c59a57 + commit 9f1dbe572b1fd5e7dd30d5649bdf59259ad770d5 Author: Field G. Van Zee Date: Tue Mar 5 17:47:55 2019 -0600 @@ -959,6 +2953,15 @@ Date: Tue Mar 5 17:47:55 2019 -0600 test drivers, and runme.sh script, and renamed 'plot_all.m' to 'runme.m'. +commit f5ed95ecd7d5eb4a63e1333ad5cc6765fc8df9fe +Author: Kiran Varaganti +Date: Tue Mar 5 15:01:57 2019 +0530 + + Merged BLIS Release 1.3 + Modified config/zen/make_defs.mk, now CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 + + Change-Id: Ia0942d285a21447cd0c470de1bc021fe63e80d81 + commit 3bdab823fa93342895bf45d812439324a37db77c Merge: 70f12f20 e2a02ebd Author: Field G. Van Zee @@ -1032,6 +3035,41 @@ Date: Fri Feb 22 16:55:30 2019 -0600 pasting function invocations into matlab to generate plots that are presently of interest to us. +commit b06244d98cc468346eb1a8eb931bc05f35ff280c +Merge: e938ff08 4c7e6680 +Author: praveeng +Date: Thu Feb 21 12:56:15 2019 +0530 + + Merge branch 'ut-austin-amd' of ssh://git.amd.com:29418/cpulibraries/er/blis into ut-austin-amd + +commit e938ff08cea3d108c84524eb129d9e89d701ea90 +Author: praveeng +Date: Thu Feb 21 12:44:38 2019 +0530 + + deleted test.txt + + Change-Id: I3871f5fe76e548bc29ec2733745b29964e829dd3 + +commit ed13ad465dcba350ad3d5e16c9cc7542e33f3760 +Author: mkv +Date: Thu Feb 21 01:04:16 2019 -0500 + + added test file for initial commit + +commit 4c7e6680832b497468cf50c2399e3ac4de0e3450 +Author: praveeng +Date: Thu Feb 21 12:44:38 2019 +0530 + + deleted test.txt + + Change-Id: I3871f5fe76e548bc29ec2733745b29964e829dd3 + +commit 95e070581c54ed2edc211874faec56055ea298c8 +Author: mkv +Date: Thu Feb 21 01:04:16 2019 -0500 + + added test file for initial commit + commit 70f12f209bc1901b5205902503707134cf2991a0 Author: Field G. 
Van Zee Date: Wed Feb 20 16:10:10 2019 -0600 @@ -1531,6 +3569,22 @@ Date: Mon Jan 7 12:12:47 2019 -0600 - Updated docs/BuildSystem.md to be explicit about current python2 vs python3 version requirements. +commit cdbf16aa93234e0d6a80f0d0e385ec81e7b75465 +Author: prangana +Date: Fri Jan 4 15:59:21 2019 +0530 + + Update version 1.3 + + Change-Id: I32a7d24af860e87a60396614075236afb65a28a9 + +commit cf9c1150515b8e9cc4f12e0d4787b3471b12ba4a +Author: kdevraje +Date: Thu Jan 3 09:51:46 2019 +0530 + + This commit adds a macro, which is to be enabled when BLIS is working on single instance mode + + Change-Id: I7f3fd654b78e64c4e6e24e9f0e245b1a30c492b0 + commit ad8d9adb09a7dd267bbdeb2bd1fbbf9daf64ee76 Author: Field G. Van Zee Date: Thu Jan 3 16:08:24 2019 -0600 @@ -1685,6 +3739,38 @@ Date: Thu Dec 20 16:27:26 2018 -0600 Merge branch 'master' into amd +commit 1f4eeee5175a8fc9ac312847c796ce6db5fe75b9 +Author: sraut +Date: Wed Dec 19 21:21:10 2018 +0530 + + Fixed BLAS test failures of small matrix SYRK for single and double precision. + + Details: + - SYRK for small matrix was implemented by reusing small GEMM routine. This was + resulting in output written to the full C matrix, and C being symmetric the + lower and upper triangles of C matrix contained same results. BLAS SYRK API + spec demands either lower or upper triangle of C matrix to be written with + results. So, this was resulting in BLAS test failures, even though testsuite + of BLIS was passing small SYRK operation. + - To fix BLAS test failures of small matrix SYRK, separate kernel routines are + implemented for small SYRK for both single and double precision. The newly + added small SYRK routines are in file kernels/zen/3/bli_syrk_small.c. + Now the intermediate results of matrix C are written to a scratch buffer. + Final results are written from scratch buffer to matrix C using SIMD + copy to either lower or upper traingle part of matrix C. 
+ - Source and header files frame/3/syrk/bli_syrk_front.c and + frame/3/syrk/bli_syrk_front.h are changed to invoke new small SYRK routines. + + Change-Id: I9cfb1116c93d150aefac673fca033952ecac97cb + +commit 6d267375c3a0543f20604d74cc678ad91db3b6f1 +Author: sraut +Date: Wed Dec 19 14:22:21 2018 +0530 + + This commit improves the performance of multi-instance DGEMM when these multiple threads are binded to a CCX. + Multi-Instance: Each thread runs a sequential DGEMM. + Change-Id: I306920c8061b6dad61efac1dae68727f4ac27df6 + commit 0476f706b93e83f6b74a3d7b7e6e9cc9a1a52c3b Author: Field G. Van Zee Date: Tue Dec 18 14:56:20 2018 -0600 @@ -1717,6 +3803,16 @@ Date: Tue Dec 18 14:52:40 2018 -0600 into Debian package universe. Thanks to M. Zhou for sponsoring BLIS in Debian. +commit 7bf901e9265a1acd78e44c06f7178c8152c7e267 +Author: sraut +Date: Tue Dec 18 14:39:16 2018 +0530 + + Fix on EPYC machine for multi instance performance issue, + Issue: For the default values of mc, kc and nc with multi instance mode the performance across the cores dip drastically. + Fix: After experimentation found different set of values (mc, kc and nc) which fits in the cache size, and performance across the remains same across all the cores. + + Change-Id: I98265e3b7e61cd7602a0cc5596240e86c08c03fe + commit d2b2a0819a2fccad9165bc48c0e172d79a87542c Author: Field G. Van Zee Date: Mon Dec 17 19:26:35 2018 -0600 @@ -2419,7 +4515,7 @@ Date: Fri Oct 26 17:07:15 2018 -0500 output. - Very minor edits to docs/MixedDatatypes.md. -commit e90e7f309b3f2760a01e8e09a29bf702754fa2b5 (origin/win-pthreads, win-pthreads) +commit e90e7f309b3f2760a01e8e09a29bf702754fa2b5 (origin/win-pthreads) Author: Field G. Van Zee Date: Thu Oct 25 14:09:43 2018 -0500 @@ -2529,6 +4625,14 @@ Date: Tue Oct 23 19:16:54 2018 -0500 - Removed temporary play-test code for shiftd that accidentally got committed into test/3m4m/test_gemm.c. 
+commit 0ae9585da1e3db1cf8034d4b16305a5883beb0d3 +Author: pradeeptrgit +Date: Tue Oct 23 09:36:23 2018 +0530 + + Update version number to 1.2 + + Change-Id: Ibb31f6683cdecca6b218bc2f0c14701d7e92ebf3 + commit eac7d267a017d646a2c5b4fa565f4637ebfd9da7 Author: Field G. Van Zee Date: Mon Oct 22 18:10:59 2018 -0500 @@ -3004,6 +5108,14 @@ Date: Thu Oct 11 10:45:07 2018 -0500 Detect when OpenMP uses fewer threads than requested and correct accordingly, so that we don't wait forever for nonexistent threads. Fixes #267. +commit 78a6935483409ae277c766406e175772e820b1de +Author: sraut +Date: Thu Oct 11 10:49:40 2018 +0530 + + Added comments for the change in syrk small matrix change. + + Change-Id: I958939e9953323730da49ef07d1b10e578837d82 + commit 53a9ab1c85be14dcfd2560f5b16e898e3e258797 Author: Field G. Van Zee Date: Wed Oct 10 15:11:09 2018 -0500 @@ -3144,6 +5256,17 @@ Date: Thu Oct 4 20:39:06 2018 -0500 bli_clock_min_diff() is called. Thanks to Kiran Varaganti for reporting this issue. +commit f0c3ef359f7c6c1687fb2671cb35deb346e00597 +Author: Kiran V +Date: Thu Oct 4 16:32:21 2018 +0530 + + This is a fix to floating-point exception error for BLIS SGEMM with larger matrix sizes. + BUG No: CPUPL-197 fixed by Thangaraj Santanu + The bli_clock_min_diff() function in BLIS assumed that if the time taken is greater than 1 hour then the reading must be wrong. However this is not the case in general, while the other checks such as time taken closer to zero or nsec is of course valid. + gerrit review: http://git.amd.com:8080/#/c/118694/1/frame/base/bli_clock.c + + Change-Id: I9dc313d7c5fdc20684f67a516bf3237de3e0694a + commit 8bf30eb4735872388b5317883d99b775a344ce25 Author: Devangi N. Parikh Date: Wed Oct 3 22:22:29 2018 -0400 @@ -3189,6 +5312,14 @@ Date: Wed Oct 3 13:57:25 2018 -0500 long to fit on a single line. - Changed some links from http to https.
+commit 80a8b3dd8034ec8bc03d31be3f9c837c3f6fc94b +Author: sraut +Date: Wed Oct 3 15:30:33 2018 +0530 + + Review comments incorporated for small TRSM. + + Change-Id: Ia64b7b2c0375cc501c2cb0be8a1af93111808cd9 + commit b8dfd82e0d1afda4ee5436662d63515a59b2dee3 Author: Devin Matthews Date: Tue Oct 2 15:37:12 2018 -0500 @@ -3273,6 +5404,22 @@ Date: Mon Oct 1 14:04:30 2018 -0500 Details: - Added language mentioning SHPC group to Introduction. +commit ee46fa3efb6e920fa6c3d0b0601007f5de31deb5 +Author: sraut +Date: Mon Oct 1 16:30:30 2018 +0530 + + Small TRSM optimization changes :- 1) single precision small trsm kernels for XAt=B case are further optimized for performance. 2) double precision small trsm kernels for AX=B and XAtB cases are implemented. 3) single precision small trsm kernels for AutX=B are implemented in intrinsics to improve the current performance. + + Change-Id: Ic9d67ae6d8522615257dde018903f049dcffa2cf + +commit 08045a6c52b6e025652c5b18eb120c0f4e61cf6f +Author: sraut +Date: Mon Oct 1 15:38:23 2018 +0530 + + Corrected the fix made for blastest level-3 failure to check m,n,k non-zero condition in bli_gemm_small.c + + Change-Id: Idaf9f2327c3127b04a2738ae8a058b83d6c57934 + commit ac18949a4b9613741b9ea8e5026d8083acef6fe4 Author: Field G. Van Zee Date: Sun Sep 30 18:54:56 2018 -0500 @@ -3337,6 +5484,23 @@ Date: Fri Sep 28 11:25:54 2018 -0500 no longer needed (issue #257). Thanks to M. Zhou and Nico Schlömer for their contributions. 
+commit 9814cfdf3157ef4726ee604fc895d56e8063d765 +Author: Meghana +Date: Fri Sep 28 11:02:39 2018 +0530 + + fixed blastest level-3 failure by adding ((M&N&K) != 0) to check condition in bli_gemm_small.c + + Change-Id: I85e4a32996ebb880f3c00bd293edc38f74700fe6 + +commit 86330953b14c180862deef3ccdcc6431259be27b +Merge: 7af5283d 807a6548 +Author: praveeng +Date: Fri Sep 28 10:08:06 2018 +0530 + + Resolved conflicts and modified bli_trsm_small.c + + Change-Id: I578d419cff658003e0fdd4c4cdc93145d951ce31 + commit 60b2650d7406d266feffe232c2d5692a9e3886d0 Author: Field G. Van Zee Date: Mon Sep 24 15:04:45 2018 -0500 @@ -5260,6 +7424,14 @@ Date: Mon Jun 11 12:32:54 2018 -0500 functions were the ones to inherit the 1r functionality. The kernels have now been renamed to use a _1er suffix. +commit 7af5283dcc3dded114852d6013d33134021b81aa +Author: sraut +Date: Mon Jun 11 15:00:22 2018 +0530 + + added check condition on n-dimension for XA'=B intrinsic code to process till 128 size + + Change-Id: I95d020a5ca3ea21d446b8c2e379d56e1eea18530 + commit 712de9b371a8727682352a2f52cd4880de905f0b Author: Field G. Van Zee Date: Sat Jun 9 14:36:30 2018 -0500 @@ -5434,6 +7606,22 @@ Date: Wed Jun 6 15:35:05 2018 -0500 comment (I used "sed //d" to remove the lines). This fixes the broken 'make checkblis-fast' (and 'make check') targets. +commit 695cd520e2f5eab938f66afe9fe36201ab2700c5 +Author: sraut +Date: Wed Jun 6 11:48:56 2018 +0530 + + AMD Copyright information changed to 2018 + + Change-Id: Idfd11afd5d252f8063d0158680d24bf7e2854469 + +commit df1dd24fd896821de60917b429f303bab7fd0d4b +Author: sraut +Date: Wed Jun 6 11:24:33 2018 +0530 + + small matrix trsm intrinsics optimization code for AX=B and XA'=B + + Change-Id: I90123c4d9adbd314c867995cd19dc975150b448c + commit 3f48c38164b4135515b5c752c506fdccc4480be2 Author: Field G. 
Van Zee Date: Tue Jun 5 16:52:35 2018 -0500 @@ -5474,6 +7662,22 @@ Date: Tue Jun 5 14:17:39 2018 +0200 Make bli_auxinfo_next_b() return b_next, not a_next (#216) +commit d4c24ea5f644eb635046e7fe249d3e8e58b4c98a +Author: sraut +Date: Tue Jun 5 15:42:59 2018 +0530 + + copyright message changed to 2018 + + Change-Id: I33c1ebda41bc7f1973ff19e3b1947bdad62b4d44 + +commit 3f1ba4e646776699ebfaa042fe24691d9e2f55d0 +Author: sraut +Date: Tue Jun 5 14:21:13 2018 +0530 + + copyright changed to 2018 + + Change-Id: Ie916c7cd6f95aedc3cab6eec3a703c9ddb333bc3 + commit bd02c4e9f7fe07487276e61507335d48c8e05f35 Author: Field G. Van Zee Date: Mon Jun 4 13:42:17 2018 -0500 @@ -7107,6 +9311,22 @@ Date: Tue Mar 20 13:54:58 2018 -0500 - Renamed some targets in the top-level Makefile to be consistent between BLAS and BLIS. +commit fc53ad6c5b2e39238b1bbbf625cc0c638b9da4e1 +Author: Nisanth M P +Date: Mon Mar 19 12:49:26 2018 +0530 + + Re-enabling the small matrix gemm optimization for target zen + + Change-Id: I13872784586984634d728cd99a00f71c3f904395 + +commit d12d34e167d7dc32732c0ed135f8065a55088106 +Author: Nisanth M P +Date: Mon Mar 19 11:34:32 2018 +0530 + + Re-enabling Zen optimized cache block sizes for config target zen + + Change-Id: I8191421b876755b31590323c66156d4a814575f1 + commit 40fa10396c0a3f9601cf49f6b6cd9922185c932e Author: Field G. Van Zee Date: Mon Mar 19 18:19:43 2018 -0500 @@ -7311,6 +9531,15 @@ Date: Sun Mar 11 16:59:50 2018 -0500 bli_dgemm_cortexa57_asm_6x8 -> bli_dgemm_armv8a_asm_6x8 Thanks to Jacob Gorm Hansen for reporting this issue. +commit 28bcea37dfcf0eb99a99da6f46de2a2830393d1d +Merge: b1ea3092 8b0475a8 +Author: praveeng +Date: Fri Mar 9 19:13:08 2018 +0530 + + Merge master code till 06_mar_2018 to amd-staging + + Change-Id: I12267e5999c92417e3715fef4f36ac2131d00f1a + commit 48da9f5805f0a49f6ad181ae2bf57b4fde8e1b0a Author: Field G. Van Zee Date: Wed Mar 7 12:54:06 2018 -0600 @@ -7382,6 +9611,14 @@ Date: Sat Mar 3 13:13:39 2018 -0600 kernels in kernels/knl/1m/. 
Thanks to Dave Love for reporting this issue. +commit b1ea30925dff751eced23dfa94ff578a20ea0b94 +Author: Field G. Van Zee +Date: Fri Feb 23 17:42:48 2018 -0600 + + CHANGELOG update (0.3.0) + + Change-Id: Id038b00a62de51c9818ad249651ec5dc662f4415 + commit 1ef9360b1fd0209fbeb5766f7a35402fbd080fcb Author: Field G. Van Zee Date: Thu Mar 1 14:36:39 2018 -0600 @@ -7434,18 +9671,18 @@ Date: Wed Feb 28 15:30:14 2018 -0600 bli_cgemm_zen_asm_3x8() and bli_zgemm_zen_asm_3x4(), in bli_cntx_init_zen.c. This was actually intended for 1681333. -commit d9079655c9cbb903c6761d79194a21b7c0a322bc -Author: Field G. Van Zee -Date: Fri Feb 23 17:42:48 2018 -0600 - - CHANGELOG update (0.3.0) - commit 709f8361ebc90b96b02ebe5c5ffb6fc3b1b25e58 (tag: 0.3.0) Author: Field G. Van Zee Date: Fri Feb 23 17:42:48 2018 -0600 Version file update (0.3.0) +commit d9079655c9cbb903c6761d79194a21b7c0a322bc +Author: Field G. Van Zee +Date: Fri Feb 23 17:42:48 2018 -0600 + + CHANGELOG update (0.3.0) + commit 3defc7265c12cf85e9de2d7a1f243c5e090a6f9d (origin/master, origin/HEAD) Author: Field G. Van Zee Date: Fri Feb 23 17:38:19 2018 -0600 diff --git a/CREDITS b/CREDITS index 6e47f8717..d86c093d9 100644 --- a/CREDITS +++ b/CREDITS @@ -15,9 +15,11 @@ but many others have contributed code and feedback, including Erling Andersen @erling-d-andersen Alex Arslan @ararslan Vernon Austel (IBM, T.J. 
Watson Research Center) + Satish Balay @balay (Argonne National Laboratory) Matthew Brett @matthew-brett (University of Birmingham) Jed Brown @jedbrown (Argonne National Laboratory) Robin Christ @robinchrist + Mat Cross @matcross (NAG) Kay Dewhurst @jkd2016 (Max Planck Institute, Halle, Germany) Jeff Diamond (Oracle) Johannes Dieterich @iotamudelta @@ -43,23 +45,27 @@ but many others have contributed code and feedback, including Tony Kelman @tkelman Lee Killough @leekillough (Cray) Mike Kistler @mkistler (IBM, Austin Research Laboratory) + Kyungmin Lee @kyungminlee (Ohio State University) Michael Lehn @michael-lehn - @ShmuelLevine + Shmuel Levine @ShmuelLevine Dave Love @loveshack Tze Meng Low (The University of Texas at Austin) Ye Luo @ye-luo (Argonne National Laboratory) Ricardo Magana @magania (Hewlett Packard Enterprise) Bryan Marker @bamarker (The University of Texas at Austin) + Simon Lukas Märtens @ACSimon33 (RWTH Aachen University) Devin Matthews @devinamatthews (The University of Texas at Austin) Stefanos Mavros @smavros + Bhaskar Nallani @BhaskarNallani (AMD) Nisanth Padinharepatt (AMD) + Ajay Panyala @ajaypanyala Devangi Parikh @dnparikh (The University of Texas at Austin) Elmar Peise @elmar-peise (RWTH-Aachen) Clément Pernet @ClementPernet Ilya Polkovnichenko Jack Poulson @poulson (Stanford) Mathieu Poumeyrol @kali - Christos Psarras @ChrisPsa (RWTH-Aachen) + Christos Psarras @ChrisPsa (RWTH Aachen University) @qnerd Michael Rader @mrader1248 Pradeep Rao @pradeeptrgit (AMD) @@ -73,7 +79,7 @@ but many others have contributed code and feedback, including Nathaniel Smith @njsmith Shaden Smith @ShadenSmith Tyler Smith @tlrmchlsmth (The University of Texas at Austin) - Paul Springer @springer13 (RWTH-Aachen) + Paul Springer @springer13 (RWTH Aachen University) Adam J. 
Stewart @adamjstewart (University of Illinois at Urbana-Champaign) Vladimir Sukarev Santanu Thangaraj (AMD) diff --git a/README.md b/README.md index efc8432bd..153138896 100644 --- a/README.md +++ b/README.md @@ -113,16 +113,16 @@ and high performance." Their statement continues, "The framework will continue having an important influence on the design and the instantiation of dense linear algebra libraries." - * **Small/skinny matrix support for dgemm now available!** Thanks to + * **Multithreaded small/skinny matrix support for dgemm now available!** Thanks to contributions made possible by our partnership with AMD, we have dramatically accelerated `gemm` for double-precision real matrix problems where one or two dimensions is exceedingly small. A natural byproduct of this optimization is that the traditional case of small _m = n = k_ (i.e. square matrices) is also accelerated, even though it was not targeted specifically. And though only -`dgemm` was optimized for now, support for other datatypes, other operations, -and/or multithreading may be implemented in the future. We've also added a new -[PerformanceSmall](docs/PerformanceSmall.md) document to showcase the -improvement in performance when some matrix dimensions are small. +`dgemm` was optimized for now, support for other datatypes and/or other operations +may be implemented in the future. We've also added new graphs to the +[PerformanceSmall](docs/PerformanceSmall.md) document to showcase multithreaded +performance when one or more matrix dimensions are small. * **Performance comparisons now available!** We recently measured the performance of various level-3 operations on a variety of hardware architectures, @@ -489,6 +489,12 @@ Debian package tracker can be found [here](https://tracker.debian.org/pkg/blis). (Also, thanks to [Nico Schlömer](https://github.com/nschloe) for previously volunteering his time to set up a standalone PPA.) + * **Gentoo**. [M. 
Zhou](https://github.com/cdluminate) also maintains the +[BLIS package](https://packages.gentoo.org/packages/sci-libs/blis) entry for +[Gentoo](https://www.gentoo.org/), a Linux distribution known for its +source-based [portage](https://wiki.gentoo.org/wiki/Portage) package manager +and distribution system. + * **EPEL/Fedora**. There are official BLIS packages in Fedora and EPEL (for RHEL7+ and compatible distributions) with versions for 64-bit integers, OpenMP, and pthreads, and shims which can be dynamically linked instead of reference @@ -637,13 +643,13 @@ A fifth paper, submitted to ACM TOMS, begins the study of so-called ``` A sixth paper, submitted to ACM TOMS, revisits the topic of the previous -article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev2.pdf): +article and derives a [superior induced method](http://www.cs.utexas.edu/users/flame/pubs/blis6_sisc_rev1.pdf): ``` @article{BLIS6, author = {Field G. {V}an~{Z}ee}, title = {Implementing High-Performance Complex Matrix Multiplication via the 1m Method}, - journal = {ACM Transactions on Mathematical Software}, + journal = {SIAM Journal on Scientific Computing}, note = {submitted} } ``` diff --git a/RELEASING b/RELEASING index bc2c9dc59..351594c49 100644 --- a/RELEASING +++ b/RELEASING @@ -26,14 +26,19 @@ Here are the steps to follow to create a new release (version) of BLIS: 6. Update docs/ReleaseNotes.md file with body of finalized announcement and the date of the release. -7. Bump the version number: +7. Commit changes from steps 5 and 6. + +8. Bump the version number: $ ./build/bump-version.sh "0.3.2" -8. Push the new commits and new tag associated with the new version: + This will result in two new commits: a version file update and a CHANGELOG + file update. + +9. Push the new commits and new tag associated with the new version: $ git push $ git push --tag -9. Send finalized announcement to blis-devel. +10. Send finalized announcement to blis-devel. 
diff --git a/attic/windows/Makefile b/attic/windows/Makefile deleted file mode 100644 index f015fe14f..000000000 --- a/attic/windows/Makefile +++ /dev/null @@ -1,341 +0,0 @@ -# -# -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# - Neither the name(s) of the copyright holder(s) nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -# - - - -# -# --- Include variables determined at configure-time -------------------------- -# -CONFIGURE_DEFS = config\config.mk - -!if exist ( $(CONFIGURE_DEFS) ) -!include $(CONFIGURE_DEFS) -!else -!error nmake: $(CONFIGURE_DEFS) does not exist! Run configure.cmd first. -!endif - - - -# -# --- Include environment- and build-specific definitions ---------------------- -# - -MAKE_DEFS = build\defs.mk - -# Include build definitions -!if exist ( $(MAKE_DEFS) ) -!include $(MAKE_DEFS) -!else -!error nmake: $(MAKE_DEFS) does not exist! Your libblis distribution may be incomplete. -!endif - - - -# -# --- Variable modifications --------------------------------------------------- -# - - - -# -# --- High-level rules --------------------------------------------------------- -# - -all: libblis - -libblis: libblis-lib - -libblis-objs: $(BLIS_OBJS) - -libblis-lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB) - -libblis-dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL) - -lib: libblis-lib - -dll: libblis-dll - -install: install-lib install-headers - -install-lib: $(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib - -install-dll: $(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll \ - $(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib \ - $(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp - -install-headers: $(INSTALL_PREFIX_INC)\$(BLIS_H) - -clean: clean-build clean-log - -distclean: clean-config clean-build clean-log - - - -# -# --- Source code (inference) rules -------------------------------------------- -# - -# --- C source files in flamec directory --- -{$(SRC_BLI_DIRPATH)}.c{$(OBJ_BLI_DIRPATH)}.obj: -!ifdef VERBOSE - if not exist $(OBJ_BLI_DIRPATH) \ - ( $(MKDIR) $(OBJ_BLI_DIRPATH) ) - $(CC) $(CFLAGS) /c $< /Fo$@ -!else - @if not exist $(OBJ_BLI_DIRPATH) \ - ( ( $(ECHO) nmake: Creating $(OBJ_BLI_DIRPATH) directory ) & \ - ( $(MKDIR) $(OBJ_BLI_DIRPATH) ) ) - @$(ECHO) nmake: Compiling $< - @$(CC) $(CFLAGS) /c $< /Fo$@ >> $(CC_LOG_FILE) -!endif - - - -# -# --- Library generation rules ------------------------------------------------- -# - 
-# --- Static library --- -$(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS_LIB): libblis-objs -!ifdef VERBOSE - if not exist $(LIB_LIBBLIS_DIRPATH) \ - ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) ) - $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH) - $(CD) $(LIB_LIBBLIS_DIRPATH) - $(LIB) $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS) - $(DEL) *.obj - $(CD) $(TOP_BUILD_DIR_ABS) -!else - @if not exist $(LIB_LIBBLIS_DIRPATH) \ - ( ( $(ECHO) nmake: Creating $(LIB_LIBBLIS_DIRPATH) directory ) & \ - ( $(MKDIR) $(LIB_LIBBLIS_DIRPATH) ) ) - @$(ECHO) nmake: Creating static library $@ - @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(LIB_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE) - @$(CD) $(LIB_LIBBLIS_DIRPATH) - @$(LIB) /VERBOSE $(LIB_OPTIONS) $(LIB_BLI_OUTPUT_ARG) $(LIB_BLI_INPUT_ARGS) - @$(DEL) *.obj - @$(CD) $(TOP_BUILD_DIR_ABS) -!endif - -# --- Dynamic library (object code file, import library, and export file) --- -$(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS_DLL): libblis-objs -!ifdef VERBOSE - if not exist $(DLL_LIBBLIS_DIRPATH) \ - ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) ) - $(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE) - $(CD) $(DLL_LIBBLIS_DIRPATH) - $(DIR) /B *.obj > $(OBJ_LIST_FILE) - $(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE) - $(DEL) $(OBJ_LIST_FILE) - $(DEL) *.obj - $(CD) $(TOP_BUILD_DIR_ABS) -!else - @if not exist $(DLL_LIBBLIS_DIRPATH) \ - ( ( $(ECHO) nmake: Creating $(DLL_LIBBLIS_DIRPATH) directory ) & \ - ( $(MKDIR) $(DLL_LIBBLIS_DIRPATH) ) ) - @$(ECHO) nmake: Creating dynamic library $@ - @$(COPY) $(OBJ_BLI_DIRPATH)\*.obj $(DLL_LIBBLIS_DIRPATH) >> $(COPY_LOG_FILE) - @$(CD) $(DLL_LIBBLIS_DIRPATH) - @$(DIR) /B *.obj > $(OBJ_LIST_FILE) - @$(GENDLL) $(LIBBLIS) $(LIBBLIS) $(CC) $(LINKARGS_FILEPATH) $(SYM_DEF_FILEPATH) /objlist $(OBJ_LIST_FILE) - @$(DEL) $(OBJ_LIST_FILE) - @$(DEL) *.obj - @$(CD) $(TOP_BUILD_DIR_ABS) -!endif - - - -# -# --- Install rules 
------------------------------------------------------------ -# - -# --- Header files --- -$(INSTALL_PREFIX_INC)\$(BLIS_H): $(INC_BLI_DIRPATH)\$(BLIS_H) \ - $(BUILD_DIRNAME)\$(BLI_CONFIG_H) -!ifdef VERBOSE - if not exist $(INSTALL_PREFIX_INC) \ - ( $(MKDIR) $(INSTALL_PREFIX_INC) ) - $(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE) - $(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE) -!else - @if not exist $(INSTALL_PREFIX_INC) \ - ( $(MKDIR) $(INSTALL_PREFIX_INC) ) - @$(ECHO) nmake: Installing libblis header files to $(INSTALL_PREFIX_INC) - @$(COPY) $(BUILD_DIRNAME)\$(BLI_CONFIG_H) $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE) - @$(COPY) $(INC_BLI_DIRPATH)\*.h $(INSTALL_PREFIX_INC) >> $(COPY_LOG_FILE) -!endif - -# --- Static library --- -$(INSTALL_PREFIX_LIB)\$(LIBBLIS).lib: $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib -!ifdef VERBOSE - if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) ) - if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \ - ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) ) -!else - @if not exist $(INSTALL_PREFIX_LIB) ( $(MKDIR) $(INSTALL_PREFIX_LIB) ) - @if exist $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \ - ( ( $(ECHO) nmake: Installing $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_LIB) ) & \ - ( $(COPY) $(LIB_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_LIB) >> $(COPY_LOG_FILE) ) ) -!endif - -# --- Dynamic library (object code) --- -$(INSTALL_PREFIX_DLL)\$(LIBBLIS).dll: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll -!ifdef VERBOSE - if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) ) - if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \ - ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) -!else - @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) ) - @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll \ - ( ( $(ECHO) nmake: Installing 
$(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll to $(INSTALL_PREFIX_DLL) ) & \ - ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).dll $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) ) -!endif - -# --- Dynamic library (import library) --- -$(INSTALL_PREFIX_DLL)\$(LIBBLIS).lib: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib -!ifdef VERBOSE - if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) ) - if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \ - ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) -!else - @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) ) - @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib \ - ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib to $(INSTALL_PREFIX_DLL) ) & \ - ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).lib $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) ) -!endif - -# --- Dynamic library (export file) --- -$(INSTALL_PREFIX_DLL)\$(LIBBLIS).exp: $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp -!ifdef VERBOSE - if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) ) - if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \ - ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) -!else - @if not exist $(INSTALL_PREFIX_DLL) ( $(MKDIR) $(INSTALL_PREFIX_DLL) ) - @if exist $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp \ - ( ( $(ECHO) nmake: Installing $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp to $(INSTALL_PREFIX_DLL) ) & \ - ( $(COPY) $(DLL_LIBBLIS_DIRPATH)\$(LIBBLIS).exp $(INSTALL_PREFIX_DLL) >> $(COPY_LOG_FILE) ) ) -!endif - - - -# -# --- Clean rules -------------------------------------------------------------- -# - -clean-log: -!ifdef VERBOSE - if exist $(CC_LOG_FILE) \ - ( $(DEL) $(CC_LOG_FILE) ) - if exist $(FC_LOG_FILE) \ - ( $(DEL) $(FC_LOG_FILE) ) - if exist $(COPY_LOG_FILE) \ - ( $(DEL) $(COPY_LOG_FILE) ) -!else - @if exist $(CC_LOG_FILE) \ - ( ( $(ECHO) nmake: Deleting $(CC_LOG_FILE) ) & \ - ( $(DEL) $(CC_LOG_FILE) ) ) - @if exist $(FC_LOG_FILE) \ - 
( ( $(ECHO) nmake: Deleting $(FC_LOG_FILE) ) & \ - ( $(DEL) $(FC_LOG_FILE) ) ) - @if exist $(COPY_LOG_FILE) \ - ( ( $(ECHO) nmake: Deleting $(COPY_LOG_FILE) ) & \ - ( $(DEL) $(COPY_LOG_FILE) ) ) -!endif - -clean-config: -!ifdef VERBOSE - if exist $(CNF_DIRNAME) \ - ( $(RMDIR) $(CNF_DIRNAME) ) - if exist $(INC_DIRNAME) \ - ( $(RMDIR) $(INC_DIRNAME) ) - if exist $(SRC_DIRNAME) \ - ( $(RMDIR) $(SRC_DIRNAME) ) -!else - @if exist $(CNF_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(CNF_DIRNAME) directory ) & \ - ( $(RMDIR) $(CNF_DIRNAME) ) ) - @if exist $(INC_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(INC_DIRNAME) directory ) & \ - ( $(RMDIR) $(INC_DIRNAME) ) ) - @if exist $(SRC_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(SRC_DIRNAME) directory ) & \ - ( $(RMDIR) $(SRC_DIRNAME) ) ) -!endif - -clean-build: -!ifdef VERBOSE - if exist $(OBJ_DIRNAME) \ - ( $(RMDIR) $(OBJ_DIRNAME) ) - if exist $(LIB_DIRNAME) \ - ( $(RMDIR) $(LIB_DIRNAME) ) - if exist $(DLL_DIRNAME) \ - ( $(RMDIR) $(DLL_DIRNAME) ) -!else - @if exist $(OBJ_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(OBJ_DIRNAME) directory ) & \ - ( $(RMDIR) $(OBJ_DIRNAME) ) ) - @if exist $(LIB_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \ - ( $(RMDIR) $(LIB_DIRNAME) ) ) - @if exist $(DLL_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \ - ( $(RMDIR) $(DLL_DIRNAME) ) ) -!endif - -# Useful for developing when all we want to do is remove the library products. 
-clean-lib: -!ifdef VERBOSE - if exist $(LIB_DIRNAME) \ - ( $(RMDIR) $(LIB_DIRNAME) ) - if exist $(DLL_DIRNAME) \ - ( $(RMDIR) $(DLL_DIRNAME) ) -!else - @if exist $(LIB_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(LIB_DIRNAME) directory ) & \ - ( $(RMDIR) $(LIB_DIRNAME) ) ) - @if exist $(DLL_DIRNAME) \ - ( ( $(ECHO) nmake: Deleting $(DLL_DIRNAME) directory ) & \ - ( $(RMDIR) $(DLL_DIRNAME) ) ) -!endif - - - -# -# --- Help target -------------------------------------------------------------- -# - -help: - @$(NMAKE_HELP) - diff --git a/attic/windows/build/config.mk.in b/attic/windows/build/config.mk.in deleted file mode 100644 index 5b4ad8a27..000000000 --- a/attic/windows/build/config.mk.in +++ /dev/null @@ -1,52 +0,0 @@ -# -# -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# - Neither the name(s) of the copyright holder(s) nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# - -# -# --- Configuration variable definitions --------------------------------------- -# -# Environment-related variables: -# REVISION - The code's revision number. -# PWD - The path to current working directory. -# ARCH_STR - A string to identify the requested build architecture. -# BUILD_STR - A string to identify the requested build type. -# CCOMPILER_STR - A string to identify the requested C compiler. -# -# Target-related variables: -# FLAMEC_OBJS - List of paths to flamec object files. -# LAPACK2FLAMEC_OBJS - List of paths to lapack2flamec object files. -# -# Note: these variables are not present in the .in template file. Instead, they -# are appended to the contents of the .in file by a build script and output to -# a separate file (by the same name, without the .in extension). -# diff --git a/attic/windows/build/defs.mk b/attic/windows/build/defs.mk deleted file mode 100644 index 84b52b9ae..000000000 --- a/attic/windows/build/defs.mk +++ /dev/null @@ -1,240 +0,0 @@ -# -# -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# - Neither the name(s) of the copyright holder(s) nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# - - -# -# --- General build system options -------------------------------------------- -# - -# Uncomment this for verbose output from nmake. -# VERBOSE = 1 - -# Assign this varible to be the full path to the directory to which you would -# like the BLIS build products to be installed upon running "nmake install". -# The nmake install target will create the install directory and all requisite -# subdirectories if they do not already exist (in which case the user must have -# permission to create these directories). -INSTALL_PREFIX = c:\field\lib - - -# -# --- Important build system filenames ---------------------------------------- -# - -# DLL link arguments. The contents of this file should be customized when -# building a dynamically-linked library. 
The lines of the file should contain -# linker options, library names, and library paths. Note that the library -# paths must be declared in the following form: -# -# /link /LIBPATH: -# /link /LIBPATH: -# /link /LIBPATH: -# -# where , , and are library paths to add to the list -# of paths to search when the linker attempts to locate other libraries -# listed in the file. -LINKARGS_FILENAME = linkargs.txt -LINKARGS_FILEPATH = $(PWD)\$(LINKARGS_FILENAME) - -# Various log file names that capture standard output when VERBOSE is undefined. -CC_LOG_FILE = nmake-cc.log -FC_LOG_FILE = nmake-fc.log -COPY_LOG_FILE = nmake-copy.log - - -# -# --- General name and directory definitions ----------------------------------- -# - -# The relative and absolute locations of the top-level Windows build directory. -# This is the directory in which nmake is run (not the directory named "build"). -TOP_BUILD_DIR_REL = . -TOP_BUILD_DIR_ABS = $(PWD) - -# The revision string. -REV_STR = r$(REVISION) - -# The names of the libraries. -LIBBLIS_NAME_ONLY = libblis -LIBBLIS = $(LIBBLIS_NAME_ONLY)-$(ARCH_STR)-$(REV_STR) - -# Directories that reside within the top-level Windows directory. -CNF_DIRNAME = config -INC_DIRNAME = include -SRC_DIRNAME = frame -OBJ_DIRNAME = obj -LIB_DIRNAME = lib -DLL_DIRNAME = dll - -# Leaves of interest for Windows. - -# Relative directory paths to each of the above subdirectories. -INC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(INC_DIRNAME) -SRC_DIRPATH = $(TOP_BUILD_DIR_REL)\$(SRC_DIRNAME) -OBJ_DIRPATH = $(TOP_BUILD_DIR_REL)\$(OBJ_DIRNAME) -LIB_DIRPATH = $(TOP_BUILD_DIR_REL)\$(LIB_DIRNAME) -DLL_DIRPATH = $(TOP_BUILD_DIR_REL)\$(DLL_DIRNAME) - -# We only have header files for flamec leaves. -INC_BLI_DIRPATH = $(INC_DIRPATH) - -# We have source code for flamec and lapack2flamec leaves. -SRC_BLI_DIRPATH = $(SRC_DIRPATH) - - -# And we have object file paths corresponding to those source leaves defined -# above. 
-OBJ_BLI_DIRPATH = $(OBJ_DIRPATH)\$(ARCH_STR)\$(BUILD_STR) - -# Separate directories into which we'll move object files when we create the -# static libraries. -LIB_LIBBLIS_DIRPATH = $(LIB_DIRPATH)\$(ARCH_STR)\$(BUILD_STR) - -# Separate directories into which we'll move object files when we create the -# dynamic libraries. -DLL_LIBBLIS_DIRPATH = $(DLL_DIRPATH)\$(ARCH_STR)\$(BUILD_STR) - -# The install subdirectories. -INSTALL_PREFIX_LIB = $(INSTALL_PREFIX)\libblis\lib -INSTALL_PREFIX_DLL = $(INSTALL_PREFIX)\libblis\dll -INSTALL_PREFIX_INC = $(INSTALL_PREFIX)\libblis\include-$(ARCH_STR)-$(REV_STR) - -# Definitions for important header files used in the install-headers rule. -BUILD_DIRNAME = build -BLIS_H = blis.h - - -# -# --- General shell definitions ------------------------------------------------ -# - -CD = cd -DIR = dir -COPY = copy -DEL = del /F /Q -MKDIR = mkdir -RMDIR = rd /S /Q -ECHO = echo - - -# -# --- Helper scripts ----------------------------------------------------------- -# - -NMAKE_HELP = .\build\nmake-help.cmd - - - -# -# --- Compiler-related definitions --------------------------------------------- -# - -#!include $(VERSION_FILE) - -# --- C compiler definitions --- - -WINDOWS_BUILD = BLIS_ENABLE_WINDOWS_BUILD -VERS_STR = 0.0.9 -VERSION = BLIS_VERSION_STRING=\"$(VERS_STR)\" - -!if "$(CCOMPILER_STR)"=="icl" - -!if "$(BUILD_STR)"=="debug" -CDEBUG = /Zi -COPTIM = /Od -!elseif "$(BUILD_STR)"=="release" -CDEBUG = -COPTIM = /Ox -!endif - -CC = icl.exe -CMISCFLAGS = /nologo -CLANGFLAGS = -CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION) -CWARNFLAGS = /w -CDBGFLAGS = $(CDEBUG) -COPTFLAGS = $(COPTIM) -CRTIMEFLAGS = /MT -CMTHREADFLAGS = /Qopenmp -CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \ - $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS) - -!elseif "$(CCOMPILER_STR)"=="cl" - -!if "$(BUILD_STR)"=="debug" -CDEBUG = /Zi -COPTIM = /Od -!elseif "$(BUILD_STR)"=="release" -CDEBUG = -COPTIM = /Ox -!endif 
- -CC = cl.exe -CMISCFLAGS = /nologo -CLANGFLAGS = -CPPROCFLAGS = /I.\build /I$(INC_BLI_DIRPATH) /D$(WINDOWS_BUILD) /D$(VERSION) -CWARNFLAGS = /w -CDBGFLAGS = $(CDEBUG) -COPTFLAGS = $(COPTIM) -CRTIMEFLAGS = /MT -CMTHREADFLAGS = /openmp -CFLAGS = $(CMISCFLAGS) $(CLANGFLAGS) $(CPPROCFLAGS) $(CWARNFLAGS) \ - $(CDBGFLAGS) $(COPTFLAGS) $(CRTIMEFLAGS) $(CMTHREADFLAGS) - -!endif - - - -# -# --- Library-related definitions ---------------------------------------------- -# - -# --- Static library definitions --- - -LIBBLIS_LIB = $(LIBBLIS).lib - -LIB = lib -LIB_OPTIONS = /nologo -LIB_BLI_OUTPUT_ARG = /out:$(LIBBLIS_LIB) -LIB_BLI_INPUT_ARGS = *.obj - -# --- Dynamic library definitions --- - -LIBBLIS_DLL = $(LIBBLIS).dll - -GENDLL = $(TOP_BUILD_DIR_ABS)\gendll.cmd -OBJ_LIST_FILE = libblis-objects.txt - -SYM_DEF_FILEPATH = $(TOP_BUILD_DIR_ABS)\$(BUILD_DIRNAME)\libblis-symbols.def - diff --git a/attic/windows/build/gather-src-for-windows.py b/attic/windows/build/gather-src-for-windows.py deleted file mode 100644 index e3b589b5b..000000000 --- a/attic/windows/build/gather-src-for-windows.py +++ /dev/null @@ -1,351 +0,0 @@ -#! /usr/bin/env python -# -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# - Neither the name(s) of the copyright holder(s) nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# - -# ------------------------------------------------------------------------------ - -# Import modules -import sys -import os -import os.path -import getopt -import shutil -import string - -# Global variables for command line options, with default settings. -script_name = "" -dry_run_flag = False -verbose_flag = False - -# Global constants -flat_config_dirname = "config" -flat_header_dirname = "include" -flat_source_dirname = "frame" -leaf_list_path = "build/leaf_list" -ignore_list_path = "build/ignore_list" -ignore_list_win_path = "build/ignore_list.windows" - -# ------------------------------------------------------------------------------ - -def print_usage(): - - # Print help information. - print " " - print " %s" % script_name - print " " - print " Field G. Van Zee" - print " " - print " Walk the BLIS source tree and copy all sources necessary for" - print " building BLIS under Windows into a single flat directory with" - print " no subdirectory hierarchy." 
- print " " - print " Usage:" - print " %s [options] tree_dir flat_dir" % script_name - print " " - print " The following options are accepted:" - print " " - print " -d dry-run" - print " Go through all the motions, but don't actually copy any" - print " files." - print " -v verbose" - print " Be verbose about actions (one line of output her action)." - print " " - - # Exit the script. - sys.exit() - -# ------------------------------------------------------------------------------ - -def main(): - - # Extern our global veriables. - global script_name - global dry_run_flag - global verbose_flag - - # Get the script name so we can use it in our output. - ( script_dir, script_name ) = os.path.split( sys.argv[0] ) - - try: - - # Get the command line options. - options, args = getopt.getopt( sys.argv[1:], "dv") - - except getopt.GetoptError, err: - - # print help information and exit: - print str( err ) # will print something like "option -a not recognized" - print_usage() - - # Parse our expected command line options. - print 'checking options' - for o, a in options: - - if o == "-d": - print 'found dry run' - dry_run_flag = True - elif o == "-v": - verbose_flag = True - else: - assert False, "unhandled option" - - # Check the number of arguments after command line option processing. - n_args = len( args ) - if n_args != 2: - print_usage() - - # Acquire the non-optional arguments. - tree_dir = args[0] - flat_dir = args[1] - - # Acquire the list of directories we will ignore. - ignore_list = read_ignore_list() - - # Acquire the list of leaf-type directories we will descend into. - leaf_list = read_leaf_list() - - # Create strings for each of the base subdirectories in the flat - # destination directory. - flat_config_base_dirpath = os.path.join( flat_dir, flat_config_dirname ) - flat_header_base_dirpath = os.path.join( flat_dir, flat_header_dirname ) - flat_source_base_dirpath = os.path.join( flat_dir, flat_source_dirname ) - - # Start a list of directories to create. 
- dirs_to_create = [] - - # Append the config directory. We do this outside of the for loop because - # we don't need subdirectories for each leaf type. - dirs_to_create.append( flat_config_base_dirpath ) - - # For each of the leaf specifications, make the full pathnames of the - # subdirectories that will reside within the root destination directory. - for leaf_spec in leaf_list: - - # Unpack the leaf_spec tuple. - src_exts, hdr_exts = leaf_spec - - # Append the directory path name to our list. - dirs_to_create.append( flat_header_base_dirpath ) - dirs_to_create.append( flat_source_base_dirpath ) - - # Iterate over the directory list we just created. - for dirpath in dirs_to_create: - - # Make the subdirectories within the root destination directory, but - # only if they are not existing directories. - if os.path.isdir( dirpath ) == False: - - # Take action only if this is not a dry run. - if dry_run_flag == False: - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: creating directory %s" % ( script_name, dirpath ) - - # Make the directory, and parent directories, for dirpath. - os.makedirs( dirpath ) - - else: - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: (dry-run) creating directory %s" % ( script_name, dirpath ) - - - # Walk the directory structure top-down. - for dirpath, dirnames, filenames in os.walk( tree_dir ): - - # Remove directories that appear in the ignore list. - for item in ignore_list: - if item in dirnames: - dirnames.remove( item ) - - # Consider each leaf specification. If we find the name in the directory - # path, then copy the files with its designated extensions into the flat - # source directory. - for leaf_spec in leaf_list: - - # Unpack the leaf_spec tuple. 
- src_exts, hdr_exts = leaf_spec - - # At this point following line can probably be removed - type_dir_name = os.sep + '' - - flat_source_leaf_dirpath = flat_source_base_dirpath - flat_header_leaf_dirpath = flat_header_base_dirpath - - if dirpath.find( type_dir_name ) != -1: - copy_files_to_flat_subdirs( dirpath, filenames, src_exts, hdr_exts, - flat_source_leaf_dirpath, - flat_header_leaf_dirpath ) - -# ------------------------------------------------------------------------------ - -def copy_files_to_flat_subdirs( dirpath, filenames, src_exts, hdr_exts, src_dirpath, hdr_dirpath ): - - # Consider all files in dirpath. - for filename in filenames: - - # Construct the full file path for the current file. - filepath = os.path.join( dirpath, filename ) - - # Iterate over the valid source extensions for the current directory - # path. - for src_ext in src_exts: - - # If the filename/filepath ends with the source extension, copy it - # to the source subdirectory within the flat destination directory. - if filepath.endswith( src_ext ): - - # Take action only if this is not a dry run. - if dry_run_flag == False: - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: copying to %s from %s" % ( script_name, src_dirpath, filepath ) - - # Copy the source file to the source subdirectory. - shutil.copy2( filepath, src_dirpath ) - - else: - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: (dry-run) copying to %s from %s" % ( script_name, src_dirpath, filepath ) - - # Iterate over the valid header extensions for the current directory - # path. - for hdr_ext in hdr_exts: - - # If the filename/filepath ends with the header extension, copy it - # to the include subdirectory within the flat destination directory. - if filepath.endswith( hdr_ext ): - - # Take action only if this is not a dry run. - if dry_run_flag == False: - - # Be verbose if verbosity was requested. 
- if verbose_flag == True: - print "%s: copying to %s from %s" % ( script_name, hdr_dirpath, filepath ) - - # Copy the header file to the header subdirectory. - shutil.copy2( filepath, hdr_dirpath ) - - else: - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: (dry-run) copying to %s from %s" % ( script_name, hdr_dirpath, filepath ) - -# ------------------------------------------------------------------------------ - -def read_ignore_list(): - - # Open the ignore list files as read-only. - ignore_file = open( ignore_list_path, 'r' ) - ignore_file_win = open( ignore_list_win_path, 'r' ) - - # Read all lines in the ignore list files. The items in these lists contain - # newlines, which we'll strip out shortly. - raw_list = ignore_file.readlines() - raw_win_list = ignore_file_win.readlines() - - # Close the files. - ignore_file.close() - ignore_file_win.close() - - # Initialize an empty ignore list for the stripped version of the raw list. - ignore_list = [] - - # Iterate over the first raw list. - for line in raw_list: - - # Append the stripped line to a new list. - ignore_list.append( line.strip() ) - - # Iterate over the second raw list. - for line in raw_win_list: - - # Append the stripped line to a new list. - ignore_list.append( line.strip() ) - - # Return the list of stripped lines. - return ignore_list - -# ------------------------------------------------------------------------------ - -def read_leaf_list(): - - # Open the leaf list file. - leaf_file = open( leaf_list_path, 'r' ) - - # Read the lines in the file. - line_list = leaf_file.readlines() - - # Start with a blank list. - leaf_list = [] - - # Iterate over the lines. - for line in line_list: - - # Split the specification by colon to separate the fields. - fields = string.split( string.strip( line ), ':' ) - - # Get the individual fields of the specification. 
- src_exts = string.split( fields[0], ',' ) - hdr_exts = string.split( fields[1], ',' ) - - # If it's a singleton list of an empty string, make it an empty list. - if len(src_exts) == 1: - if src_exts[0] == '': - src_exts = [] - - # If it's a singleton list of an empty string, make it an empty list. - if len(hdr_exts) == 1: - if hdr_exts[0] == '': - hdr_exts = [] - - # Pack the fields into a tuple. - leaf_spec = ( src_exts, hdr_exts ) - - - # Append the tuple to our list. - leaf_list.append( leaf_spec ) - - # Return the list. - return leaf_list - -# ------------------------------------------------------------------------------ - -# Begin by executing main(). -main() diff --git a/attic/windows/build/gen-check-rev-file.py b/attic/windows/build/gen-check-rev-file.py deleted file mode 100644 index 20593f76b..000000000 --- a/attic/windows/build/gen-check-rev-file.py +++ /dev/null @@ -1,252 +0,0 @@ -#! /usr/bin/env python -# -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# - Neither the name(s) of the copyright holder(s) nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# - -# ------------------------------------------------------------------------------ - -# Import modules -import sys -import os -import os.path -import getopt - -# Global variables for command line options, with default settings. -script_name = "" -verbose_flag = False - -# Global constants -toplevel_dirpath = "." -svn_dirname = ".svn" -entries_filename = "entries" -revision_filename = "revision" -dummy_rev_string = "unknown" - - -# ------------------------------------------------------------------------------ - -def print_usage(): - - # Print help information. - print " " - print " %s" % script_name - print " " - print " Field G. Van Zee" - print " " - print " This script ensures that a revision file exists so nmake can include the" - print " revision number in the subdirectory paths to the build products." - print " " - print " If a .svn directory exists, the revision file is created (or updated)" - print " to contain the revision number contained in .svn\entries file." - print " Otherwise, if a .svn directory does not exist, the revision file is" - print " left untouched if it exists, and created with a dummy value if it does" - print " not." 
- print " " - print " This script is typically invoked by configure.cmd, but it can also be" - print " run manually." - print " " - print " Usage:" - print " %s" % script_name - print " " - print " The following options are accepted:" - print " " - print " -v verbose" - print " Be verbose. Output what's happening." - print " " - - # Exit the script. - sys.exit() - -# ------------------------------------------------------------------------------ - -def main(): - - # Extern our global veriables. - global script_name - global verbose_flag - - # Get the script name so we can use it in our output. - ( script_dir, script_name ) = os.path.split( sys.argv[0] ) - - try: - - # Get the command line options. - options, args = getopt.getopt( sys.argv[1:], "v") - - except getopt.GetoptError, err: - - # print help information and exit: - print str( err ) # will print something like "option -a not recognized" - print_usage() - - # Parse our expected command line options. - for o, a in options: - - if o == "-v": - verbose_flag = True - else: - assert False, "unhandled option" - - # Check the number of arguments after command line option processing. - n_args = len( args ) - if n_args != 0: - print_usage() - - # Construct the filepaths to the entries and revision files. - entries_filepath = os.path.join( toplevel_dirpath, svn_dirname, entries_filename ) - revision_filepath = os.path.join( toplevel_dirpath, revision_filename ) - - # Test for the existence of the entries file (and by proxy, a working copy). - entries_file_exists = file_exists( entries_filepath ) - - # If the entries file exists, we are in a working copy, and thus we can - # overwrite the revision file with a potentially new value. - if entries_file_exists == True: - - # Read the revision number from the entries file. - rev_num_str = read_revision_from_entries( entries_filepath ) - - # Be verbose if verbosity was requested. 
- if verbose_flag == True: - print "%s: Found working copy; writing revision string \"%s\" to %s" % ( script_name, rev_num_str, revision_filepath ) - - # Write the revision number to the revision file. - write_revision_to_file( rev_num_str, revision_filepath ) - - # If we can't find the entries file, we probably are in an exported - # copy: either an official snapshot, or a copy that someone exported - # manually--hopefully (and likely) the former. - else: - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: Found export. Checking for revision file..." % ( script_name ) - - # Test for the existence of the revision file. - rev_file_exists = file_exists( revision_filepath ) - - # If the revision file does not exist, create a dummy file so the - # configure script has something to work with. - if rev_file_exists == False: - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: Revision file not found. Writing dummy revision string \"%s\" to %s" % ( script_name, dummy_rev_string, revision_filepath ) - - # Write the dummy string to the revision file. - write_revision_to_file( dummy_rev_string, revision_filepath ) - - else: - - # Get the revision number from the file just for the purposes of - # being verbose, if it was requested. - rev_num_str = read_revision_file( revision_filepath ) - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: Revision file found containing revision string \"%s\". Export is valid snapshot!" % ( script_name, rev_num_str ) - - -# ------------------------------------------------------------------------------ - -def file_exists( filepath ): - - # Try to open the file read-only. 
- try: - - fp = open( filepath, 'r' ) - fp.close() - exists = True - - except IOError, err: - - exists = False - - return exists - - -# ------------------------------------------------------------------------------ - -def read_revision_from_entries( entries_filepath ): - - # Open the ignore list files as read-only. - entries_file = open( entries_filepath, 'r' ) - - # Read all lines in the entries file. - raw_list = entries_file.readlines() - - # Close the file. - entries_file.close() - - # Grab the fourth line, which is where the revision number lives, and strip - # it of whitespace (probably just a newline). - rev_num_str = raw_list[3].strip() - - # Return the revision number string. - return rev_num_str - -# ------------------------------------------------------------------------------ - -def write_revision_to_file( rev_string, revision_filepath ): - - # Open the revision file for writing. - revision_file = open( revision_filepath, 'w' ) - - # Write the revision string to the file. - revision_file.write( rev_string ) - - # Close the file. - revision_file.close() - -# ------------------------------------------------------------------------------ - -def read_revision_file( revision_filepath ): - - # Open the revision file. - revision_file = open( revision_filepath, 'r' ) - - # Read the first (and only) line. - line = revision_file.readline() - - # Close the file. - revision_file.close() - - # Grab the string and strip the it of whitespace (should just be a newline). - rev_num_str = line.strip() - - # Return the revision number string. - return rev_num_str - -# ------------------------------------------------------------------------------ - -# Begin by executing main(). -main() diff --git a/attic/windows/build/gen-config-file.py b/attic/windows/build/gen-config-file.py deleted file mode 100644 index 557083276..000000000 --- a/attic/windows/build/gen-config-file.py +++ /dev/null @@ -1,360 +0,0 @@ -#! 
/usr/bin/env python -# -# BLIS -# An object-based framework for developing high-performance BLAS-like -# libraries. -# -# Copyright (C) 2014, The University of Texas at Austin -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# - Neither the name(s) of the copyright holder(s) nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# - -# ------------------------------------------------------------------------------ - -# Import modules -import sys -import os -import os.path -import getopt -import re -import string - -# Global variables for command line options, with default settings. 
-script_name = "" -dry_run_flag = False -verbose_flag = False - -# Global constants -config_dirname = "config" -source_dirname = "frame" -object_dirname = "obj" -object_extension = ".obj" -leaf_list_path = "build/leaf_list" -revision_filename = "revision" -rev_varname = "REVISION" -pwd_varname = "PWD" -arch_varname = "ARCH_STR" -build_varname = "BUILD_STR" -ccompiler_varname = "CCOMPILER_STR" - - -# ------------------------------------------------------------------------------ - -def print_usage(): - - # Print help information. - print " " - print " %s" % script_name - print " " - print " Field G. Van Zee" - print " " - print " Create a config.mk file that is to be included by the nmake Makefile." - print " This config.mk file is based on a template, but also includes variable" - print " definitions that are needed for the specific build were are performing." - print " The variables which are currently appended to config.mk at runtime are:" - print " - the revision string" - print " - the path to the current working directory" - print " - the build string (e.g. debug, release)" - print " - the architecture string (e.g. x86, x64)" - print " - the C compiler to use (e.g. icl, cl)" - print " - a list of paths to the object files to be compiled" - print " The config.mk file is placed within the config subdirectory." - print " " - print " Usage:" - print " %s [options] flat_dir arch build ccompiler path\\to\\config.mk.in" % script_name - print " " - print " The following options are accepted:" - print " " - print " -d dry-run" - print " Go through all the motions, but don't actually output" - print " the nmake definition file." - print " -v verbose" - print " Be verbose about actions (one line of output her action)." - print " " - - # Exit the script. - sys.exit() - -# ------------------------------------------------------------------------------ - -def main(): - - # Extern our global veriables. 
- global script_name - global dry_run_flag - global verbose_flag - - # Get the script name so we can use it in our output. - ( script_dir, script_name ) = os.path.split( sys.argv[0] ) - - try: - - # Get the command line options. - options, args = getopt.getopt( sys.argv[1:], "dv") - - except getopt.GetoptError, err: - - # print help information and exit: - print str( err ) # will print something like "option -a not recognized" - print_usage() - - # Parse our expected command line options. - for o, a in options: - - if o == "-d": - dry_run_flag = True - elif o == "-v": - verbose_flag = True - else: - assert False, "unhandled option" - - # Check the number of arguments after command line option processing. - n_args = len( args ) - if n_args != 5: - print_usage() - - # Acquire the non-optional arguments. - flat_dir = args[0] - arch_string = args[1] - build_string = args[2] - ccompiler_string = args[3] - input_filepath = args[4] - - # Acquire the list of leaf-type directories we will descend into. - leaf_list = read_leaf_list() - - # Read the contents of the template file. - template_file_line_list = read_template_file( input_filepath ) - - # Initialize a new list for the lines to be output - output_file_line_list = template_file_line_list - - # Read the revision number from the revision file. - rev_num_str = read_revision_file( revision_filename ) - - # Add a variable for the revision number of the code we're working with. - rev_var_value = rev_varname + " = " + rev_num_str + "\n" - output_file_line_list.append( rev_var_value ) - - # Add a variable for the path to the current working directory and append - # it to our list. - pwd_var_value = pwd_varname + " = " + os.getcwd() + "\n" - output_file_line_list.append( pwd_var_value ) - - # Add a variable for the architecture string and append it to our list. 
- arch_var_value = arch_varname + " = " + arch_string + "\n" - output_file_line_list.append( arch_var_value ) - - # Add a variable for the build type string and append it to our list. - build_var_value = build_varname + " = " + build_string + "\n" - output_file_line_list.append( build_var_value ) - - # Add a variable for the C compiler string and append it to our list. - ccompiler_var_value = ccompiler_varname + " = " + ccompiler_string + "\n" - output_file_line_list.append( ccompiler_var_value ) - - # Walk the flat subdirectories for each of the leaves. - for leaf_spec in leaf_list: - - # Unpack the leaf_spec tuple. - src_exts, hdr_exts = leaf_spec - - # Create the paths to the source and object subdirectories. - src_dirpath = os.path.join( flat_dir, source_dirname ) - obj_dirpath = os.path.join( flat_dir, object_dirname, arch_string, build_string ) - - # Get a list of files from the leaf subdirectory. - src_filenames = os.listdir( src_dirpath ) - - # This will be the nmake variable name to which we will assign the list - # of source files. - nmake_varname = "BLIS_OBJS" - - # Generate the line to output. - leaf_line = generate_object_list( nmake_varname, src_filenames, src_exts, obj_dirpath ) - - # Accumulate the lines. - output_file_line_list.append( leaf_line ) - - # Get the filename part of the input filepath. - input_filedir, input_filename = os.path.split( input_filepath ) - - # Remove the .in extension in the output filename. - output_filename = re.sub( '.mk.in', '.mk', input_filename ) - - # Construct the filepath for the output file. - output_filepath = os.path.join( flat_dir, config_dirname, output_filename ) - - # Write the output lines. - write_output_file( output_filepath, output_file_line_list ) - -# ------------------------------------------------------------------------------ - -def read_revision_file( filepath ): - - # Try to open the revision file. 
- try: - - revision_file = open( filepath, 'r' ) - - except IOError, err: - - print "%s: Couldn't open revision file %s" % ( script_name, filepath ) - sys.exit(1) - - # Read the first (and only) line. - line = revision_file.readline() - - # Close the file. - revision_file.close() - - # Grab the string and strip the it of whitespace (should just be a newline). - rev_num_str = line.strip() - - # Return the revision number string. - return rev_num_str - -# ------------------------------------------------------------------------------ - -def generate_object_list( nmake_varname, src_filenames, src_exts, obj_dirpath ): - - # Initialize the string as an assignment operation. - the_line = nmake_varname + " = " - - # Return early if there are no source extensions for this leaf spec. - if src_exts == []: - return "" - - # Construct a pattern to match any file ending with any of the source file - # extensions given. This string is going to look something like ".[cf]". - src_pattern = '\.[' - for src_ext in src_exts: - src_pattern = src_pattern + src_ext - src_pattern = src_pattern + ']' - - # Consider all source files. - for src_filename in src_filenames: - - obj_filename = re.sub( src_pattern, '.obj', src_filename ) - - # Create the full path to the file. - obj_filepath = os.path.join( obj_dirpath, obj_filename ) - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: adding file %s" % ( script_name, obj_filepath ) - - # And then add it to the list. - the_line = the_line + obj_filepath + " " - - # Be verbose if verbosity was requested. - if verbose_flag == True: - print "%s: %s" % ( script_name, the_line ) - - # Append a newline to the end of the line, for file.writelines(). - the_line = the_line + "\n" - - # Return the new line. - return the_line - -# ------------------------------------------------------------------------------ - -def read_template_file( template_filepath ): - - # Open the template file as read-only. 
- template_file = open( template_filepath, 'r' ) - - # Read all lines in the template file. - template_file_lines = template_file.readlines() - - # Close the file. - template_file.close() - - # Return the list of lines in the template file. - return template_file_lines - -# ------------------------------------------------------------------------------ - -def write_output_file( output_filepath, output_lines ): - - # Take action only if this is not a dry run. - if dry_run_flag == False: - - # Open the template file as writable. - output_file = open( output_filepath, 'w' ) - - # Write the lines. - output_file.writelines( output_lines ) - - # Close the file. - output_file.close() - -# ------------------------------------------------------------------------------ - -def read_leaf_list(): - - # Open the leaf list file. - leaf_file = open( leaf_list_path, 'r' ) - - # Read the lines in the file. - line_list = leaf_file.readlines() - - # Start with a blank list. - leaf_list = [] - - # Iterate over the lines. - for line in line_list: - - # Split the specification by colon to separate the fields. - fields = string.split( string.strip( line ), ':' ) - - # Get the individual fields of the specification. - src_exts = string.split( fields[0], ',' ) - hdr_exts = string.split( fields[1], ',' ) - - # If it's a singleton list of an empty string, make it an empty list. - if len(src_exts) == 1: - if src_exts[0] == '': - src_exts = [] - - # If it's a singleton list of an empty string, make it an empty list. - if len(hdr_exts) == 1: - if hdr_exts[0] == '': - hdr_exts = [] - - # Pack the fields into a tuple. - leaf_spec = ( src_exts, hdr_exts ) - - # Append the tuple to our list. - leaf_list.append( leaf_spec ) - - # Return the list. - return leaf_list - -# ------------------------------------------------------------------------------ - -# Begin by executing main(). 
-main() diff --git a/attic/windows/build/ignore_list b/attic/windows/build/ignore_list deleted file mode 100644 index a8230623e..000000000 --- a/attic/windows/build/ignore_list +++ /dev/null @@ -1,7 +0,0 @@ -attic -broken -old -other -temp -tmp -test diff --git a/attic/windows/build/ignore_list.windows b/attic/windows/build/ignore_list.windows deleted file mode 100644 index 46f8b9aac..000000000 --- a/attic/windows/build/ignore_list.windows +++ /dev/null @@ -1 +0,0 @@ -.git diff --git a/attic/windows/build/leaf_list b/attic/windows/build/leaf_list deleted file mode 100644 index 98e115e3f..000000000 --- a/attic/windows/build/leaf_list +++ /dev/null @@ -1 +0,0 @@ -c:h diff --git a/attic/windows/build/nmake-help.cmd b/attic/windows/build/nmake-help.cmd deleted file mode 100644 index a46ce5f1a..000000000 --- a/attic/windows/build/nmake-help.cmd +++ /dev/null @@ -1,72 +0,0 @@ -:: -:: -:: BLIS -:: An object-based framework for developing high-performance BLAS-like -:: libraries. -:: -:: Copyright (C) 2014, The University of Texas at Austin -:: -:: Redistribution and use in source and binary forms, with or without -:: modification, are permitted provided that the following conditions are -:: met: -:: - Redistributions of source code must retain the above copyright -:: notice, this list of conditions and the following disclaimer. -:: - Redistributions in binary form must reproduce the above copyright -:: notice, this list of conditions and the following disclaimer in the -:: documentation and/or other materials provided with the distribution. -:: - Neither the name(s) of the copyright holder(s) nor the names of its -:: contributors may be used to endorse or promote products derived -:: from this software without specific prior written permission. 
-:: -:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -:: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -:: -:: - -@echo off - -echo. -echo Makefile -echo. -echo Field G. Van Zee -echo. -echo nmake Makefile for building BLIS for Microsoft Windows. nmake targets -echo may be invoked after running the configure.cmd script. Valid targets are: -echo. -echo all - Invoke the lib and dll targets. -echo lib - Build BLIS as a static library. -echo dll - Build BLIS as a dynamically-linked library. -echo help - Output help and usage information. -echo clean - Invoke clean-log and clean-build targets. -echo clean-log - Remove any log files present. -echo clean-config - Remove all products of configure.cmd. Namely, remove the -echo config, include, and src directories. -echo clean-build - Remove all products of the compilation portion of the build -echo process. Namely, remove the obj, lib, and dll directories. -echo distclean - Invoke clean-log, clean-config, and clean-build targets. -echo. -echo The Makefile also recognizes configuration options corresponding to the -echo following Makefile variables: -echo. -echo VERBOSE - When defined, nmake outputs the actual commands -echo executed instead of more concise one-line progress -echo indicators. (Undefined by default.) -echo. 
-echo Typically, these options are specified by commenting or uncommenting the -echo corresponding lines in the Makefile. However, if the Makefile currently does -echo not define one of the options, and you wish to enable the corresponding -echo feature without editing the Makefile, you may define the variable at the -echo command line when nmake is invoked. For example, you may enable verboseness -echo while invoking the lib target as follows: -echo. -echo nmake lib VERBOSE=1 -echo. diff --git a/attic/windows/configure.cmd b/attic/windows/configure.cmd deleted file mode 100644 index c2ee037d7..000000000 --- a/attic/windows/configure.cmd +++ /dev/null @@ -1,87 +0,0 @@ -:: -:: -:: BLIS -:: An object-based framework for developing high-performance BLAS-like -:: libraries. -:: -:: Copyright (C) 2014, The University of Texas at Austin -:: -:: Redistribution and use in source and binary forms, with or without -:: modification, are permitted provided that the following conditions are -:: met: -:: - Redistributions of source code must retain the above copyright -:: notice, this list of conditions and the following disclaimer. -:: - Redistributions in binary form must reproduce the above copyright -:: notice, this list of conditions and the following disclaimer in the -:: documentation and/or other materials provided with the distribution. -:: - Neither the name(s) of the copyright holder(s) nor the names of its -:: contributors may be used to endorse or promote products derived -:: from this software without specific prior written permission. -:: -:: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -:: "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -:: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -:: A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -:: HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -:: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -:: LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -:: DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -:: THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -:: (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -:: OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -:: -:: - -@echo off - -:ENVIRONMENT - set GEN_CHECK_REV_FILE=.\build\gen-check-rev-file.py - set GATHER_SRC=.\build\gather-src-for-windows.py - set GEN_CONFIG_FILE=.\build\gen-config-file.py - set CONFIG_DEFS_TEMPL=.\build\config.mk.in - set SRC_TREE_DIR=..\frame - set TOP_BUILD_DIR=. - -:PARAMS - if "%1"=="" (goto USAGE) - if "%2"=="" (goto USAGE) - if "%3"=="" (goto USAGE) - - set ARCH=%1 - set BUILD=%2 - set CCOMPILER=%3 - -:TASK_UNIT - echo %0: Checking/updating revision file. - %GEN_CHECK_REV_FILE% -v - echo %0: Gathering source files into local flat directories. - %GATHER_SRC% %SRC_TREE_DIR% %TOP_BUILD_DIR% - echo %0: Creating configure definitions file. - %GEN_CONFIG_FILE% %TOP_BUILD_DIR% %ARCH% %BUILD% %CCOMPILER% %CONFIG_DEFS_TEMPL% - echo %0: Configuration and setup complete. You may now run nmake. - - goto END - -:USAGE - echo. - echo configure.cmd - echo. - echo A wrapper script for various configuration and setup scripts that need - echo. to be run before nmake when building BLIS for Microsoft Windows. - echo. - echo USAGE: - echo %0 [arch] [build] [cc] - echo. - echo arch -- The architecture string to build. - echo Supported values: {x86,x64} - echo build -- The kind of build. - echo Supported values: {debug,release} - echo cc -- The C compiler to use. - echo Supported values: {icl,cl} - echo. - echo examples: - echo %0 x86 debug icl - echo %0 x64 release cl - echo. 
- -:END diff --git a/attic/windows/gendll.cmd b/attic/windows/gendll.cmd deleted file mode 100644 index db0cdc1d2..000000000 --- a/attic/windows/gendll.cmd +++ /dev/null @@ -1,128 +0,0 @@ -@echo off -@setlocal enabledelayedexpansion - -rem -------------------------------------------------------------------- -rem Build a dll out of a set of object files specified by the -rem argument /objlist. -rem -rem The .lib file thus created is an "import" library, which one links -rem with, but the bulk of the code ends up in the associated .dll file. -rem --------------------------------------------------------------------- - -set THIS_SCRIPT=%~dp0%~nx0 - -if "%1"=="" goto USAGE -if "%2"=="" goto USAGE -if "%3"=="" goto USAGE -if "%4"=="" goto USAGE -if "%5"=="" goto USAGE - -set gd_lib_name=%1 -set gd_link=%gd_lib_name%-static.link -set LINKER=%3 -set LINKARGSFILE=%4 -set gd_def=%5 - -:PARSE_ARGS -set IMPORT= -set OBJLIST= -:ARGLOOP -if "%6"=="" goto ENDARGLOOP -if /i not "%6"=="/import" goto OBJARG -set IMPORT=!IMPORT! %7 -goto SHIFT -:OBJARG -if /i not "%6"=="/objlist" goto ENDARGLOOP -set OBJLIST=%7 -:SHIFT -shift /4 -shift /4 -goto ARGLOOP -:ENDARGLOOP - -if defined OBJLIST goto COMPILER_SETUP -echo Error: must supply /objlist -goto USAGE - -:COMPILER_SETUP -set gd_path=%2 -set gd_dll_path=%gd_path%.dll -set gd_main_c=dll_main__%gd_lib_name%.c -set gd_main_obj=dll_main__%gd_lib_name%.obj - -rem create C file for dll_main -for /F "tokens=*" %%i in ("#include ") do echo %%i >%gd_main_c% -echo. >>%gd_main_c% -echo BOOLEAN WINAPI DllMain( >>%gd_main_c% -echo HINSTANCE hDllHandle, >>%gd_main_c% -echo DWORD nReason, >>%gd_main_c% -echo LPVOID Reserved){ >>%gd_main_c% -echo. >>%gd_main_c% -echo BOOLEAN bSuccess = TRUE;>>%gd_main_c% -echo. 
>>%gd_main_c% -echo switch (nReason){ >>%gd_main_c% -echo case DLL_PROCESS_ATTACH: >>%gd_main_c% -echo DisableThreadLibraryCalls( hDllHandle ); >>%gd_main_c% -echo break; >>%gd_main_c% -echo case DLL_PROCESS_DETACH: >>%gd_main_c% -echo break; >>%gd_main_c% -echo. >>%gd_main_c% -echo }; >>%gd_main_c% -echo. >>%gd_main_c% -echo return bSuccess; >>%gd_main_c% -echo }; >>%gd_main_c% -echo.>>%gd_main_c% - -rem set up link file by specifying dll filepath and main object -echo /Fe%gd_dll_path% > %gd_link% -echo %gd_main_obj% >> %gd_link% - -rem add contents of linkargs file; most of the link argument action is -rem in this file -type %LINKARGSFILE% >> %gd_link% - -rem add command-line import libraries, if any -if defined IMPORT echo !IMPORT! >> %gd_link% - -rem add export specification -echo %gd_def% >> %gd_link% - -rem add contents of OBJLIST file -type %OBJLIST% >> %gd_link% - -rem create dll, import lib, and export file -%LINKER% /nologo /c /O2 /Fo%gd_main_obj% %gd_main_c% >> gendll-cl.log -%LINKER% @%gd_link% - -:CLEANUP -del /F /Q %gd_link% %gd_main_c% %gd_main_obj% gendll-cl.log -goto END - - -:USAGE -echo. -echo. gendll.cmd -echo. -echo. Generate a dynamically-linked library from a set of object files -echo. specified in objlist_file. -echo. -echo. Usage: -echo. %0 dllname dllpath linker linkargs_file symbols_file {/import importlib} /objlist objlist_file -echo. -echo. dllname -- the name of the DLL being created, with no extension. -echo. dllpath -- the path to the DLL being created, with no extension. -echo. linker -- the compiler to use to link the DLL. -echo. linkargs_file -- the path to a file containing a list of all linker -echo. arguments--link options, libraries, and library paths-- -echo. that that may be needed to successfully link the DLL -echo. being created. -echo. symbols_file -- the path to a file containing a list of symbols to -echo. export in the DLL. -echo. importlib -- the path to a .lib library that you wish to import into -echo. 
the DLL being created. Optional. -echo. objlist_file -- the path to a file containing the list of object files -echo. that make up the bulk of the DLL being created. -echo. - -:END -endlocal diff --git a/attic/windows/linkargs.txt b/attic/windows/linkargs.txt deleted file mode 100644 index 61be998da..000000000 --- a/attic/windows/linkargs.txt +++ /dev/null @@ -1,11 +0,0 @@ -/nologo -/LD /MT -/LIBPATH:"C:\Program Files\Microsoft SDKs\Windows\v6.0A\Lib" -/LIBPATH:"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\lib" -/nodefaultlib:libcmt /nodefaultlib:libc /nodefaultlib:libmmt -msvcrt.lib -/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\lib\ia32" -/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\mkl\ia32\lib" -mkl_intel_c.lib -mkl_sequential.lib -mkl_core.lib diff --git a/attic/windows/linkargs64.txt b/attic/windows/linkargs64.txt deleted file mode 100644 index 35df4bba9..000000000 --- a/attic/windows/linkargs64.txt +++ /dev/null @@ -1,11 +0,0 @@ -/nologo -/LD /MT -/LIBPATH:"C:\Program Files\Microsoft SDKs\Windows\v6.0A\Lib\x64" -/LIBPATH:"C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\lib\amd64" -/nodefaultlib:libcmt /nodefaultlib:libc /nodefaultlib:libmmt -msvcrt.lib -/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\lib\intel64" -/LIBPATH:"C:\Program Files (x86)\Intel\Compiler\11.1\048\mkl\em64t\lib" -mkl_intel_lp64.lib -mkl_sequential.lib -mkl_core.lib diff --git a/attic/windows/revision b/attic/windows/revision deleted file mode 100644 index 87edf799f..000000000 --- a/attic/windows/revision +++ /dev/null @@ -1 +0,0 @@ -unknown \ No newline at end of file diff --git a/attic/windows/vc110.pdb b/attic/windows/vc110.pdb deleted file mode 100644 index 39ecfdbbb..000000000 Binary files a/attic/windows/vc110.pdb and /dev/null differ diff --git a/common.mk b/common.mk index cc53eaad2..0bb338465 100644 --- a/common.mk +++ b/common.mk @@ -212,6 +212,11 @@ get-sandbox-cxxtext-for = "('$(1)' CXXFLAGS for sandboxes)" files-that-contain = 
$(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),$(f),))) files-that-dont-contain = $(strip $(foreach f, $(1), $(if $(findstring $(2),$(f)),,$(f)))) +# Define a function that removes duplicate words from a list. +# NOTE: This function was obtained via [1]; thanks bobbogo for this +# concise definition. +# [1] https://stackoverflow.com/questions/16144115/makefile-remove-duplicate-words-without-sorting +rm-dupls = $(if $1,$(firstword $1) $(call rm-dupls,$(filter-out $(firstword $1),$1))) # @@ -535,7 +540,7 @@ endif ifeq ($(OS_NAME),Darwin) # OS X shared library link flags. SOFLAGS := -dynamiclib -SOFLAGS += -Wl,-install_name,$(LIBBLIS_SONAME) +SOFLAGS += -Wl,-install_name,$(libdir)/$(LIBBLIS_SONAME) else SOFLAGS := -shared ifeq ($(IS_WIN),yes) @@ -833,9 +838,6 @@ endif # --- LDFLAGS cleanup ---------------------------------------------------------- # -# Remove duplicate flags/options in LDFLAGS (such as -lpthread) by sorting. -LDFLAGS := $(sort $(LDFLAGS)) - # @@ -1080,4 +1082,3 @@ BUILD_CPPFLAGS := -DBLIS_IS_BUILDING_LIBRARY # end of ifndef COMMON_MK_INCLUDED conditional block endif - diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index 71d8aee06..5bc713aae 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -90,9 +90,11 @@ void bli_cntx_init_haswell( cntx_t* cntx ) bli_cntx_set_l1v_kers ( 10, +#if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, +#endif // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index e00b2a8dc..6da3b7a3a 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -79,9 +79,11 @@ void bli_cntx_init_knl( cntx_t* cntx ) bli_cntx_set_l1v_kers ( 10, +#if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, +#endif // axpyv 
#if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, diff --git a/config/old/haswellbb/bli_cntx_init_haswell.c b/config/old/haswellbb/bli_cntx_init_haswell.c index c3640b623..9e1d03503 100644 --- a/config/old/haswellbb/bli_cntx_init_haswell.c +++ b/config/old/haswellbb/bli_cntx_init_haswell.c @@ -150,9 +150,11 @@ void bli_cntx_init_haswell( cntx_t* cntx ) bli_cntx_set_l1v_kers ( 10, +#if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, +#endif // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 6c9c1b918..4370ce26c 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -35,11 +35,33 @@ #include "blis.h" // Instantiate prototypes for packm kernels. +PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref ) PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref ) // Instantiate prototypes for level-3 kernels. 
-//GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) +GEMM_UKR_PROT( float, s, gemmbb_power9_ref ) +GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref ) +GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref ) +TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref ) +TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref ) +GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) +GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref ) +GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref ) +TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref ) +TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref ) + +GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref ) +GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref ) +GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref ) +TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref ) +TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref ) + +GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref ) +GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref ) +GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref ) +TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref ) +TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref ) void bli_cntx_init_power9( cntx_t* cntx ) { @@ -47,25 +69,56 @@ void bli_cntx_init_power9( cntx_t* cntx ) // Set default kernel blocksizes and functions. bli_cntx_init_power9_ref( cntx ); - + + // ------------------------------------------------------------------------- + // Update the context with optimized native gemm micro-kernels and // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( - 1, - //BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, + 12, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE, + BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE, + BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE, + + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, + + BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE, + BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE, + BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE, + BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE, + BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE, + BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE, + cntx + ); + + // Update the context with customized virtual [gemm]trsm micro-kernels. + bli_cntx_set_l3_vir_ukrs + ( + 8, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref, cntx ); // Update the context with optimized packm kernels. 
bli_cntx_set_packm_kers ( - 1, + 2, + BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, cntx ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 ); bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1, -1, 12, -1, -1 ); @@ -73,6 +126,9 @@ void bli_cntx_init_power9( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, @@ -84,5 +140,5 @@ void bli_cntx_init_power9( cntx_t* cntx ) BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); -} +} diff --git a/config/power9/bli_family_power9.h b/config/power9/bli_family_power9.h index 702e6ad5b..12b16444f 100644 --- a/config/power9/bli_family_power9.h +++ b/config/power9/bli_family_power9.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -38,4 +38,9 @@ #define BLIS_POOL_ADDR_OFFSET_SIZE_A 192 #define BLIS_POOL_ADDR_OFFSET_SIZE_B 152 - +// Disable right-side hemm, symm, and trmm[3] to accommodate the broadcasting of +// elements within the packed matrix B. +#define BLIS_DISABLE_HEMM_RIGHT +#define BLIS_DISABLE_SYMM_RIGHT +#define BLIS_DISABLE_TRMM_RIGHT +#define BLIS_DISABLE_TRMM3_RIGHT diff --git a/config/power9/make_defs.mk b/config/power9/make_defs.mk index 1130f9d94..e4592c24a 100644 --- a/config/power9/make_defs.mk +++ b/config/power9/make_defs.mk @@ -5,7 +5,7 @@ # An object-based framework for developing high-performance BLAS-like # libraries. 
# -# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2019, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index f030ca54b..4df681cf4 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -71,9 +71,11 @@ void bli_cntx_init_skx( cntx_t* cntx ) bli_cntx_set_l1v_kers ( 10, +#if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, +#endif // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 202dbb78c..f0807e4dc 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -83,9 +83,11 @@ void bli_cntx_init_zen( cntx_t* cntx ) bli_cntx_set_l1v_kers ( 16, +#if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, +#endif // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 2a6d8c8c8..ce4fd470f 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -89,9 +89,12 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_cntx_set_l1v_kers ( 16, +#if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, +#endif + // axpyv // axpyv BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, diff --git a/config_registry b/config_registry index fd7ecee11..ad6fc14c8 100644 --- a/config_registry +++ b/config_registry @@ -4,7 +4,7 @@ # Please refer to the BLIS wiki on configurations for information on the # syntax and semantics of this file [1]. 
# -# [1] https://github.com/flame/blis/wiki/ConfigurationHowTo +# [1] https://github.com/flame/blis/blob/master/docs/ConfigurationHowTo.md # # Processor families. diff --git a/configure b/configure index b8098ce5a..35d4f1d45 100755 --- a/configure +++ b/configure @@ -1323,14 +1323,18 @@ get_compiler_version() # isolate the version number. # The last part ({ read first rest ; echo $first ; }) is a workaround # to OS X's egrep only returning the first match. - cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM' | { read first rest ; echo $first ; }) + cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM|oneAPI' | { read first rest ; echo $first ; }) if [ "${cc_vendor}" = "icc" -o \ "${cc_vendor}" = "gcc" ]; then cc_version=$(${cc} -dumpversion) #if compiler is AOCC, first grep for clang and then the version number. elif [ "${cc_vendor}" = "clang" ]; then cc_version=$(echo "${vendor_string}" | egrep -o 'clang version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*') - else + elif [ "${cc_vendor}" = "oneAPI" ]; then + # Treat Intel oneAPI's clang as clang, not icc. + cc_vendor="clang" + cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; }) + else cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; }) fi @@ -1449,6 +1453,15 @@ check_compiler() blacklistcc_add "skx" fi fi + if [ ${cc_major} -eq 18 ]; then + echo "${script_name}: ${cc} ${cc_version} is known to cause erroneous results. See https://github.com/flame/blis/issues/371 for details." + blacklistcc_add "knl" + blacklistcc_add "skx" + fi + if [ ${cc_major} -ge 19 ]; then + echo "${script_name}: ${cc} ${cc_version} is known to cause erroneous results. See https://github.com/flame/blis/issues/371 for details." 
+ echoerr_unsupportedcc + fi fi # clang @@ -2421,8 +2434,22 @@ main() # Call the auto_detect() function and save the returned string in # config_name. config_name=$(auto_detect) + #config_name="generic" echo "${script_name}: hardware detection driver returned '${config_name}'." + + # If the auto-detect code returned the "generic" string, it means we + # were unable to automatically detect the user's hardware type. While + # this is going to be a rare event, it will likely lead the user to + # experience much lower performance than expected, and thus we will + # warn them about it at the end of the configure output (to increase + # the chances that they see it). + if [ "${config_name}" = "generic" ]; then + + warn_user_generic=1 + else + warn_user_generic=0 + fi else # Use the command line argument as the configuration name. @@ -3476,6 +3503,18 @@ main() echo "${script_name}: configured to build within top-level directory of source distribution." fi + if [ "${warn_user_generic}" = "1" ]; then + + echo "${script_name}: " + echo "${script_name}: *** Unable to automatically detect hardware type! ***" + echo "${script_name}: " + echo "${script_name}: NOTE: configure was unable to identify a subconfiguration" + echo "${script_name}: optimized for your hardware. As a result, the 'generic'" + echo "${script_name}: subconfiguration (with low-performance reference kernels)" + echo "${script_name}: will be used. For support, please open an issue on GitHub" + echo "${script_name}: at https://github.com/flame/blis/issues." + echo "${script_name}: " + fi # Exit peacefully. 
return 0 diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md index c5a224daf..f2ab8864b 100644 --- a/docs/BLISTypedAPI.md +++ b/docs/BLISTypedAPI.md @@ -1051,6 +1051,7 @@ void bli_?her2 ( uplo_t uploa, conj_t conjx, + conj_t conjy, dim_t m, ctype* alpha, ctype* x, inc_t incx, @@ -1115,6 +1116,7 @@ void bli_?syr2 ( uplo_t uploa, conj_t conjx, + conj_t conjy, dim_t m, ctype* alpha, ctype* x, inc_t incx, diff --git a/docs/BuildSystem.md b/docs/BuildSystem.md index 871800980..0906d1c05 100644 --- a/docs/BuildSystem.md +++ b/docs/BuildSystem.md @@ -27,6 +27,7 @@ The BLIS build system was designed for use with GNU/Linux (or some other sane UN * GNU `bash` (3.2 or later) * GNU `make` (3.81 or later) * a working C99 compiler + * Perl (any version) BLIS also requires a POSIX threads library at link-time (`-lpthread` or `libpthread.so`). This requirement holds even when configuring BLIS with multithreading disabled (the default) or with multithreading via OpenMP (`--enable-multithreading=openmp`). (Note: BLIS implements basic pthreads functionality automatically for Windows builds via [AppVeyor](https://ci.appveyor.com/project/shpc/blis/).) diff --git a/docs/HardwareSupport.md b/docs/HardwareSupport.md index adba02f19..32e5c4a63 100644 --- a/docs/HardwareSupport.md +++ b/docs/HardwareSupport.md @@ -15,6 +15,7 @@ A few remarks / reminders: * Induced complex (1m) implementations are employed in all situations where the real domain [gemm microkernel](KernelsHowTo.md#gemm-microkernel) of the corresponding precision is available, but the "native" complex domain gemm microkernel is unavailable. Note that the table below lists native kernels, so if a microarchitecture lists only `sd`, support for both `c` and `z` datatypes will be provided via the 1m method. (Note: most people cannot tell the difference between native and 1m-based performance.) Please see our [ACM TOMS article on the 1m method](https://github.com/flame/blis#citations) for more info on this topic. 
* Some microarchitectures use the same sub-configuration. *This is not a typo.* For example, Haswell and Broadwell systems as well as "desktop" (non-server) versions of Skylake, Kaby Lake, and Coffee Lake all use the `haswell` sub-configuration and the kernels registered therein. Microkernels can be recycled in this manner because the key detail that determines level-3 performance outcomes is actually the vector ISA, not the microarchitecture. In the previous example, all of the microarchitectures listed support AVX2 (but not AVX-512), and therefore they can reuse the same microkernels. * Remember that you (usually) don't have to choose your sub-configuration manually! Instead, you can always request configure-time hardware detection via `./configure auto`. This will defer to internal logic (based on CPUID for x86_64 systems) that will attempt to choose the appropriate sub-configuration automatically. + * There is a difficulty in automatically choosing the ideal sub-configuration for use on Skylake-X systems, which may have one or two FMA units. The `skx` sub-configuration is only beneficial when used on hardware with two FMA units. Otherwise the hardware is treated as a "desktop" Skylake system, which uses the `haswell` sub-configuration. Furthermore, the number of units can't be queried directly; instead, we rely on a manually-maintained list of CPU models (via logic in `frame/base/bli_cpuid.c`), which may be incorrect for new processors, particularly Gold models. In that case, you can either fix the code (and please raise an issue!) or manually target the `skx` sub-configuration at configure-time (i.e., `./configure [options] skx`). If your performance seems low, you can set `export BLIS_ARCH_DEBUG=1`, which will cause BLIS to output some basic debugging info to `stderr` that will reveal whether your system was detected as having one or two VPUs (FMA units).
| Vendor/Microarchitecture | BLIS sub-configuration | `gemm` | `gemmtrsm` | |:-------------------------------------|:-----------------------|:-------|:-----------| @@ -28,7 +29,8 @@ A few remarks / reminders: | Intel Haswell, Broadwell (AVX/FMA3) | `haswell` | `sdcz` | `sd` | | Intel Sky/Kaby/CoffeeLake (AVX/FMA3) | `haswell` | `sdcz` | `sd` | | Intel Knights Landing (AVX-512/FMA3) | `knl` | `sd` | | -| Intel SkylakeX (AVX-512/FMA3) | `skx` | `sd` | | +| Intel SkylakeX (AVX-512/2×FMA3) | `skx` | `sd` | | +| Intel SkylakeX (AVX-512/1×FMA3) | `haswell` | `sdcz` | `sd` | | ARMv7 Cortex-A9 (NEON) | `cortex-a9` | `sd` | | | ARMv7 Cortex-A15 (NEON) | `cortex-a15` | `sd` | | | ARMv8 Cortex-A53 (NEON) | `cortex-a53` | `sd` | | diff --git a/docs/KernelsHowTo.md b/docs/KernelsHowTo.md index e3dfd125f..d42c4ee93 100644 --- a/docs/KernelsHowTo.md +++ b/docs/KernelsHowTo.md @@ -278,7 +278,7 @@ void bli_?gemm_ukernel The `gemm` microkernel, sometimes simply referred to as "the BLIS microkernel" or "the microkernel", performs the following operation: ``` - C11 := beta * C11 + A1 * B1 + C11 := beta * C11 + alpha * A1 * B1 ``` where `A1` is an _MR x k_ "micropanel" matrix stored in packed (column-wise) format, `B1` is a _k x NR_ "micropanel" matrix stored in packed (row-wise) format, `C11` is an _MR x NR_ general matrix stored according to its row and column strides `rsc` and `csc`, and `alpha` and beta are scalars. diff --git a/docs/PerformanceSmall.md b/docs/PerformanceSmall.md index 09b1dbd8c..ceb6d4156 100644 --- a/docs/PerformanceSmall.md +++ b/docs/PerformanceSmall.md @@ -35,14 +35,18 @@ sizes tested. Each of the 28 graphs within a panel will contain an x-axis that reports problem size, with one, two, or all three matrix dimensions equal to the problem size (e.g. _m_ = 6; _n_ = _k_, also encoded as `m6npkp`). -The y-axis will report in units GFLOPS (billions of floating-point operations -per second) on a single core. 
+The y-axis will report in units GFLOPS (or billions of floating-point operations +per second) per core. -It's also worth pointing out that the top of each graph (e.g. the maximum -y-axis value depicted) _always_ corresponds to the theoretical peak performance -under the conditions associated with that graph. -Theoretical peak performance, in units of GFLOPS, is calculated as the -product of: +It's also worth pointing out that the top of some graphs (e.g. the maximum +y-axis value depicted) corresponds to the theoretical peak performance +under the conditions associated with that graph, while in other graphs the +y-axis has been adjusted to better show the difference between the various +curves. (We *strongly* prefer to always use peak performance as the top of +the graph; however, this is one of the few exceptions where we feel some +scaling is warranted.) +Theoretical peak performance on a single core, in units of GFLOPS, is +calculated as the product of: 1. the maximum sustainable clock rate in GHz; and 2. the maximum number of floating-point operations (flops) that can be executed per cycle. @@ -60,30 +64,32 @@ can be issued per cycle (per core); register (for the datatype in question); and 3. 2.0, since an FMA instruction fuses two operations (a multiply and an add). -The problem size range, represented on the x-axis, is sampled in -increments of 4 up to 800 for the cases where one or two dimensions is small -(and constant) -and up to 400 in the case where all dimensions (e.g. _m_, _n_, and _k_) are -bound to the problem size (i.e., square matrices). +Typically, organizations and individuals publish performance with square +matrices, which can miss the problem sizes of interest to many applications. +Here, in addition to square matrices (shown in the seventh column), we also +show six other scenarios where one or two `gemm` dimensions (of _m_, _n_, and +_k_) are small.
In these six columns, the constant small matrix dimensions were +chosen to be _very_ small--in the neighborhood of 8--intentionally to showcase +what happens when at least one of the matrices is abnormally "skinny." -Note that the constant small matrix dimensions were chosen to be _very_ -small--in the neighborhood of 8--intentionally to showcase what happens when -at least one of the matrices is abnormally "skinny." Typically, organizations -and individuals only publish performance with square matrices, which can miss -the problem sizes of interest to many applications. Here, in addition to square -matrices (shown in the seventh column), we also show six other scenarios where -one or two `gemm` dimensions (of _m,_ _n_, and _k_) is small. +The problem size range, represented on the x-axis, is sampled in +increments that vary. These increments (and the overall range) are generally +large for the cases where two dimensions are small (and constant), medium for +cases where one dimension is small (and constant), and small for cases where +all dimensions (e.g. _m_, _n_, and _k_) are variable and bound to the problem +size (i.e., square matrices). The legend in each graph contains two entries for BLIS, corresponding to the two black lines, one solid and one dotted. The dotted line, **"BLIS conv"**, represents the conventional implementation that targets large matrices. This was the only implementation available in BLIS prior to the addition to the small/skinny matrix support. The solid line, **"BLIS sup"**, makes use of the -new small/skinny matrix implementation for certain small problems. Whenever -these results differ by any significant amount (beyond noise), it denotes a -problem size for which BLIS employed the new small/skinny implementation. -Put another way, **the delta between these two lines represents the performance -improvement between BLIS's previous status quo and the new regime.** +new small/skinny matrix implementation. 
Sometimes, the performance of +**"BLIS sup"** drops below that of **"BLIS conv"** for somewhat larger problems. +However, in practice, we use a threshold to determine when to switch from the +former to the latter, and therefore the goal is for the performance of +**"BLIS conv"** to serve as an approximate floor below which BLIS performance +never drops. Finally, each point along each curve represents the best of three trials. @@ -119,7 +125,8 @@ and/or install some (or all) of the implementations shown (e.g. [BLASFEO](https://github.com/giaf/blasfeo), and [libxsmm](https://github.com/hfp/libxsmm)), including BLIS. Be sure to consult the detailed notes provided below; they should be *very* helpful in successfully -building the libraries. The `runme.sh` script in `test/sup` will help you run +building the libraries. The `runme.sh` script in `test/sup` (or `test/supmt`) +will help you run some (or all) of the test drivers produced by the `Makefile`, and the Matlab/Octave function `plot_panel_trxsh()` defined in the `octave` directory will help you turn the output of those test drivers into a PDF file of graphs. @@ -140,20 +147,25 @@ The `runthese.m` file will contain example invocations of the function. 
* Max FMA vector IPC: 2 * Peak performance: * single-core: 57.6 GFLOPS (double-precision), 115.2 GFLOPS (single-precision) + * multicore: 57.6 GFLOPS/core (double-precision), 115.2 GFLOPS/core (single-precision) * Operating system: Gentoo Linux (Linux kernel 5.2.4) * Page size: 4096 bytes * Compiler: gcc 8.3.0 -* Results gathered: 23-28 August 2019 +* Results gathered: 3 March 2020 * Implementations tested: - * BLIS 4a0a6e8 (0.6.0-28) - * configured with `./configure --enable-cblas auto` + * BLIS 90db88e (0.6.1-8) + * configured with `./configure --enable-cblas auto` (single-threaded) + * configured with `./configure --enable-cblas -t openmp auto` (multithreaded) * sub-configuration exercised: `haswell` - * OpenBLAS 0.3.7 - * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded) - * BLASFEO 01f6b7f + * Multithreaded (4 cores) execution requested via `export BLIS_NUM_THREADS=4` + * OpenBLAS 0.3.8 + * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0 USE_LOCKING=1` (single-threaded) + * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=4` (multithreaded) + * Multithreaded (4 cores) execution requested via `export OPENBLAS_NUM_THREADS=4` + * BLASFEO f9b78c6 * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`. * Eigen 3.3.90 - * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019) + * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (36b9596) * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): ``` # These lines added after line 67. @@ -165,18 +177,20 @@ The `runthese.m` file will contain example invocations of the function. * configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas` * installed headers via `cmake . 
-DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. - * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded) - * MKL 2019 update 4 - * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded) - * libxsmm 77a295c (1.6.5-6679) + * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1` + * Multithreaded (4 cores) execution requested via `export OMP_NUM_THREADS=4` + * MKL 2020 initial release + * Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1` + * Multithreaded (4 cores) execution requested via `export MKL_NUM_THREADS=4` + * libxsmm a40a833 (post-1.14) * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally. * Affinity: - * N/A. + * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-3"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset. * Frequency throttling (via `cpupower`): * Driver: intel_pstate * Governor: performance * Hardware limits: 800MHz - 3.8GHz - * Adjusted minimum: 3.7GHz + * Adjusted minimum: 3.8GHz * Comments: * libxsmm is highly competitive for very small problems, but quickly gives up once the "large" dimension exceeds about 180-240 (or 64 in the case where all operands are square). Also, libxsmm's `gemm` cannot handle a transposition on matrix A and similarly dispatches the fallback implementation for those cases. libxsmm also does not export CBLAS interfaces, and therefore only appears on the graphs for column-stored matrices. @@ -184,15 +198,21 @@ The `runthese.m` file will contain example invocations of the function. 
#### pdf -* [Kaby Lake row-stored](graphs/sup/dgemm_rrr_kbl_nt1.pdf) -* [Kaby Lake column-stored](graphs/sup/dgemm_ccc_kbl_nt1.pdf) +* [Kaby Lake single-threaded row-stored](graphs/sup/dgemm_rrr_kbl_nt1.pdf) +* [Kaby Lake single-threaded column-stored](graphs/sup/dgemm_ccc_kbl_nt1.pdf) +* [Kaby Lake multithreaded (4 cores) row-stored](graphs/sup/dgemm_rrr_kbl_nt4.pdf) +* [Kaby Lake multithreaded (4 cores) column-stored](graphs/sup/dgemm_ccc_kbl_nt4.pdf) #### png (inline) -* **Kaby Lake row-stored** -![row-stored](graphs/sup/dgemm_rrr_kbl_nt1.png) -* **Kaby Lake column-stored** -![column-stored](graphs/sup/dgemm_ccc_kbl_nt1.png) +* **Kaby Lake single-threaded row-stored** +![single-threaded row-stored](graphs/sup/dgemm_rrr_kbl_nt1.png) +* **Kaby Lake single-threaded column-stored** +![single-threaded column-stored](graphs/sup/dgemm_ccc_kbl_nt1.png) +* **Kaby Lake multithreaded (4 cores) row-stored** +![multithreaded row-stored](graphs/sup/dgemm_rrr_kbl_nt4.png) +* **Kaby Lake multithreaded (4 cores) column-stored** +![multithreaded column-stored](graphs/sup/dgemm_ccc_kbl_nt4.png) --- @@ -209,20 +229,25 @@ The `runthese.m` file will contain example invocations of the function. 
* Max FMA vector IPC: 2 * Peak performance: * single-core: 56 GFLOPS (double-precision), 112 GFLOPS (single-precision) + * multicore: 49.6 GFLOPS/core (double-precision), 99.2 GFLOPS/core (single-precision) * Operating system: Cray Linux Environment 6 (Linux kernel 4.4.103) * Page size: 4096 bytes * Compiler: gcc 7.3.0 -* Results gathered: 23-28 August 2019 +* Results gathered: 3 March 2020 * Implementations tested: - * BLIS 4a0a6e8 (0.6.0-28) - * configured with `./configure --enable-cblas auto` + * BLIS 90db88e (0.6.1-8) + * configured with `./configure --enable-cblas auto` (single-threaded) + * configured with `./configure --enable-cblas -t openmp auto` (multithreaded) * sub-configuration exercised: `haswell` - * OpenBLAS 0.3.7 - * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded) - * BLASFEO 01f6b7f + * Multithreaded (12 cores) execution requested via `export BLIS_NUM_THREADS=12` + * OpenBLAS 0.3.8 + * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0 USE_LOCKING=1` (single-threaded) + * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=12` (multithreaded) + * Multithreaded (12 cores) execution requested via `export OPENBLAS_NUM_THREADS=12` + * BLASFEO f9b78c6 * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`. * Eigen 3.3.90 - * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019) + * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (36b9596) * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): ``` # These lines added after line 67. @@ -234,13 +259,15 @@ The `runthese.m` file will contain example invocations of the function. 
* configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas` * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. - * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded) - * MKL 2019 update 4 - * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded) - * libxsmm 77a295c (1.6.5-6679) + * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1` + * Multithreaded (12 cores) execution requested via `export OMP_NUM_THREADS=12` + * MKL 2020 initial release + * Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1` + * Multithreaded (12 cores) execution requested via `export MKL_NUM_THREADS=12` + * libxsmm a40a833 (post-1.14) * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally. * Affinity: - * N/A. + * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-11"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset. * Frequency throttling (via `cpupower`): * No changes made. * Comments: @@ -250,15 +277,21 @@ The `runthese.m` file will contain example invocations of the function. 
#### pdf -* [Haswell row-stored](graphs/sup/dgemm_rrr_has_nt1.pdf) -* [Haswell column-stored](graphs/sup/dgemm_ccc_has_nt1.pdf) +* [Haswell single-threaded row-stored](graphs/sup/dgemm_rrr_has_nt1.pdf) +* [Haswell single-threaded column-stored](graphs/sup/dgemm_ccc_has_nt1.pdf) +* [Haswell multithreaded (12 cores) row-stored](graphs/sup/dgemm_rrr_has_nt12.pdf) +* [Haswell multithreaded (12 cores) column-stored](graphs/sup/dgemm_ccc_has_nt12.pdf) #### png (inline) -* **Haswell row-stored** -![row-stored](graphs/sup/dgemm_rrr_has_nt1.png) -* **Haswell column-stored** -![column-stored](graphs/sup/dgemm_ccc_has_nt1.png) +* **Haswell single-threaded row-stored** +![single-threaded row-stored](graphs/sup/dgemm_rrr_has_nt1.png) +* **Haswell single-threaded column-stored** +![single-threaded column-stored](graphs/sup/dgemm_ccc_has_nt1.png) +* **Haswell multithreaded (12 cores) row-stored** +![multithreaded row-stored](graphs/sup/dgemm_rrr_has_nt12.png) +* **Haswell multithreaded (12 cores) column-stored** +![multithreaded column-stored](graphs/sup/dgemm_ccc_has_nt12.png) --- @@ -276,20 +309,26 @@ The `runthese.m` file will contain example invocations of the function. * Alternatively, FMA vector IPC is 2 when vectors are limited to 128 bits each. 
* Peak performance: * single-core: 24 GFLOPS (double-precision), 48 GFLOPS (single-precision) + * multicore: 20.4 GFLOPS/core (double-precision), 40.8 GFLOPS/core (single-precision) * Operating system: Ubuntu 18.04 (Linux kernel 4.15.0) * Page size: 4096 bytes * Compiler: gcc 7.4.0 -* Results gathered: 23-28 August 2019 +* Results gathered: 3 March 2020 * Implementations tested: - * BLIS 4a0a6e8 (0.6.0-28) - * configured with `./configure --enable-cblas auto` - * sub-configuration exercised: `zen` - * OpenBLAS 0.3.7 - * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0` (single-threaded) - * BLASFEO 01f6b7f + * BLIS 90db88e (0.6.1-8) + * configured with `./configure --enable-cblas auto` (single-threaded) + * configured with `./configure --enable-cblas -t openmp auto` (multithreaded) + * sub-configuration exercised: `haswell` + * Multithreaded (32 cores) execution requested via `export BLIS_NUM_THREADS=32` + * OpenBLAS 0.3.8 + * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=0 USE_LOCKING=1` (single-threaded) + * configured `Makefile.rule` with `BINARY=64 NO_LAPACK=1 NO_LAPACKE=1 USE_THREAD=1 NUM_THREADS=32` (multithreaded) + * Multithreaded (32 cores) execution requested via `export OPENBLAS_NUM_THREADS=32` + * BLASFEO f9b78c6 * configured `Makefile.rule` with: `BLAS_API=1 FORTRAN_BLAS_API=1 CBLAS_API=1`. + * built BLAS library via `make CC=gcc` * Eigen 3.3.90 - * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (28 August 2019) + * Obtained via the [Eigen git mirror](https://github.com/eigenteam/eigen-git-mirror) (36b9596) * Prior to compilation, modified top-level `CMakeLists.txt` to ensure that `-march=native` was added to `CXX_FLAGS` variable (h/t Sameer Agarwal): ``` # These lines added after line 67. @@ -301,13 +340,15 @@ The `runthese.m` file will contain example invocations of the function. 
* configured and built BLAS library via `mkdir build; cd build; CC=gcc cmake ..; make blas` * installed headers via `cmake . -DCMAKE_INSTALL_PREFIX=$HOME/flame/eigen; make install` * The `gemm` implementation was pulled in at compile-time via Eigen headers; other operations were linked to Eigen's BLAS library. - * Requested threading via `export OMP_NUM_THREADS=1` (single-threaded) - * MKL 2019 update 4 - * Requested threading via `export MKL_NUM_THREADS=1` (single-threaded) - * libxsmm 77a295c (1.6.5-6679) + * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1` + * Multithreaded (32 cores) execution requested via `export OMP_NUM_THREADS=32` + * MKL 2020 initial release + * Single-threaded (1 core) execution requested via `export MKL_NUM_THREADS=1` + * Multithreaded (32 cores) execution requested via `export MKL_NUM_THREADS=32` + * libxsmm a40a833 (post-1.14) * compiled with `make AVX=2`; linked with [netlib BLAS](http://www.netlib.org/blas/) 3.6.0 as the fallback library to better show where libxsmm stops handling the computation internally. * Affinity: - * N/A. + * Thread affinity for BLIS was specified manually via `GOMP_CPU_AFFINITY="0-31"`. However, multithreaded OpenBLAS appears to revert to single-threaded execution if `GOMP_CPU_AFFINITY` is set. Therefore, when measuring OpenBLAS performance, the `GOMP_CPU_AFFINITY` environment variable was unset. * Frequency throttling (via `cpupower`): * Driver: acpi-cpufreq * Governor: performance @@ -320,15 +361,21 @@ The `runthese.m` file will contain example invocations of the function. 
#### pdf -* [Epyc row-stored](graphs/sup/dgemm_rrr_epyc_nt1.pdf) -* [Epyc column-stored](graphs/sup/dgemm_ccc_epyc_nt1.pdf) +* [Epyc single-threaded row-stored](graphs/sup/dgemm_rrr_epyc_nt1.pdf) +* [Epyc single-threaded column-stored](graphs/sup/dgemm_ccc_epyc_nt1.pdf) +* [Epyc multithreaded (32 cores) row-stored](graphs/sup/dgemm_rrr_epyc_nt32.pdf) +* [Epyc multithreaded (32 cores) column-stored](graphs/sup/dgemm_ccc_epyc_nt32.pdf) #### png (inline) -* **Epyc row-stored** -![row-stored](graphs/sup/dgemm_rrr_epyc_nt1.png) -* **Epyc column-stored** -![column-stored](graphs/sup/dgemm_ccc_epyc_nt1.png) +* **Epyc single-threaded row-stored** +![single-threaded row-stored](graphs/sup/dgemm_rrr_epyc_nt1.png) +* **Epyc single-threaded column-stored** +![single-threaded column-stored](graphs/sup/dgemm_ccc_epyc_nt1.png) +* **Epyc multithreaded (32 cores) row-stored** +![multithreaded row-stored](graphs/sup/dgemm_rrr_epyc_nt32.png) +* **Epyc multithreaded (32 cores) column-stored** +![multithreaded column-stored](graphs/sup/dgemm_ccc_epyc_nt32.png) --- diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md index d1a6baece..e04e97b20 100644 --- a/docs/ReleaseNotes.md +++ b/docs/ReleaseNotes.md @@ -4,6 +4,8 @@ ## Contents +* [Changes in 0.7.0](ReleaseNotes.md#changes-in-070) +* [Changes in 0.6.1](ReleaseNotes.md#changes-in-061) * [Changes in 0.6.0](ReleaseNotes.md#changes-in-060) * [Changes in 0.5.2](ReleaseNotes.md#changes-in-052) * [Changes in 0.5.1](ReleaseNotes.md#changes-in-051) @@ -35,6 +37,90 @@ * [Changes in 0.0.2](ReleaseNotes.md#changes-in-002) * [Changes in 0.0.1](ReleaseNotes.md#changes-in-001) +## Changes in 0.7.0 +April 7, 2020 + +Improvements present in 0.7.0: + +Framework: +- Implemented support for multithreading within the sup (skinny/small/unpacked) framework, which previously was single-threaded only. Note that this feature works harmoniously with the selective packing introduced into the sup framework in 0.6.1. 
(AMD) +- Renamed `bli_thread_obarrier()` and `bli_thread_obroadcast()` functions to drop the 'o', which was left over from when `thrcomm_t` objects tracked both "inner" and "outer" communicators. +- Fixed an obscure `int`-to-`packbuf_t` type conversion error that only affects certain C++ compilers (including g++) when compiling application code that includes the BLIS header file `blis.h`. (Ajay Panyala) +- Added a missing early `return` statement in `bli_thread_partition_2x2()`, which provides a slight optimization. (Kiran Varaganti) + +Kernels: +- Fixed the semantics of the `bli_amaxv()` kernels ('s' and 'd') within the `zen` kernel set. Previously, the kernels (incorrectly) returned the index of the last element whose absolute value was largest (in the event there were multiple of equal value); now, they (correctly) return the index of the first of such elements. The kernels also now return the index of the first NaN, if one is encountered. (Mat Cross, Devin Matthews) + +Build system: +- Warn the user at configure-time when hardware auto-detection returns the `generic` subconfiguration since this is probably not what they were expecting. (Devin Matthews) +- Removed unnecessary sorting (and duplicate removal) on `LDFLAGS` in `common.mk`. (Isuru Fernando) +- Specify the full path to the location of the dynamic library on OSX so that other dynamic libraries that depend on BLIS know where to find the library. (Satish Balay, Jed Brown) + +Testing: +- Updated and reorganized test drivers in `test/sup` so that they work for either single-threaded or multithreaded purposes. (AMD) +- Updated/optimized octave scripts in `test/sup` for use with octave 5.2.0. +- Minor updates/tweaks to `test/1m4m`. + +Documentation: +- Updated existing single-threaded sup performance graphs with new data and added multithreaded sup graphs to `docs/PerformanceSmall.md`. +- Added mention of Gentoo support under the external packages section of the `README.md`.
+- Tweaks to `docs/Multithreading.md` that clarify that setting any `BLIS_*_NT` variable to 1 will be considered manual specification for the purposes of determining whether to auto-factorize via `BLIS_NUM_THREADS`. (AMD) + +## Changes in 0.6.1 +January 14, 2020 + +Improvements present in 0.6.1: + +Framework: +- Added support for pre-broadcast when packing B. This causes elements of B to be repeated (broadcast) in the packed copy of B so that subsequent vector loads will result in the element already being pre-broadcast into the vector register. +- Added support for selective packing to `gemmsup` (controlled via environment variables and/or the `rntm_t` object). (AMD) +- Fixed a bug in `sdsdot_sub()` that redundantly added the "alpha" scalar and a separate bug in the order of typecasting intermediate products in `sdsdot_()`. (Simon Lukas Märtens, Devin Matthews) +- Fixed an obscure bug in `bli_acquire_mpart_mdim()`/`bli_acquire_mpart_ndim()`. (Minh Quan Ho) +- Fixed a subtle and complicated bug that only manifested via the BLAS test drivers in the `generic` subconfiguration, and possibly any other subconfiguration that did not register complex-domain `gemm` ukernels, or registered ONLY real-domain ukernels as row-preferential. (Dave Love) +- Always use `sumsqv` to compute `normfv` instead of the "dot product trick" that was previously employed for performance reasons. (Roman Yurchak, Devin Matthews, and Isuru Fernando) +- Fixed bug in `thrinfo_t` debugging/printing code. + +Kernels: +- Implemented and registered an optimized `dgemm` microkernel for the `power9` kernel set. (Nicholai Tukanov) +- Pacify a `restrict` warning in the `gemmtrsm4m1` reference ukernel. (Dave Love, Devin Matthews) + +Build system: +- Fixed parsing in `vpu_count()` on some SkylakeX workstations. (Dave Love) +- Reimplemented `bli_cpuid_query()` for ARM to use `stdio`-based functions instead of `popen()`. (Dave Love) +- Use `-march=znver1` for clang on `zen2` subconfig. 
+- Updated `-march` flags for `sandybridge`, `haswell` subconfigurations to use newer syntax (e.g. `haswell` instead of `core-avx2` and `sandybridge` instead of `corei7-avx`). +- Correctly use `-qopenmp-simd` for reference kernels when compiling with icc. (Victor Eijkhout) +- Added `-march` support for select gcc version ranges where flag syntax changes or new flags are added. The ranges we identify are: versions older than 4.9.0; versions older than 6.1.0 (but newer than 4.9.0); versions older than 9.1.0 (but newer than 6.1.0). +- Use `-funsafe-math-optimizations` and `-ffp-contract=fast` for all reference kernels when using gcc or clang. +- Updated MC cache blocksizes used by `haswell` subconfig. +- Updated NC cache blocksizes used by `zen` subconfig. +- Fixed a typo in the context registration of the `cortexa53` subconfiguration in `bli_gks.c`. (Francisco Igual) +- Output a more informative error when the user manually targets a subconfiguration that configure places in the configuration blacklist. (Tze Meng Low) +- Set execute bits of shared library at install-time. (Adam J. Stewart) +- Added missing thread-related symbols for export to shared libraries. (Kyungmin Lee) +- Removed (finally) the `attic/windows` directory since we offer Windows DLL support via AppVeyor's build artifacts, and thus that directory was only likely confusing people. + +Testing: +- Fixed latent testsuite microkernel module bug for `power9` subconfig. (Jeff Hammond) +- Added `test/1m4m` driver directory for test drivers related to the 1m paper. +- Added libxsmm support to `test/sup` drivers. (Robert van de Geijn) +- Updated `.travis.yml` and `do_sde.sh` to automatically accept SDE license and download SDE directly from Intel. (Devin Matthews, Jeff Hammond) +- Updated standalone test drivers to iterate backwards through the specified problem space.
This often helps avoid the situation whereby the CPU doesn't immediately throttle up to its maximum clock frequency, which can produce strange discontinuities (sharply rising "cliffs") in performance graphs. +- Pacify an unused variable warning in `blastest/f2c/lread.c`. (Jeff Hammond) +- Various other minor fixes/tweaks to test drivers. + +Documentation: +- Added libxsmm results to `docs/PerformanceSmall.md`. +- Added BLASFEO results to `docs/PerformanceSmall.md`. +- Added the page size and location of the performance drivers to `docs/Performance.md` and `docs/PerformanceSmall.md`. (Dave Love) +- Added notes to `docs/Multithreading.md` regarding the nuances of setting multithreading parameters the manual way vs. the automatic way. (Jérémie du Boisberranger) +- Added a section on reproduction to `docs/Performance.md` and `docs/PerformanceSmall.md`. (Dave Love) +- Documented Eigen `-march=native` hack in `docs/Performance.md` and `docs/PerformanceSmall.md`. (Sameer Agarwal) +- Inserted multithreading links and disclaimers to `BuildSystem.md`. (Jeff Diamond) +- Fixed typo in description for `bli_?axpy2v()` in `docs/BLISTypedAPI.md`. (Shmuel Levine) +- Added "How to Download BLIS" section to `README.md`. (Jeff Diamond) +- Various other minor documentation fixes. 
+ ## Changes in 0.6.0 June 3, 2019 diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf b/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf index 1d272c7b4..eafba82d4 100644 Binary files a/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf and b/docs/graphs/sup/dgemm_ccc_epyc_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt1.png b/docs/graphs/sup/dgemm_ccc_epyc_nt1.png index 200a29426..ceeb08426 100644 Binary files a/docs/graphs/sup/dgemm_ccc_epyc_nt1.png and b/docs/graphs/sup/dgemm_ccc_epyc_nt1.png differ diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt32.pdf b/docs/graphs/sup/dgemm_ccc_epyc_nt32.pdf new file mode 100644 index 000000000..f2137eaba Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_epyc_nt32.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_epyc_nt32.png b/docs/graphs/sup/dgemm_ccc_epyc_nt32.png new file mode 100644 index 000000000..09958337f Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_epyc_nt32.png differ diff --git a/docs/graphs/sup/dgemm_ccc_has_nt1.pdf b/docs/graphs/sup/dgemm_ccc_has_nt1.pdf index 2614c65a5..75a7502ab 100644 Binary files a/docs/graphs/sup/dgemm_ccc_has_nt1.pdf and b/docs/graphs/sup/dgemm_ccc_has_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_has_nt1.png b/docs/graphs/sup/dgemm_ccc_has_nt1.png index 34ea1eee4..527cca0c7 100644 Binary files a/docs/graphs/sup/dgemm_ccc_has_nt1.png and b/docs/graphs/sup/dgemm_ccc_has_nt1.png differ diff --git a/docs/graphs/sup/dgemm_ccc_has_nt12.pdf b/docs/graphs/sup/dgemm_ccc_has_nt12.pdf new file mode 100644 index 000000000..b598c83f9 Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_has_nt12.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_has_nt12.png b/docs/graphs/sup/dgemm_ccc_has_nt12.png new file mode 100644 index 000000000..e50a72753 Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_has_nt12.png differ diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf b/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf index ea39829f9..f30c4fac9 100644 Binary files 
a/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf and b/docs/graphs/sup/dgemm_ccc_kbl_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt1.png b/docs/graphs/sup/dgemm_ccc_kbl_nt1.png index 6cf1c58de..fc86fb0c2 100644 Binary files a/docs/graphs/sup/dgemm_ccc_kbl_nt1.png and b/docs/graphs/sup/dgemm_ccc_kbl_nt1.png differ diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt4.pdf b/docs/graphs/sup/dgemm_ccc_kbl_nt4.pdf new file mode 100644 index 000000000..d022b7076 Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_kbl_nt4.pdf differ diff --git a/docs/graphs/sup/dgemm_ccc_kbl_nt4.png b/docs/graphs/sup/dgemm_ccc_kbl_nt4.png new file mode 100644 index 000000000..3adefb653 Binary files /dev/null and b/docs/graphs/sup/dgemm_ccc_kbl_nt4.png differ diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf b/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf index ff7ea7055..896844d3c 100644 Binary files a/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf and b/docs/graphs/sup/dgemm_rrr_epyc_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt1.png b/docs/graphs/sup/dgemm_rrr_epyc_nt1.png index 2a7b7a397..ada4c1769 100644 Binary files a/docs/graphs/sup/dgemm_rrr_epyc_nt1.png and b/docs/graphs/sup/dgemm_rrr_epyc_nt1.png differ diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt32.pdf b/docs/graphs/sup/dgemm_rrr_epyc_nt32.pdf new file mode 100644 index 000000000..75f8eb62a Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_epyc_nt32.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_epyc_nt32.png b/docs/graphs/sup/dgemm_rrr_epyc_nt32.png new file mode 100644 index 000000000..7607c91cb Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_epyc_nt32.png differ diff --git a/docs/graphs/sup/dgemm_rrr_has_nt1.pdf b/docs/graphs/sup/dgemm_rrr_has_nt1.pdf index e1dc609a0..eff479579 100644 Binary files a/docs/graphs/sup/dgemm_rrr_has_nt1.pdf and b/docs/graphs/sup/dgemm_rrr_has_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_has_nt1.png b/docs/graphs/sup/dgemm_rrr_has_nt1.png index c8b47b85a..084369e0e 
100644 Binary files a/docs/graphs/sup/dgemm_rrr_has_nt1.png and b/docs/graphs/sup/dgemm_rrr_has_nt1.png differ diff --git a/docs/graphs/sup/dgemm_rrr_has_nt12.pdf b/docs/graphs/sup/dgemm_rrr_has_nt12.pdf new file mode 100644 index 000000000..bd9ad99b2 Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_has_nt12.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_has_nt12.png b/docs/graphs/sup/dgemm_rrr_has_nt12.png new file mode 100644 index 000000000..a404a2eda Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_has_nt12.png differ diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf b/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf index 5715c130a..ba3c87d88 100644 Binary files a/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf and b/docs/graphs/sup/dgemm_rrr_kbl_nt1.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt1.png b/docs/graphs/sup/dgemm_rrr_kbl_nt1.png index cd781c407..96386bf80 100644 Binary files a/docs/graphs/sup/dgemm_rrr_kbl_nt1.png and b/docs/graphs/sup/dgemm_rrr_kbl_nt1.png differ diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt4.pdf b/docs/graphs/sup/dgemm_rrr_kbl_nt4.pdf new file mode 100644 index 000000000..2fe3fddf0 Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_kbl_nt4.pdf differ diff --git a/docs/graphs/sup/dgemm_rrr_kbl_nt4.png b/docs/graphs/sup/dgemm_rrr_kbl_nt4.png new file mode 100644 index 000000000..535ce244d Binary files /dev/null and b/docs/graphs/sup/dgemm_rrr_kbl_nt4.png differ diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 0c19829f2..46fd42d86 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -667,7 +667,7 @@ if ( col_stored ) { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -678,7 +678,7 @@ 
bli_thread_obarrier( thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ } \ else { \ if ( bli_thread_work_id( thread ) == 0 ) \ @@ -691,7 +691,7 @@ else { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -702,7 +702,7 @@ bli_thread_obarrier( thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ } \ */ /* diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index f4c8ab82d..550a8fb87 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -73,6 +73,6 @@ void bli_unpackm_int } // Barrier so that unpacking is done before computation. - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); } diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index b5dc17d4a..d6efb593c 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -50,7 +50,7 @@ void bli_l3_packm siz_t size_needed; // FGVZ: Not sure why we need this barrier, but we do. - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); // Every thread initializes x_pack and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t @@ -102,7 +102,7 @@ void bli_l3_packm // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); // Save the contents of the chief thread's local mem_t entry to the // mem_t field in this thread's control tree node. 
@@ -146,7 +146,7 @@ void bli_l3_packm // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); // Save the chief thread's local mem_t entry to the mem_t field in // this thread's control tree node. @@ -159,7 +159,7 @@ void bli_l3_packm // will already have the cached values in their local control // trees' mem_t entries, currently pointed to by cntl_mem_p. - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); } } @@ -182,6 +182,6 @@ void bli_l3_packm ); // Barrier so that packing is done before computation. - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); } diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c index 9cab536c6..7b1213299 100644 --- a/frame/3/bli_l3_sup_packm_a.c +++ b/frame/3/bli_l3_sup_packm_a.c @@ -67,7 +67,7 @@ void PASTEMAC(ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_obarrier( thread ); \ + bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ @@ -97,7 +97,7 @@ void PASTEMAC(ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -146,7 +146,7 @@ void PASTEMAC(ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. 
(The @@ -422,7 +422,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Barrier so that packing is done before computation. */ \ - bli_thread_obarrier( thread ); \ + bli_thread_barrier( thread ); \ } \ } diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c index 41505ec45..37fde1466 100644 --- a/frame/3/bli_l3_sup_packm_b.c +++ b/frame/3/bli_l3_sup_packm_b.c @@ -67,7 +67,7 @@ void PASTEMAC(ch,opname) \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ - bli_thread_obarrier( thread ); \ + bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ @@ -97,7 +97,7 @@ void PASTEMAC(ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -146,7 +146,7 @@ void PASTEMAC(ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The @@ -422,7 +422,7 @@ void PASTEMAC(ch,opname) \ } \ \ /* Barrier so that packing is done before computation. 
*/ \ - bli_thread_obarrier( thread ); \ + bli_thread_barrier( thread ); \ } \ } diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index b3593b839..6699b458f 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -237,7 +237,7 @@ if ( col_stored ) { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -248,7 +248,7 @@ bli_thread_obarrier( thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ } \ else { \ if ( bli_thread_work_id( thread ) == 0 ) \ @@ -261,7 +261,7 @@ else { \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ @@ -272,7 +272,7 @@ bli_thread_obarrier( thread ); \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ -bli_thread_obarrier( thread ); \ +bli_thread_barrier( thread ); \ } \ */ /* diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index ef0a0d987..5dfe4fb92 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -674,7 +674,7 @@ void PASTEMAC(ch,varname) \ \ /* NOTE: This barrier is only needed if we are packing A (since that matrix is packed within the pc loop of this variant). 
*/ \ - if ( packa ) bli_thread_obarrier( thread_pa ); \ + if ( packa ) bli_thread_barrier( thread_pa ); \ } \ } \ \ @@ -1292,7 +1292,7 @@ void PASTEMAC(ch,varname) \ \ /* NOTE: This barrier is only needed if we are packing B (since that matrix is packed within the pc loop of this variant). */ \ - if ( packb ) bli_thread_obarrier( thread_pb ); \ + if ( packb ) bli_thread_barrier( thread_pb ); \ } \ } \ \ diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 0c236f6d1..94f0af409 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -84,7 +84,7 @@ void bli_gemm_blk_var3 bli_thrinfo_sub_node( thread ) ); - bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); + bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 5128bea69..d6633c6d6 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -66,7 +66,7 @@ void bli_gemm_int { if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); return; } @@ -80,7 +80,7 @@ void bli_gemm_int if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); return; } diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 11d35d88b..2554197f7 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -123,7 +123,7 @@ void bli_trsm_blk_var1 // We must execute a barrier here because the upcoming rank-k update // requires the packed matrix B to be fully updated by the trsm // subproblem. 
- bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); // Isolate the remaining part of the column panel matrix A, which we do by // acquiring the subpartition ahead of A11 (that is, A21 or A01, depending diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index ee7c2f9ac..a68cc853b 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -85,7 +85,7 @@ void bli_trsm_blk_var3 ); //bli_thread_ibarrier( thread ); - bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); + bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure diff --git a/frame/3/trsm/bli_trsm_int.c b/frame/3/trsm/bli_trsm_int.c index dc39e69e0..53a22c355 100644 --- a/frame/3/trsm/bli_trsm_int.c +++ b/frame/3/trsm/bli_trsm_int.c @@ -68,7 +68,7 @@ void bli_trsm_int { if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c ); - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); return; } @@ -119,7 +119,7 @@ void bli_trsm_int } // FGVZ->TMS: Is this barrier still needed? - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); // Create the next node in the thrinfo_t structure. bli_thrinfo_grow( rntm, cntl, thread ); diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 4f1f9fb93..06b23ed1a 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -74,6 +74,12 @@ void bli_arch_set_id_once( void ) void bli_arch_set_id( void ) { + // NOTE: Change this usage of getenv() to bli_env_get_var() after + // merging #351. + //bool_t do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 ); + bool_t do_logging = getenv( "BLIS_ARCH_DEBUG" ) != NULL; + bli_arch_set_logging( do_logging ); + // Architecture families. 
#if defined BLIS_FAMILY_INTEL64 || \ defined BLIS_FAMILY_AMD64 || \ @@ -156,6 +162,10 @@ void bli_arch_set_id( void ) id = BLIS_ARCH_GENERIC; #endif + if ( bli_arch_get_logging() ) + fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n", + bli_arch_string( id ) ); + //printf( "blis_arch_query_id(): id = %u\n", id ); //exit(1); } @@ -200,3 +210,37 @@ char* bli_arch_string( arch_t id ) return config_name[ id ]; } +// ----------------------------------------------------------------------------- + +static bool_t arch_dolog = 0; + +void bli_arch_set_logging( bool_t dolog ) +{ + arch_dolog = dolog; +} + +bool_t bli_arch_get_logging( void ) +{ + return arch_dolog; +} + +void bli_arch_log( char* fmt, ... ) +{ + char prefix[] = "libblis: "; + int n_chars = strlen( prefix ) + strlen( fmt ) + 1; + + if ( bli_arch_get_logging() && fmt ) + { + char* prefix_fmt = malloc( n_chars ); + + snprintf( prefix_fmt, n_chars, "%s%s", prefix, fmt ); + + va_list ap; + va_start( ap, fmt ); + vfprintf( stderr, prefix_fmt, ap ); + va_end( ap ); + + free( prefix_fmt ); + } +} + diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h index 6b8a38ebd..4f3f94a7e 100644 --- a/frame/base/bli_arch.h +++ b/frame/base/bli_arch.h @@ -42,6 +42,9 @@ void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); +void bli_arch_set_logging( bool_t dolog ); +bool_t bli_arch_get_logging( void ); +void bli_arch_log( char*, ... ); #endif diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c index 690e22304..5858c88a7 100644 --- a/frame/base/bli_cpuid.c +++ b/frame/base/bli_cpuid.c @@ -6,6 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018-2019, Advanced Micro Devices, Inc. 
+ Copyright (C) 2019, Dave Love, University of Manchester Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -52,6 +53,7 @@ #include "bli_cpuid.h" #else #include "blis.h" + #include "bli_arch.h" #endif // ----------------------------------------------------------------------------- @@ -167,7 +169,22 @@ bool_t bli_cpuid_is_skx int nvpu = vpu_count(); - if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 ) + if ( bli_cpuid_has_features( features, expected ) ) + { + switch ( nvpu ) + { + case 1: + bli_arch_log( "Hardware has 1 FMA unit; using 'haswell' (not 'skx') sub-config.\n" ); + return FALSE; + case 2: + bli_arch_log( "Hardware has 2 FMA units; using 'skx' sub-config.\n" ); + return TRUE; + default: + bli_arch_log( "Number of FMA units unknown; using 'haswell' (not 'skx') config.\n" ); + return FALSE; + } + } + else return FALSE; return TRUE; @@ -893,6 +910,10 @@ void get_cpu_name( char *cpu_name ) *( uint32_t* )&cpu_name[32+12] = edx; } +// Return the number of FMA units _assuming avx512 is supported_. +// This needs updating for new processor types, sigh. 
+// See https://ark.intel.com/content/www/us/en/ark.html#@Processors +// and also https://github.com/jeffhammond/vpu-count int vpu_count( void ) { char cpu_name[48] = {}; @@ -904,49 +925,59 @@ int vpu_count( void ) if ( strstr( cpu_name, "Intel(R) Xeon(R)" ) != NULL ) { - loc = strstr( cpu_name, "Platinum" ); + if (( loc = strstr( cpu_name, "Platinum" ) )) + return 2; if ( loc == NULL ) - loc = strstr( cpu_name, "Gold" ); + loc = strstr( cpu_name, "Gold" ); // 1 or 2, tested below if ( loc == NULL ) - loc = strstr( cpu_name, "Silver" ); + if (( loc = strstr( cpu_name, "Silver" ) )) + return 1; if ( loc == NULL ) - loc = strstr( cpu_name, "Bronze" ); + if (( loc = strstr( cpu_name, "Bronze" ) )) + return 1; if ( loc == NULL ) loc = strstr( cpu_name, "W" ); + if ( loc == NULL ) + if (( loc = strstr( cpu_name, "D" ) )) + // Fixme: May be wrong + // + return 1; if ( loc == NULL ) return -1; - loc = strstr( loc+1, " " ); + // We may have W-nnnn rather than, say, Gold nnnn + if ( 'W' == *loc && '-' == *(loc+1) ) + loc++; + else + loc = strstr( loc+1, " " ); if ( loc == NULL ) return -1; strncpy( model_num, loc+1, 4 ); - model_num[4] = '\0'; + model_num[4] = '\0'; // Things like i9-10900X matched above sku = atoi( model_num ); + // These were derived from ARK listings as of 2019-10-09, but + // may not be complete, especially as the ARK Skylake listing + // seems to be limited. 
if ( 8199 >= sku && sku >= 8100 ) return 2; else if ( 6199 >= sku && sku >= 6100 ) return 2; else if ( sku == 5122 ) return 2; + else if ( 6299 >= sku && sku >= 6200 ) return 2; // Cascade Lake Gold + else if ( 5299 >= sku && sku >= 5200 ) return 1; // Cascade Lake Gold else if ( 5199 >= sku && sku >= 5100 ) return 1; else if ( 4199 >= sku && sku >= 4100 ) return 1; else if ( 3199 >= sku && sku >= 3100 ) return 1; + else if ( 3299 >= sku && sku >= 3200 ) return 2; // Cascade Lake W + else if ( 2299 >= sku && sku >= 2200 ) return 2; // Cascade Lake W else if ( 2199 >= sku && sku >= 2120 ) return 2; + else if ( 2102 == sku || sku == 2104 ) return 2; // Gold exceptions else if ( 2119 >= sku && sku >= 2100 ) return 1; else return -1; } - else if ( strstr( cpu_name, "Intel(R) Core(TM) i9" ) != NULL ) - { - return 1; - } - else if ( strstr( cpu_name, "Intel(R) Core(TM) i7" ) != NULL ) - { - if ( strstr( cpu_name, "7800X" ) != NULL || - strstr( cpu_name, "7820X" ) != NULL ) - return 1; - else - return -1; - } + else if ( strstr( cpu_name, "Intel(R) Core(TM)" ) != NULL ) + return 2; // All i7/i9 with avx512? else { return -1; @@ -1082,3 +1113,4 @@ char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath } #endif + diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index 88d7aaa82..9edaf47ae 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -126,8 +126,8 @@ static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want ) #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 -// // for more information why this move was made. -// //#include "cpuid.h" +// for more information why this move was made. 
+//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index e6879393d..d7805ba99 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -153,7 +153,7 @@ static void bli_mem_clear( mem_t* mem ) //Pass actual type instead bli_mem_set_buf_type ( pb, mem ); #else - bli_mem_set_buf_type( -1, mem ); + bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c index eaf2022af..cfdff957d 100644 --- a/frame/compat/bla_dot.c +++ b/frame/compat/bla_dot.c @@ -264,10 +264,16 @@ float PASTEF77(sd,sdot) const float* y, const f77_int* incy ) { - float r = ( float )PASTEF77(d,sdot)( n, - x, incx, - y, incy ); - return r + *sb; + return ( float ) + ( + ( double )(*sb) + + PASTEF77(d,sdot) + ( + n, + x, incx, + y, incy + ) + ); } // Input vectors stored in single precision, computed in double precision, diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h index e1a7321a4..24015074b 100644 --- a/frame/compat/bli_blas.h +++ b/frame/compat/bli_blas.h @@ -99,6 +99,7 @@ #include "bla_lsame.h" #include "bla_xerbla.h" +#include "bla_xerbla_array.h" // -- Level-0 BLAS prototypes -- diff --git a/frame/compat/cblas/f77_sub/f77_dot_sub.c b/frame/compat/cblas/f77_sub/f77_dot_sub.c index 6c06133f1..8667791fb 100644 --- a/frame/compat/cblas/f77_sub/f77_dot_sub.c +++ b/frame/compat/cblas/f77_sub/f77_dot_sub.c @@ -75,7 +75,7 @@ void PASTEF772(sds,dot,sub) float* rval ) { - *rval = *sb + PASTEF77(sds,dot) + *rval = PASTEF77(sds,dot) ( n, sb, diff --git a/frame/compat/f2c/bla_xerbla_array.c b/frame/compat/f2c/bla_xerbla_array.c new file mode 100644 index 000000000..722bb2914 --- /dev/null +++ b/frame/compat/f2c/bla_xerbla_array.c @@ -0,0 +1,74 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_BLAS + +#define MAX_NUM_CHARS 32 + +int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info) +{ + int i; +#if 1 + // 01234567890123456789012345678901 + char srname[ MAX_NUM_CHARS + 1 ] = " "; +#else + char srname[ MAX_NUM_CHARS + 1 ]; + + // Initialize srname to contain blank characters. 
+ for ( i = 0; i < MAX_NUM_CHARS; ++i ) srname[i] = ' '; +#endif + + // Compute the number of chars to copy as the minimum of the length of + // srname_array and MAX_NUM_CHARS. + const int n_copy = bli_min( srname_len, MAX_NUM_CHARS ); + + // Copy over each element of srname_array. + for ( i = 0; i < n_copy; ++i ) + { + srname[i] = srname_array[i]; + } + + // NULL terminate. + srname[i] = '\0'; + + // Call xerbla_(). + PASTEF770(xerbla)( srname, info, ( ftnlen )srname_len ); + + return 0; +} + +#endif + diff --git a/frame/compat/f2c/bla_xerbla_array.h b/frame/compat/f2c/bla_xerbla_array.h new file mode 100644 index 000000000..6a4b4e059 --- /dev/null +++ b/frame/compat/f2c/bla_xerbla_array.h @@ -0,0 +1,39 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef BLIS_ENABLE_BLAS + +BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info); + +#endif diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 1dd722ed3..45ba22c3c 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -265,6 +265,9 @@ CNTX_INIT_PROTS( generic ) // -- ARM architectures -- +#ifdef BLIS_KERNELS_ARMSVE +#include "bli_kernels_armsve.h" +#endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" #endif diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index 9ec534d3b..ab06b7a35 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -56,8 +56,9 @@ thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); -void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); -void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); + +BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); +BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index b9e493473..687e56afc 100644 
--- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -1062,7 +1062,6 @@ void bli_thread_partition_2x2 { *nt1 = ( work1 >= work2 ? n_thread : 1 ); *nt2 = ( work1 < work2 ? n_thread : 1 ); - return; } diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 4dc946eff..3cbd5a2d1 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -67,6 +67,7 @@ void bli_thread_finalize( void ); // Thread range-related prototypes. +BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 28266d837..50c9c9822 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -340,7 +340,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl // Broadcast the temporary array to all threads in the parent's // communicator. - new_comms = bli_thread_obroadcast( thread_par, new_comms ); + new_comms = bli_thread_broadcast( thread_par, new_comms ); // Chiefs in the child communicator allocate the communicator // object and store it in the array element corresponding to the @@ -348,7 +348,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl if ( child_comm_id == 0 ) new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - bli_thread_obarrier( thread_par ); + bli_thread_barrier( thread_par ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. @@ -364,7 +364,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl NULL // sub_node ); - bli_thread_obarrier( thread_par ); + bli_thread_barrier( thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. 
@@ -477,7 +477,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode const dim_t child_comm_id = parent_comm_id % child_nt_in; const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - bli_thread_obarrier( thread_par ); + bli_thread_barrier( thread_par ); // NOTE: Recall that parent_comm_id == child_comm_id, so checking for the // parent's chief-ness is equivalent to checking for chief-ness in the new @@ -488,7 +488,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode // Broadcast the new thrcomm_t address to the other threads in the // parent's group. - new_comm = bli_thread_obroadcast( thread_par, new_comm ); + new_comm = bli_thread_broadcast( thread_par, new_comm ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. @@ -504,7 +504,7 @@ thrinfo_t* bli_thrinfo_create_for_cntl_prenode NULL // sub_node ); - bli_thread_obarrier( thread_par ); + bli_thread_barrier( thread_par ); return thread_chl; } diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 33f0f4323..8d197f9ed 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -171,12 +171,12 @@ static void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) // other thrinfo_t-related functions -static void* bli_thread_obroadcast( thrinfo_t* t, void* p ) +static void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } -static void bli_thread_obarrier( thrinfo_t* t ) +static void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index 416cd9799..e67e8b642 100644 --- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -205,7 +205,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl // Broadcast the temporary array to all threads in the parent's // communicator. 
- new_comms = bli_thread_obroadcast( thread_par, new_comms ); + new_comms = bli_thread_broadcast( thread_par, new_comms ); // Chiefs in the child communicator allocate the communicator // object and store it in the array element corresponding to the @@ -213,7 +213,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl if ( child_comm_id == 0 ) new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - bli_thread_obarrier( thread_par ); + bli_thread_barrier( thread_par ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. @@ -229,7 +229,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl NULL // sub_node ); - bli_thread_obarrier( thread_par ); + bli_thread_barrier( thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c new file mode 100644 index 000000000..82def6df7 --- /dev/null +++ b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c @@ -0,0 +1,235 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef __ARM_FEATURE_SVE +#include +#else +#error "No Arm SVE intrinsics support in compiler" +#endif // __ARM_FEATURE_SVE + +// assumption: +// SVE vector length = 256 bits. +// + +void bli_dpackm_armsve256_asm_8xk + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t n_, + dim_t n_max_, + void* restrict kappa_, + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_, + cntx_t* restrict cntx + ) +{ + double* a = ( double* )a_; + double* p = ( double* )p_; + double* kappa = ( double* )kappa_; + const int64_t cdim = cdim_; + const int64_t mnr = 8; + const int64_t n = n_; + const int64_t n_max = n_max_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + double* restrict alpha1 = a; + double* restrict alpha1_4 = alpha1 + 4 * inca; + double* restrict pi1 = p; + const svbool_t all_active = svptrue_b64(); + svfloat64_t z_a0; + svfloat64_t z_a4; + svuint64_t z_index; + + // creating index for gather/scatter + // with each element as: 0, 1*inca, 2*inca, 3*inca + z_index = svindex_u64( 0, inca * sizeof( double ) ); + + if ( cdim == mnr ) + { + if ( bli_deq1( *kappa ) ) + { + if ( inca == 1 ) // continous memory. 
packA style + { + for ( dim_t k = n; k != 0; --k ) + { + // svld1_f64 retrieves all zero's into z_a0 and z_a4, + // which is not correct. + // qemu-aarch64 or gcc interpretation of svld1_f64 + // should be blamed. + + // load 8 continuous elments from *a + // z_a0 = svld1_f64( all_active, alpha1 ); + // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 ); + + // as a workaround, using gather load + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store. packB style + { + for ( dim_t k = n; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // scatter store into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + } + else // *kappa != 1.0 + { + // load kappa into vector + svfloat64_t z_kappa; + + z_kappa = svdup_f64( *kappa ); + + if ( inca == 1 ) // continous memory. packA style + { + for ( dim_t k = n; k != 0; --k ) + { + // load 8 continuous elments from *a + // z_a0 = svld1_f64( all_active, alpha1 ); + // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 ); + // same reason as above. 
as a workaround, using gather load + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store. packB style + { + for ( dim_t k = n; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 ); + + // scatter store into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + } // end of if ( *kappa == 1.0 ) + } + else // if ( cdim < mnr ) + { + bli_dscal2m_ex + ( + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + ( trans_t )conja, + cdim, + n, + kappa, + a, inca, lda, + p, 1, ldp, + cntx, + NULL + ); + + // if ( cdim < mnr ) + { + const dim_t i = cdim; + const dim_t m_edge = mnr - i; + const dim_t n_edge = n_max; + double* restrict p_edge = p + (i )*1; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } + } + + if ( n < n_max ) + { + const dim_t j = n; + const dim_t m_edge = mnr; + const dim_t n_edge = n_max - j; + double* restrict p_edge = p + (j )*ldp; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } +} diff --git a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c new file mode 100644 index 000000000..01bb644b1 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c @@ -0,0 +1,809 @@ +/* + + BLIS + An object-based 
framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" + +/* + o 8x8 Double precision micro-kernel + o Runnable on ARMv8a with SVE 256 feature, compiled with aarch64 GCC. + o Tested on qemu-aarch64 and armie for SVE. + + Preconditions: + - to use this kernel, SVE with vector length of 256 bits is a must. + + April 2020. 
+*/ +void bli_dgemm_armsve256_asm_8x8 + ( + dim_t k0, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +__asm__ volatile +( +" \n\t" +" ldr x0,%[aaddr] \n\t" // Load address of A +" ldr x1,%[baddr] \n\t" // Load address of B +" ldr x2,%[caddr] \n\t" // Load address of C +" \n\t" +" ldr x3,%[a_next] \n\t" // Move pointer +" ldr x4,%[b_next] \n\t" // Move pointer +" \n\t" +" ldr x5,%[k_iter] \n\t" // Init guard (k_iter) +" ldr x6,%[k_left] \n\t" // Init guard (k_iter) +" \n\t" +" ldr x7,%[alpha] \n\t" // Alpha address +" ldr x8,%[beta] \n\t" // Beta address +" \n\t" +" ldr x9,%[cs_c] \n\t" // Load cs_c +" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) +" \n\t" +" ldr x13,%[rs_c] \n\t" // Load rs_c. +" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). +" \n\t" +" add x20,x2,x10 \n\t" //Load address Column 1 of C +" add x21,x20,x10 \n\t" //Load address Column 2 of C +" add x22,x21,x10 \n\t" //Load address Column 3 of C +" add x23,x22,x10 \n\t" //Load address Column 4 of C +" add x24,x23,x10 \n\t" //Load address Column 5 of C +" add x25,x24,x10 \n\t" //Load address Column 6 of C +" add x26,x25,x10 \n\t" //Load address Column 7 of C +" \n\t" +" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[x20] \n\t" // Prefetch c. +" prfm pldl1keep,[x21] \n\t" // Prefetch c. +" prfm pldl1keep,[x22] \n\t" // Prefetch c. +" prfm pldl1keep,[x23] \n\t" // Prefetch c. +" prfm pldl1keep,[x24] \n\t" // Prefetch c. +" prfm pldl1keep,[x25] \n\t" // Prefetch c. +" prfm pldl1keep,[x26] \n\t" // Prefetch c. 
+" \n\t" +" ldr z0, [x0] \n\t" // Load a +" ldr z1, [x0, #1, MUL VL] \n\t" +" \n\t" +" ptrue p0.d, all \n\t" +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // PRFM, the following prefetch on [x1] and [x0] +" \n\t" // is for b rows 4..7 and a columns 4..7. +" \n\t" // both of them will be used in next iteration +" \n\t" // of k_iter (unrolled per 4 loops) +" \n\t" +" dup z16.d, #0 \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #256] \n\t" // prefetch b row no.4 +" dup z17.d, #0 \n\t" // Vector for accummulating column 0 +" prfm PLDL1KEEP, [x1, #320] \n\t" // prefetch b row no.5 +" dup z18.d, #0 \n\t" // Vector for accummulating column 1 +" prfm PLDL1KEEP, [x1, #384] \n\t" // prefetch b row no.6 +" dup z19.d, #0 \n\t" // Vector for accummulating column 1 +" prfm PLDL1KEEP, [x1, #448] \n\t" // preftech b row no.7 +" dup z20.d, #0 \n\t" // Vector for accummulating column 2 +" dup z21.d, #0 \n\t" // Vector for accummulating column 2 +" \n\t" +" dup z22.d, #0 \n\t" // Vector for accummulating column 3 +" prfm PLDL1KEEP, [x0, #256] \n\t" // prefetch a col. no.4 +" dup z23.d, #0 \n\t" // Vector for accummulating column 3 +" prfm PLDL1KEEP, [x0, #320] \n\t" // prefetch a col. no.5 +" dup z24.d, #0 \n\t" // Vector for accummulating column 4 +" prfm PLDL1KEEP, [x0, #384] \n\t" // prefetch a col. no.6 +" dup z25.d, #0 \n\t" // Vector for accummulating column 4 +" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. 
no.7 +" dup z26.d, #0 \n\t" // Vector for accummulating column 5 +" dup z27.d, #0 \n\t" // Vector for accummulating column 5 +" \n\t" +" dup z28.d, #0 \n\t" // Vector for accummulating column 6 +" dup z29.d, #0 \n\t" // Vector for accummulating column 6 +" dup z30.d, #0 \n\t" // Vector for accummulating column 7 +" dup z31.d, #0 \n\t" // Vector for accummulating column 7 +" \n\t" +" \n\t" +" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. +" beq .DCONSIDERKLEFT \n\t" +" \n\t" +" add x0, x0, #64 \n\t" //update address of A +" add x1, x1, #64 \n\t" //update address of B +" \n\t" +" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. +" beq .DLASTITER \n\t" // (as loop is do-while-like). +" \n\t" +" DLOOP: \n\t" // Body +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.8, 512-64=448 +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #512] \n\t" // prefetch b row no.9 +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" prfm PLDL1KEEP, [x1, #576] \n\t" // prefetch b row no.10 +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // 
Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 1 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #640] \n\t" // prefetch b row no.11 +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.8 +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" prfm PLDL1KEEP, [x0, #512] \n\t" // prefetch a col. no.9 +" \n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate 
c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" +" \n\t" //End it 2 +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #576] \n\t" // prefetch a col. no.10 +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #640] \n\t" // prefetch a col. no.11 +" \n\t" +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" \n\t" +" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" \n\t" // in range -128 to 112 +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate 
c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 3 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z0, [x0, #6, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z1, [x0, #7, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" //End it 4 +" add x0, x0, #256 
\n\t" +" add x1, x1, #128 \n\t" +" \n\t" +" sub x5,x5,1 \n\t" // i-=1 +" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. +" bne DLOOP \n\t" +" \n\t" +".DLASTITER: \n\t" +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 1 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z6.d, z2.d[1] 
\n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" +" \n\t" //End it 2 +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z7, 
[x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" \n\t" // in range -128 to 112 +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 3 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" \n\t" +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" \n\t" +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" \n\t" +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate 
c(4:7,4)+=a(4:7,l)*b(l,4) +" \n\t" +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" add x1, x1, #64 \n\t" +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" \n\t" +" \n\t" //End it 4 +" add x0, x0, #192 \n\t" +" \n\t" +" .DCONSIDERKLEFT: \n\t" +" cmp x6,0 \n\t" // If k_left == 0, we are done. +" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +" \n\t" +".DLOOPKLEFT: \n\t" +" \n\t" +" ldr z0, [x0] \n\t" // Load a +" ldr z1, [x0, #1, MUL VL] \n\t" +" add x0, x0, #64 \n\t" +" \n\t" +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" add x1, x1, #64 \n\t" +" \n\t" +" sub x6,x6,1 \n\t" +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" \n\t" +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" \n\t" +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" \n\t" +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" \n\t" +" fmla z26.d, z0.d, 
z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" \n\t" +" cmp x6,0 \n\t" // Iterate again. +" bne .DLOOPKLEFT \n\t" // if i!=0. +" \n\t" +" .DPOSTACCUM: \n\t" +" \n\t" +" ld1rd {z6.d}, p0/z, [x7] \n\t" // Load alpha. +" ld1rd {z7.d}, p0/z, [x8] \n\t" // Load beta +" \n\t" +" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" bne .DGENSTORED \n\t" +" \n\t" +" .DCOLSTORED: \n\t" // C is column-major. +" \n\t" +" dup z0.d, #0 \n\t" +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr z0, [x2] \n\t" //Load column 0 of C +" ldr z1, [x2, #1, MUL VL] \n\t" +" \n\t" +" ldr z2, [x20] \n\t" //Load column 1 of C +" ldr z3, [x20, #1, MUL VL] \n\t" +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS1: \n\t" +" \n\t" +" fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z0, [x2] \n\t" //Store column 0 of C +" str z1, [x2, #1, MUL VL] \n\t" +" \n\t" +" str z2, [x20] \n\t" //Store column 1 of C +" str z3, [x20, #1, MUL VL] \n\t" +" \n\t" +" dup z8.d, #0 \n\t" +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr z8, [x21] \n\t" //Load column 2 of C +" ldr z9, [x21, #1, MUL VL] \n\t" +" \n\t" +" ldr z10, [x22] \n\t" //Load column 3 of C +" ldr z11, [x22, #1, MUL VL] \n\t" +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS2: \n\t" +" \n\t" +" fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z8, [x21] \n\t" //Store column 2 of C +" str z9, [x21, #1, MUL VL] \n\t" +" \n\t" +" str z10, [x22] \n\t" //Store column 3 of C +" str z11, [x22, #1, MUL VL] \n\t" +" \n\t" +" dup z0.d, #0 \n\t" +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr z0, [x23] \n\t" //Load column 4 of C +" ldr z1, [x23, #1, MUL VL] \n\t" +" \n\t" +" ldr z2, [x24] \n\t" //Load column 5 of C +" ldr z3, [x24, #1, MUL VL] \n\t" +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS3: \n\t" +" \n\t" +" fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z0, [x23] \n\t" //Store column 4 of C +" str z1, [x23, #1, MUL VL] \n\t" +" \n\t" +" str z2, [x24] \n\t" //Store column 5 of C +" str z3, [x24, #1, MUL VL] \n\t" +" \n\t" +" dup z8.d, #0 \n\t" +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr z8, [x25] \n\t" //Load column 6 of C +" ldr z9, [x25, #1, MUL VL] \n\t" +" \n\t" +" ldr z10, [x26] \n\t" //Load column 7 of C +" ldr z11, [x26, #1, MUL VL] \n\t" +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z8, [x25] \n\t" //Store column 6 of C +" str z9, [x25, #1, MUL VL] \n\t" +" \n\t" +" str z10, [x26] \n\t" //Store column 7 of C +" str z11, [x26, #1, MUL VL] \n\t" +" \n\t" +" b .DEND \n\t" +" \n\t" +" .DGENSTORED: \n\t" // C is general-stride stored. 
+" \n\t" +" \n\t" // x14 is row-stride in number of bytes. +" lsl x15,x14,#2 \n\t" // x15 is 4-row-stride, which is the address offset +" \n\t" // btw c(4,*) and c(0,*) +" index z4.d, xzr, x14 \n\t" // z4 is address offsets of four contiguous elements +" \n\t" // in a column. such as c( 0:3,* ). +" \n\t" // z4 is used as vector index for gather/scatter +" \n\t" // loading/storing from column of *c +" \n\t" +" \n\t" // C's each column's address: +" \n\t" // x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) +" \n\t" // x5, x6, x7, x8, x16, x17, x18, x19: are addresses of c(4,0:7) +" add x5, x15, x2 \n\t" // x5 is address of c(4,0) +" add x6, x15, x20 \n\t" // x6 is address of c(4,1) +" add x7, x15, x21 \n\t" // x7 is address of c(4,2) +" add x8, x15, x22 \n\t" // x8 is address of c(4,3) +" add x16, x15, x23 \n\t" // x16 is address of c(4,4) +" add x17, x15, x24 \n\t" // x17 is address of c(4,5) +" add x18, x15, x25 \n\t" // x18 is address of c(4,6) +" add x19, x15, x26 \n\t" // x19 is address of c(4,7) +" \n\t" +" dup z0.d, #0 \n\t" // C column 0, 1 +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x2 is address of c(0,0) +" \n\t" // x5 is address of c(4,0) +" \n\t" // x20 is address of c(0,1) +" \n\t" // x6 is address of c(4,1) +" ld1d {z0.d}, p0/z, [x2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 +" ld1d {z1.d}, p0/z, [x5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 +" ld1d {z2.d}, p0/z, [x20, z4.d] \n\t" // Load c( 0:3,1 ) into z2 +" ld1d {z3.d}, p0/z, [x6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS1: \n\t" +" \n\t" +" fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z0.d}, p0, [x2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 +" st1d {z1.d}, p0, [x5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 +" st1d {z2.d}, p0, [x20, z4.d] \n\t" // Store c( 0:3,1 ) <- z2 +" st1d {z3.d}, p0, [x6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 +" \n\t" +" \n\t" +" \n\t" +" dup z8.d, #0 \n\t" // C column 2, 3 +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x21 is address of c(0,2) +" \n\t" // x7 is address of c(4,2) +" \n\t" // x22 is address of c(0,3) +" \n\t" // x8 is address of c(4,3) +" ld1d {z8.d}, p0/z, [x21, z4.d] \n\t" // Load c( 0:3,2 ) into z8 +" ld1d {z9.d}, p0/z, [x7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 +" ld1d {z10.d}, p0/z, [x22, z4.d] \n\t" // Load c( 0:3,3 ) into z10 +" ld1d {z11.d}, p0/z, [x8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS2: \n\t" +" \n\t" +" fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z8.d}, p0, [x21, z4.d] \n\t" // Store c( 0:3,2 ) <- z8 +" st1d {z9.d}, p0, [x7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 +" st1d {z10.d}, p0, [x22, z4.d] \n\t" // Store c( 0:3,3 ) <- z10 +" st1d {z11.d}, p0, [x8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 +" \n\t" +" dup z0.d, #0 \n\t" // C column 4, 5 +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x23 is address of c(0,4) +" \n\t" // x16 is address of c(4,4) +" \n\t" // x24 is address of c(0,5) +" \n\t" // x17 is address of c(4,5) +" ld1d {z0.d}, p0/z, [x23, z4.d] \n\t" // Load c( 0:3,4 ) into z0 +" ld1d {z1.d}, p0/z, [x16, z4.d] \n\t" // Load c( 4:7,4 ) into z1 +" ld1d {z2.d}, p0/z, [x24, z4.d] \n\t" // Load c( 0:3,5 ) into z2 +" ld1d {z3.d}, p0/z, [x17, z4.d] \n\t" // Load c( 4:7,5 ) into z3 +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS3: \n\t" +" \n\t" +" fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z0.d}, p0, [x23, z4.d] \n\t" // Store c( 0:3,4 ) <- z0 +" st1d {z1.d}, p0, [x16, z4.d] \n\t" // Store c( 4:7,4 ) <- z1 +" st1d {z2.d}, p0, [x24, z4.d] \n\t" // Store c( 0:3,5 ) <- z2 +" st1d {z3.d}, p0, [x17, z4.d] \n\t" // Store c( 4:7,5 ) <- z3 +" \n\t" +" dup z8.d, #0 \n\t" // C column 6, 7 +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x25 is address of c(0,6) +" \n\t" // x18 is address of c(4,6) +" \n\t" // x26 is address of c(0,7) +" \n\t" // x19 is address of c(4,7) +" ld1d {z8.d}, p0/z, [x25, z4.d] \n\t" // Load c( 0:3,6 ) into z8 +" ld1d {z9.d}, p0/z, [x18, z4.d] \n\t" // Load c( 4:7,6 ) into z9 +" ld1d {z10.d}, p0/z, [x26, z4.d] \n\t" // Load c( 0:3,7 ) into z10 +" ld1d {z11.d}, p0/z, [x19, z4.d] \n\t" // Load c( 4:7,7 ) into z11 +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS4: \n\t" +" \n\t" +" fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z8.d}, p0, [x25, z4.d] \n\t" // Store c( 0:3,6 ) <- z8 +" st1d {z9.d}, p0, [x18, z4.d] \n\t" // Store c( 4:7,6 ) <- z9 +" st1d {z10.d}, p0, [x26, z4.d] \n\t" // Store c( 0:3,7 ) <- z10 +" st1d {z11.d}, p0, [x19, z4.d] \n\t" // Store c( 4:7,7 ) <- z11 +" \n\t" +" .DEND: \n\t" // Done! 
+" \n\t" +:// output operands (none) +:// input operands + [aaddr] "m" (a), // 0 + [baddr] "m" (b), // 1 + [caddr] "m" (c), // 2 + [k_iter] "m" (k_iter), // 3 + [k_left] "m" (k_left), // 4 + [alpha] "m" (alpha), // 5 + [beta] "m" (beta), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [a_next] "m" (a_next), // 9 + [b_next] "m" (b_next) // 10 +:// Register clobber list + "x0","x1","x2","x3", + "x4","x5","x6", + "x7","x8","x9", + "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19", + "x20","x21","x22","x23","x24","x25","x26", + "x27", + "v0","v1","v2", + "v3","v4","v5", + "v6","v7","v8", + "v9","v10","v11", + "v12","v13","v14", + "v15","v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" +); + +} diff --git a/attic/windows/build/bli_kernel.h b/kernels/armsve/bli_kernels_armsve.h similarity index 93% rename from attic/windows/build/bli_kernel.h rename to kernels/armsve/bli_kernels_armsve.h index daca58e45..a5934312a 100644 --- a/attic/windows/build/bli_kernel.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -32,12 +32,6 @@ */ -#ifndef BLIS_KERNEL_H -#define BLIS_KERNEL_H - - - - - -#endif +GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) +PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk ) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 8f5ec76f6..c01c67f5a 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -108,13 +108,6 @@ __asm__ volatile " add x25,x24,x10 \n\t" //Load address Column 10 of C " add x26,x25,x10 \n\t" //Load address Column 11 of C " \n\t" -" ldr q0, [x0] \n\t" -" ldr q1, [x0, #16] \n\t" // Load a -" \n\t" -" ldr q2, [x1] \n\t" // Load b -" ldr q3, [x1, #16] \n\t" -" ldr q4, [x1, #32] \n\t" -" \n\t" " prfm pldl1keep,[x2] \n\t" // Prefetch c. " prfm pldl1keep,[x16] \n\t" // Prefetch c. " prfm pldl1keep,[x17] \n\t" // Prefetch c.
@@ -164,8 +157,15 @@ __asm__ volatile " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .SCONSIDERKLEFT \n\t" " \n\t" -"add x0, x0, #32 \n\t" //update address of A -"add x1, x1, #48 \n\t" //update address of B +" ldr q0, [x0] \n\t" +" ldr q1, [x0, #16] \n\t" // Load a +" \n\t" +" ldr q2, [x1] \n\t" // Load b +" ldr q3, [x1, #16] \n\t" +" ldr q4, [x1, #32] \n\t" +" \n\t" +" add x0, x0, #32 \n\t" //update address of A +" add x1, x1, #48 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .SLASTITER \n\t" // (as loop is do-while-like). @@ -1166,15 +1166,6 @@ __asm__ volatile " prfm pldl1keep,[x25] \n\t" // Prefetch c. " prfm pldl1keep,[x26] \n\t" // Prefetch c. " \n\t" -" ldr q0, [x0] \n\t" -" ldr q1, [x0, #16] \n\t" // Load a -" ldr q2, [x0, #32] \n\t" -" \n\t" -" ldr q3, [x1] \n\t" // Load b -" ldr q4, [x1, #16] \n\t" -" ldr q5, [x1, #32] \n\t" -" ldr q6, [x1, #48] \n\t" -" \n\t" " dup v8.2d, xzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #256] \n\t" " dup v9.2d, xzr \n\t" // Vector for accummulating column 0 @@ -1214,8 +1205,17 @@ __asm__ volatile " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .DCONSIDERKLEFT \n\t" " \n\t" -"add x0, x0, #48 \n\t" //update address of A -"add x1, x1, #64 \n\t" //update address of B +" ldr q0, [x0] \n\t" // Load a +" ldr q1, [x0, #16] \n\t" +" ldr q2, [x0, #32] \n\t" +" \n\t" +" ldr q3, [x1] \n\t" // Load b +" ldr q4, [x1, #16] \n\t" +" ldr q5, [x1, #32] \n\t" +" ldr q6, [x1, #48] \n\t" +" \n\t" +" add x0, x0, #48 \n\t" //update address of A +" add x1, x1, #64 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .DLASTITER \n\t" // (as loop is do-while-like). 
diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c index 496649b50..4ece5af29 100644 --- a/kernels/zen/1/bli_amaxv_zen_int.c +++ b/kernels/zen/1/bli_amaxv_zen_int.c @@ -65,6 +65,38 @@ typedef union double d[2]; }v2dd_t; +// return a mask which indicates either: +// - v1 > v2 +// - v1 is NaN and v2 is not +// assumes that idx(v1) > idx(v2) +// all "OQ" comparisons false if either operand NaN +#define CMP256( dt, v1, v2 ) \ + _mm256_or_p##dt( _mm256_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* v1 > v2 || */ \ + _mm256_andnot_p##dt( _mm256_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \ + _mm256_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) */ \ + ) \ + ); + +// return a mask which indicates either: +// - v1 > v2 +// - v1 is NaN and v2 is not +// - v1 == v2 (maybe == NaN) and i1 < i2 +// all "OQ" comparisons false if either operand NaN +#define CMP128( dt, v1, v2, i1, i2 ) \ + _mm_or_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* ( v1 > v2 || */ \ + _mm_andnot_p##dt( _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \ + _mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) ) || */ \ + ) \ + ), \ + _mm_and_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_EQ_OQ ), /* ( ( v1 == v2 || */ \ + _mm_and_p##dt( _mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ), /* ( isnan(v1) && */ \ + _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ) /* isnan(v2) ) ) && */ \ + ) \ + ), \ + _mm_cmp_p##dt( i1, i2, _CMP_LT_OQ ) /* i1 < i2 ) */ \ + ) \ + ); + // ----------------------------------------------------------------------------- void bli_samaxv_zen_int @@ -122,8 +154,8 @@ void bli_samaxv_zen_int the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This - behavior mimics that of LAPACK's ?lange(). */ - if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) ) + behavior mimics that of LAPACK's i?amax(). 
*/ + if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; @@ -157,7 +189,7 @@ void bli_samaxv_zen_int // Get the absolute value of the vector element. x_vec.v = _mm256_andnot_ps( sign_mask.v, x_vec.v ); - mask_vec.v = _mm256_cmp_ps( x_vec.v, max_vec.v, _CMP_GT_OS ); + mask_vec.v = CMP256( s, x_vec.v, max_vec.v ); max_vec.v = _mm256_blendv_ps( max_vec.v, x_vec.v, mask_vec.v ); maxInx_vec.v = _mm256_blendv_ps( maxInx_vec.v, idx_vec.v, mask_vec.v ); @@ -166,33 +198,34 @@ void bli_samaxv_zen_int x += num_vec_elements; } - max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 ); - max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 ); - mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS ); - - max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); - + max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 ); + max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 ); - maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); - - max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 ); - maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 ); - mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS ); + + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); - if ( max_vec_lo.f[0] > max_vec_lo.f[1] ) - { - abs_chi1_max = max_vec_lo.f[0]; - i_max_l = maxInx_vec_lo.f[0]; - } - else - { - abs_chi1_max = max_vec_lo.f[1]; - i_max_l = maxInx_vec_lo.f[1]; - } + max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 ); + maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 ); + + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, 
maxInx_vec_lo.v ); + + max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); + maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); + + max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 1 ); + maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 ); + + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); + + max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); + maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); + + abs_chi1_max = max_vec_lo.f[0]; + i_max_l = maxInx_vec_lo.f[0]; for ( i = n - n_left; i < n; i++ ) { @@ -208,8 +241,8 @@ void bli_samaxv_zen_int the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This - behavior mimics that of LAPACK's ?lange(). */ - if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) ) + behavior mimics that of LAPACK's i?amax(). */ + if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; @@ -286,8 +319,8 @@ void bli_damaxv_zen_int the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This - behavior mimics that of LAPACK's ?lange(). */ - if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) ) + behavior mimics that of LAPACK's i?amax(). */ + if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; @@ -321,7 +354,7 @@ void bli_damaxv_zen_int // Get the absolute value of the vector element. 
x_vec.v = _mm256_andnot_pd( sign_mask.v, x_vec.v ); - mask_vec.v = _mm256_cmp_pd( x_vec.v, max_vec.v, _CMP_GT_OS ); + mask_vec.v = CMP256( d, x_vec.v, max_vec.v ); max_vec.v = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v ); maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v ); @@ -330,26 +363,26 @@ void bli_damaxv_zen_int x += num_vec_elements; } - max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 ); - max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 ); - mask_vec_lo.v = _mm_cmp_pd( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS ); - - max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); - + max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 ); + max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 ); + + mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); + + max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); + maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); + + max_vec_hi.v = _mm_permute_pd( max_vec_lo.v, 1 ); + maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 ); + + mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); + + max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); - if ( max_vec_lo.d[0] > max_vec_lo.d[1] ) - { - abs_chi1_max = max_vec_lo.d[0]; - i_max_l = maxInx_vec_lo.d[0]; - } - else - { - abs_chi1_max = max_vec_lo.d[1]; - i_max_l = maxInx_vec_lo.d[1]; - } + abs_chi1_max = max_vec_lo.d[0]; + i_max_l = maxInx_vec_lo.d[0]; for ( i = n - n_left; i < n; i++ ) { @@ -363,10 +396,9 @@ void bli_damaxv_zen_int /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. 
If NaN is - encountered, then treat it the same as if it were a valid - value that was smaller than any previously seen. This - behavior mimics that of LAPACK's ?lange(). */ - if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) ) + encountered, return the index of the first NaN. This + behavior mimics that of LAPACK's i?amax(). */ + if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c index ca584213e..169180f3b 100644 --- a/ref_kernels/1/bli_amaxv_ref.c +++ b/ref_kernels/1/bli_amaxv_ref.c @@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's ?lange(). */ \ - if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \ { \ abs_chi1_max = abs_chi1; \ i_max_l = i; \ @@ -129,7 +129,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's ?lange(). */ \ - if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ + if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \ { \ abs_chi1_max = abs_chi1; \ i_max_l = i; \ diff --git a/sandbox/ref99/packm/blx_l3_packm.c b/sandbox/ref99/packm/blx_l3_packm.c index 16df18c3c..4ec1ac108 100644 --- a/sandbox/ref99/packm/blx_l3_packm.c +++ b/sandbox/ref99/packm/blx_l3_packm.c @@ -51,7 +51,7 @@ void blx_l3_packm siz_t size_needed; // FGVZ: Not sure why we need this barrier, but we do. 
- bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); // Every thread initializes x_pack and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t @@ -102,7 +102,7 @@ void blx_l3_packm // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); // Save the contents of the chief thread's local mem_t entry to the // mem_t field in this thread's control tree node. @@ -142,7 +142,7 @@ void blx_l3_packm // Broadcast the address of the chief thread's local mem_t entry to // all threads. - local_mem_p = bli_thread_obroadcast( thread, &local_mem_s ); + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); // Save the chief thread's local mem_t entry to the mem_t field in // this thread's control tree node. @@ -155,7 +155,7 @@ void blx_l3_packm // will already have the cached values in their local control // trees' mem_t entries, currently pointed to by cntl_mem_p. - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); } } @@ -178,6 +178,6 @@ void blx_l3_packm ); // Barrier so that packing is done before computation. - bli_thread_obarrier( thread ); + bli_thread_barrier( thread ); } diff --git a/sandbox/ref99/vars/blx_gemm_blk_var3.c b/sandbox/ref99/vars/blx_gemm_blk_var3.c index 7eace4af8..6e8786268 100644 --- a/sandbox/ref99/vars/blx_gemm_blk_var3.c +++ b/sandbox/ref99/vars/blx_gemm_blk_var3.c @@ -73,7 +73,7 @@ void blx_gemm_blk_var3 bli_thrinfo_sub_node( thread ) ); - bli_thread_obarrier( bli_thrinfo_sub_node( thread ) ); + bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. 
Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it diff --git a/test/1m4m/runme.sh b/test/1m4m/runme.sh index d79d53925..881cf4776 100755 --- a/test/1m4m/runme.sh +++ b/test/1m4m/runme.sh @@ -7,18 +7,22 @@ delay=0.1 #sys="blis" #sys="stampede2" -sys="lonestar5" +#sys="lonestar5" #sys="ul252" -#sys="ul264" +sys="ul264" # Bind threads to processors. #export OMP_PROC_BIND=true #export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103" +# Most systems don't run the executables through anything else, but ul264 +# uses numactl. +runcmd="" + if [ ${sys} = "blis" ]; then - export GOMP_CPU_AFFINITY="0 1 2 3" + export GOMP_CPU_AFFINITY="0-3" threads="jc1ic1jr1_2400 jc2ic3jr2_6000 @@ -35,7 +39,7 @@ elif [ ${sys} = "stampede2" ]; then elif [ ${sys} = "lonestar5" ]; then - export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" + export GOMP_CPU_AFFINITY="0-23" # A hack to use libiomp5 with gcc. 
#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" @@ -45,12 +49,11 @@ elif [ ${sys} = "lonestar5" ]; then # jc4ic3jr2_9600" threads="jc1ic1jr1_2400 jc4ic3jr2_7200" - threads="jc4ic3jr2_7200" elif [ ${sys} = "ul252" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" - export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51" + export GOMP_CPU_AFFINITY="0-51" threads="jc1ic1jr1_2400 jc2ic13jr1_6000 @@ -59,12 +62,14 @@ elif [ ${sys} = "ul252" ]; then elif [ ${sys} = "ul264" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" - export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63" + export GOMP_CPU_AFFINITY="0-63" + #threads="jc1ic1jr1_2400" threads="jc1ic1jr1_2400 - jc1ic8jr4_6000 - jc2ic8jr4_8000" + jc1ic8jr4_4800 + jc2ic8jr4_7200" + #runcmd="numactl -i all" fi # Datatypes to test. @@ -75,34 +80,11 @@ test_dts="s d c z" test_ops="gemm" # Implementations to test. 
-#impls="blis" -#impls="other" -#impls="eigen" -impls="all" - -if [ "${impls}" = "blis" ]; then - - test_impls="asm_blis" - -elif [ "${impls}" = "eigen" ]; then - - test_impls="eigen" - -elif [ "${impls}" = "other" ]; then - - test_impls="openblas vendor" - -elif [ "${impls}" = "eigen" ]; then - - test_impls="eigen" - -else - - test_impls="openblas vendor asm_blis 4m1a_blis 1m_blis" - #test_impls="openblas" - #test_impls="asm_blis 4m1a_blis 1m_blis" - #test_impls="asm_blis 1m_blis" -fi +#test_impls="openblas vendor asm_blis 1m_blis 4m1a_blis" +#test_impls="asm_blis 1m_blis 4m1a_blis" +#test_impls="asm_blis" +#test_impls="4m1a_blis" +test_impls="asm_blis 4m1a_blis 1m_blis" # Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can # restore the value. @@ -181,7 +163,9 @@ for th in ${threads}; do # Set the threading parameters based on the implementation # that we are preparing to run. - if [ "${im}" = "asm_blis" ]; then + if [ "${im}" = "asm_blis" ] || \ + [ "${im}" = "1m_blis" ] || \ + [ "${im}" = "4m1a_blis" ]; then unset OMP_NUM_THREADS export BLIS_JC_NT=${jc_nt} export BLIS_PC_NT=${pc_nt} @@ -228,10 +212,12 @@ for th in ${threads}; do out_file="${out_root}_${suf}_${dt}${op}_${im}.m" #echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}" - echo "Running ./${exec_name} > ${out_file}" + echo "Running: ${runcmd} ./${exec_name} > ${out_file}" # Run executable. 
- ./${exec_name} > ${out_file} + #./${exec_name} > ${out_file} + #numactl -i all ./${exec_name} > ${out_file} + eval "${runcmd} ./${exec_name} > ${out_file}" sleep ${delay} diff --git a/test/sup/Makefile b/test/sup/Makefile index 2cd062747..5004aff77 100644 --- a/test/sup/Makefile +++ b/test/sup/Makefile @@ -45,12 +45,13 @@ # --- Makefile PHONY target definitions ---------------------------------------- # -.PHONY: all all-st all-mt \ - blis blis-st blis-mt \ +.PHONY: all \ + st mt \ + blissup-st blislpab-st eigen-st openblas-st vendor-st blasfeo-st libxsmm-st \ + blissup-mt blislpab-mt eigen-mt openblas-mt vendor-mt \ clean cleanx - # # --- Determine makefile fragment location ------------------------------------- # @@ -71,7 +72,6 @@ SHARE_PATH := ../.. endif - # # --- Include common makefile definitions -------------------------------------- # @@ -80,7 +80,6 @@ endif -include $(SHARE_PATH)/common.mk - # # --- BLAS and LAPACK implementations ------------------------------------------ # @@ -141,15 +140,40 @@ VENDORP_LIB := $(MKLP_LIB) # --- Problem size definitions ------------------------------------------------- # -# Single core -PS_BEGIN := 4 -PS_MAX := 800 -PS_INC := 4 +# The problem size range specification is done separately for single-threaded +# and multithreaded execution. 
Within each threadedness scenario, we allow for +# separate range specifications for cases with: +# - 3L: three large/variable dimensions and no small/constant dimensions +# - 2L: two large/variable dimensions and one small/constant dimension +# - 1L: one large/variable dimension and two small/constant dimensions -# Multicore -P1_BEGIN := 120 -P1_MAX := 6000 -P1_INC := 120 +# -- Single-threaded -- + +PS_BEGIN_3L := 2 +PS_MAX_3L := 400 +PS_INC_3L := 2 + +PS_BEGIN_2L := 4 +PS_MAX_2L := 800 +PS_INC_2L := 4 + +PS_BEGIN_1L := 32 +PS_MAX_1L := 6400 +PS_INC_1L := 32 + +# -- Multithreaded -- + +P1_BEGIN_3L := 4 +P1_MAX_3L := 800 +P1_INC_3L := 4 + +P1_BEGIN_2L := 8 +P1_MAX_2L := 1600 +P1_INC_2L := 8 + +P1_BEGIN_1L := 64 +P1_MAX_1L := 12800 +P1_INC_1L := 64 # @@ -186,21 +210,21 @@ CXXFLAGS += -I$(EIGEN_INC) CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS)) CXXFLAGS_MT := -march=native $(CXXFLAGS) -# Single or multithreaded string +# Single or multithreaded string. STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Number of trials per problem size. N_TRIALS := -DN_TRIALS=3 -# Problem size specification -PDEF_ST := -DP_BEGIN=$(PS_BEGIN) \ - -DP_MAX=$(PS_MAX) \ - -DP_INC=$(PS_INC) +# Problem size specification. 
+PDEF_ST_1L := -DP_BEGIN=$(PS_BEGIN_1L) -DP_MAX=$(PS_MAX_1L) -DP_INC=$(PS_INC_1L) +PDEF_ST_2L := -DP_BEGIN=$(PS_BEGIN_2L) -DP_MAX=$(PS_MAX_2L) -DP_INC=$(PS_INC_2L) +PDEF_ST_3L := -DP_BEGIN=$(PS_BEGIN_3L) -DP_MAX=$(PS_MAX_3L) -DP_INC=$(PS_INC_3L) -PDEF_MT := -DP_BEGIN=$(P1_BEGIN) \ - -DP_MAX=$(P1_MAX) \ - -DP_INC=$(P1_INC) +PDEF_MT_1L := -DP_BEGIN=$(P1_BEGIN_1L) -DP_MAX=$(P1_MAX_1L) -DP_INC=$(P1_INC_1L) +PDEF_MT_2L := -DP_BEGIN=$(P1_BEGIN_2L) -DP_MAX=$(P1_MAX_2L) -DP_INC=$(P1_INC_2L) +PDEF_MT_3L := -DP_BEGIN=$(P1_BEGIN_3L) -DP_MAX=$(P1_MAX_3L) -DP_INC=$(P1_INC_3L) ifeq ($(E),1) ERRCHK := -DERROR_CHECK @@ -241,9 +265,15 @@ SHAPES := l_l_s \ l_s_s \ l_l_l -SMS := 6 -SNS := 8 -SKS := 4 +# Define the small/constant m, n, and k dimensions for single core and multicore +# experiments. +SMS_ST := 6 +SNS_ST := 8 +SKS_ST := 4 + +SMS_MT := 6 +SNS_MT := 8 +SKS_MT := 10 # @@ -262,6 +292,15 @@ get-1of3 = $(word 1,$(subst _, ,$(1))) get-2of3 = $(word 2,$(subst _, ,$(1))) get-3of3 = $(word 3,$(subst _, ,$(1))) +# A function to return the PDEF_MT_* definitions matching the given shape string. +get-pdefs = $(strip $(subst l_l_l,$(PDEF_MT_3L), \ + $(subst l_l_s,$(PDEF_MT_2L), \ + $(subst l_s_l,$(PDEF_MT_2L), \ + $(subst s_l_l,$(PDEF_MT_2L), \ + $(subst s_s_l,$(PDEF_MT_1L), \ + $(subst s_l_s,$(PDEF_MT_1L), \ + $(subst l_s_s,$(PDEF_MT_1L),$(1))))))))) + # Datatype defs. get-dt-cpp = $(strip \ $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\ @@ -338,33 +377,33 @@ XSTORS0 = ccc # --- Object and binary file definitons ---------------------------------------- # +# -- Single-threaded -- + get-st-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_st.o))))))) # Build a list of object files and binaries for each single-threaded # implementation using the get-st-objs() function defined above. 
-BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup) +BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),blissup) BLISSUP_ST_BINS := $(patsubst %.o,%.x,$(BLISSUP_ST_OBJS)) -BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab) +BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),blislpab) BLISLPAB_ST_BINS := $(patsubst %.o,%.x,$(BLISLPAB_ST_OBJS)) -EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen) +EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),eigen) EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS)) -OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas) +OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),openblas) OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS)) -BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo) +BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),blasfeo) BLASFEO_ST_BINS := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS)) -LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm) +LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),libxsmm) LIBXSMM_ST_BINS := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS)) -VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor) +VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),vendor) VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) -#$(error "objs = $(EIGEN_ST_BINS)" ) - # 
Mark the object files as intermediate so that make will remove them # automatically after building the binaries on which they depend. .INTERMEDIATE: $(BLISSUP_ST_OBJS) \ @@ -375,24 +414,57 @@ VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) $(LIBXSMM_ST_OBJS) \ $(VENDOR_ST_OBJS) +# -- Multithreaded -- + +get-mt-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_mt.o))))))) + +# Build a list of object files and binaries for each multithreaded +# implementation using the get-mt-objs() function defined above. +BLISSUP_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),blissup) +BLISSUP_MT_BINS := $(patsubst %.o,%.x,$(BLISSUP_MT_OBJS)) + +BLISLPAB_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),blislpab) +BLISLPAB_MT_BINS := $(patsubst %.o,%.x,$(BLISLPAB_MT_OBJS)) + +EIGEN_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),eigen) +EIGEN_MT_BINS := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS)) + +OPENBLAS_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),openblas) +OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS)) + +VENDOR_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),vendor) +VENDOR_MT_BINS := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS)) + +#$(error "objs = $(EIGEN_ST_BINS)" ) + +# Mark the object files as intermediate so that make will remove them +# automatically after building the binaries on which they depend. 
+.INTERMEDIATE: $(BLISSUP_MT_OBJS) \ + $(BLISLPAB_MT_OBJS) \ + $(EIGEN_MT_OBJS) \ + $(OPENBLAS_MT_OBJS) \ + $(VENDOR_MT_OBJS) + # -# --- Targets/rules ------------------------------------------------------------ +# --- High-level targets/rules ------------------------------------------------- # all: st -blissup: blissup-st -blislpab: blislpab-st -eigen: eigen-st -openblas: openblas-st -blasfeo: blasfeo-st -libxsmm: libxsmm-st -vendor: vendor-st +#blis: blissup-st blislpab-st +#blissup: blissup-st +#blislpab: blislpab-st +#eigen: eigen-st +#openblas: openblas-st +#blasfeo: blasfeo-st +#libxsmm: libxsmm-st +#vendor: vendor-st + +# -- Single-threaded -- st: blissup-st blislpab-st \ eigen-st openblas-st blasfeo-st libxsmm-st vendor-st -blis: blissup-st blislpab-st blissup-st: $(BLISSUP_ST_BINS) blislpab-st: $(BLISLPAB_ST_BINS) @@ -402,21 +474,35 @@ blasfeo-st: $(BLASFEO_ST_BINS) libxsmm-st: $(LIBXSMM_ST_BINS) vendor-st: $(VENDOR_ST_BINS) +# -- Multithreaded -- -# --Object file rules -- +mt: blissup-mt blislpab-mt \ + eigen-mt openblas-mt vendor-mt + +blissup-mt: $(BLISSUP_MT_BINS) +blislpab-mt: $(BLISLPAB_MT_BINS) +eigen-mt: $(EIGEN_MT_BINS) +openblas-mt: $(OPENBLAS_MT_BINS) +vendor-mt: $(VENDOR_MT_BINS) + + +# --- Object file rules -------------------------------------------------------- # Define the implementations for which we will instantiate compilation rules. -BIMPLS := blissup blislpab openblas blasfeo libxsmm vendor -EIMPLS := eigen +BIMPLS_ST := blissup blislpab openblas blasfeo libxsmm vendor +BIMPLS_MT := blissup blislpab openblas vendor +EIMPLS := eigen + +# -- Single-threaded BLAS -- # 1 2 3 4 567 8 # test_dgemm_nn_rrr_mpn6kp_blissup_st.x # Define the function that will be used to instantiate compilation rules -# for the various implementations. +# for the various single-threaded implementations. 
define make-st-rule test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile - $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ + $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ endef # Instantiate the rule function make-st-rule() for each BLIS/BLAS/CBLAS @@ -425,17 +511,40 @@ $(foreach dt,$(DTS), \ $(foreach tr,$(TRANS), \ $(foreach st,$(STORS), \ $(foreach sh,$(SHAPES), \ -$(foreach sm,$(SMS), \ -$(foreach sn,$(SNS), \ -$(foreach sk,$(SKS), \ -$(foreach impl,$(BIMPLS), \ +$(foreach sm,$(SMS_ST), \ +$(foreach sn,$(SNS_ST), \ +$(foreach sk,$(SKS_ST), \ +$(foreach impl,$(BIMPLS_ST), \ $(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) +# -- Multithreaded BLAS -- + # Define the function that will be used to instantiate compilation rules -# for the various implementations. +# for the various multithreaded implementations. +define make-mt-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile + $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ +endef + +# Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS +# implementation. 
+$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS_MT), \ +$(foreach sn,$(SNS_MT), \ +$(foreach sk,$(SKS_MT), \ +$(foreach impl,$(BIMPLS_MT), \ +$(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + +# -- Single-threaded Eigen -- + +# Define the function that will be used to instantiate compilation rules +# for the single-threaded Eigen implementation. define make-eigst-rule test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile - $(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ + $(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ endef # Instantiate the rule function make-st-rule() for each Eigen implementation. @@ -443,20 +552,42 @@ $(foreach dt,$(DTS), \ $(foreach tr,$(TRANS), \ $(foreach st,$(STORS), \ $(foreach sh,$(SHAPES), \ -$(foreach sm,$(SMS), \ -$(foreach sn,$(SNS), \ -$(foreach sk,$(SKS), \ +$(foreach sm,$(SMS_ST), \ +$(foreach sn,$(SNS_ST), \ +$(foreach sk,$(SKS_ST), \ $(foreach impl,$(EIMPLS), \ $(eval $(call make-eigst-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) +# -- Multithreaded Eigen -- -# -- Executable file rules -- +# Define the function that will be used to instantiate compilation rules +# for the multithreaded Eigen implementation. 
+define make-eigmt-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile + $(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ +endef + +# Instantiate the rule function make-eigmt-rule() for each Eigen implementation. +$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS_MT), \ +$(foreach sn,$(SNS_MT), \ +$(foreach sk,$(SKS_MT), \ +$(foreach impl,$(EIMPLS), \ +$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + + +# --- Executable file rules ---------------------------------------------------- # NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS # on the link command line in case BLIS was configured with the BLAS # compatibility layer. This prevents BLIS from inadvertently getting called # for the BLAS routines we are trying to test with. 
+# -- Single-threaded -- + test_%_blissup_st.x: test_%_blissup_st.o $(LIBBLIS_LINK) $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) @@ -478,6 +609,23 @@ test_%_libxsmm_st.x: test_%_libxsmm_st.o $(LIBBLIS_LINK) test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK) $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +# -- Multithreaded -- + +test_%_blissup_mt.x: test_%_blissup_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_blislpab_mt.x: test_%_blislpab_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_eigen_mt.x: test_%_eigen_mt.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_vendor_mt.x: test_%_vendor_mt.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + # -- Clean rules -- diff --git a/test/sup/octave/runthese.m b/test/sup/octave/runthese.m deleted file mode 100644 index a9e053c3e..000000000 --- a/test/sup/octave/runthese.m +++ /dev/null @@ -1,12 +0,0 @@ - -% haswell -plot_panel_trxsh(3.25,16,1,'st','d','ccc',[ 6 8 4 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.25,16,1,'st','d','rrr',[ 6 8 4 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; - -% kabylake -plot_panel_trxsh(3.80,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20190823/4_800_4_mt201','kbl','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.80,16,1,'st','d','ccc',[ 6 8 4 ],'../results/kabylake/20190823/4_800_4_mt201','kbl','MKL','matlab'); close; clear all; - -% epyc -plot_panel_trxsh(3.00, 8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.00, 8,1,'st','d','ccc',[ 6 8 4 
],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all; diff --git a/test/supmt/octave/gen_opsupnames.m b/test/sup/octave_mt/gen_opsupnames.m similarity index 74% rename from test/supmt/octave/gen_opsupnames.m rename to test/sup/octave_mt/gen_opsupnames.m index 2debd4142..a87c06cc2 100644 --- a/test/supmt/octave/gen_opsupnames.m +++ b/test/sup/octave_mt/gen_opsupnames.m @@ -12,14 +12,9 @@ for io = 1:nops op = ops( io, : ); - str0 = sprintf( '%s_%s_m%dnpkp', op, stor, smallm ); - str1 = sprintf( '%s_%s_mpn%dkp', op, stor, smalln ); - str2 = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ); - str3 = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk ); - str4 = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk ); - str5 = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln ); - str6 = sprintf( '%s_%s_mpnpkp', op, stor ); - + % NOTE: This way of sprintf'ing doesn't work when the string lengths + % vary, as they would if any of the constant dimensions is greater + % than 9. 
%opsupnames( i+0, : ) = sprintf( '%s_%s_m%dnpkp ', op, stor, smallm ) %opsupnames( i+1, : ) = sprintf( '%s_%s_mpn%dkp ', op, stor, smalln ) %opsupnames( i+2, : ) = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ) @@ -28,13 +23,21 @@ for io = 1:nops %opsupnames( i+5, : ) = sprintf( '%s_%s_m%dn%dkp ', op, stor, smallm, smalln ) %opsupnames( i+6, : ) = sprintf( '%s_%s_mpnpkp ', op, stor ) - opsupnames( i+0, : ) = sprintf( '%-20s', str0 ); - opsupnames( i+1, : ) = sprintf( '%-20s', str1 ); - opsupnames( i+2, : ) = sprintf( '%-20s', str2 ); - opsupnames( i+3, : ) = sprintf( '%-20s', str3 ); - opsupnames( i+4, : ) = sprintf( '%-20s', str4 ); - opsupnames( i+5, : ) = sprintf( '%-20s', str5 ); - opsupnames( i+6, : ) = sprintf( '%-20s', str6 ); + str0 = sprintf( '%s_%s_m%dnpkp', op, stor, smallm ); + str1 = sprintf( '%s_%s_mpn%dkp', op, stor, smalln ); + str2 = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ); + str3 = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk ); + str4 = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk ); + str5 = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln ); + str6 = sprintf( '%s_%s_mpnpkp', op, stor ); + + opsupnames( i+0, : ) = sprintf( '%-22s', str0 ); + opsupnames( i+1, : ) = sprintf( '%-22s', str1 ); + opsupnames( i+2, : ) = sprintf( '%-22s', str2 ); + opsupnames( i+3, : ) = sprintf( '%-22s', str3 ); + opsupnames( i+4, : ) = sprintf( '%-22s', str4 ); + opsupnames( i+5, : ) = sprintf( '%-22s', str5 ); + opsupnames( i+6, : ) = sprintf( '%-22s', str6 ); opnames( i+0, : ) = sprintf( '%s', op ); opnames( i+1, : ) = sprintf( '%s', op ); diff --git a/test/sup/octave_mt/plot_l3sup_perf.m b/test/sup/octave_mt/plot_l3sup_perf.m new file mode 100644 index 000000000..43a05e87b --- /dev/null +++ b/test/sup/octave_mt/plot_l3sup_perf.m @@ -0,0 +1,274 @@ +function r_val = plot_l3sup_perf( opname, ... + data_blissup, ... + data_blislpab, ... + data_eigen, ... + data_open, ... + data_vend, vend_str, ... + nth, ... + rows, cols, ... + cfreq, ... 
+ dfps, ... + theid, impl ) + +%if ... %mod(theid-1,cols) == 2 || ... +% ... %mod(theid-1,cols) == 3 || ... +% ... %mod(theid-1,cols) == 4 || ... +% 0 == 1 ... %theid >= 19 +% show_plot = 0; +%else + show_plot = 1; +%end + +%legend_plot_id = 11; +legend_plot_id = 0*cols + 1*6; + +if 1 + ax1 = subplot( rows, cols, theid ); + hold( ax1, 'on' ); +end + +% Set line properties. +color_blissup = 'k'; lines_blissup = '-'; markr_blissup = ''; +color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = ''; +color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o'; +color_open = 'r'; lines_open = '--'; markr_open = 'o'; +color_vend = 'b'; lines_vend = '-.'; markr_vend = '.'; + +% Compute the peak performance in terms of the number of double flops +% executable per cycle and the clock rate. +if opname(1) == 's' || opname(1) == 'c' + flopspercycle = dfps * 2; +else + flopspercycle = dfps; +end +max_perf_core = (flopspercycle * cfreq) * 1; + +% Escape underscores in the title. +title_opname = strrep( opname, '_', '\_' ); + +% Print the title to a string. +titlename = '%s'; +titlename = sprintf( titlename, title_opname ); + +% Set the legend strings. +blissup_legend = sprintf( 'BLIS sup' ); +blislpab_legend = sprintf( 'BLIS conv' ); +eigen_legend = sprintf( 'Eigen' ); +open_legend = sprintf( 'OpenBLAS' ); +%vend_legend = sprintf( 'MKL' ); +%vend_legend = sprintf( 'ARMPL' ); +vend_legend = vend_str; + +% Set axes range values. +y_scale = 1.00; +x_begin = 0; +%x_end is set below. +y_begin = 0; +y_end = max_perf_core * y_scale; + +% Set axes names. +if nth == 1 + yaxisname = 'GFLOPS'; +else + yaxisname = 'GFLOPS/core'; +end + + +%flopscol = 4; +flopscol = size( data_blissup, 2 ); +msize = 5; +if 1 + fontsize = 12; +else + fontsize = 16; +end +linesize = 0.5; +legend_loc = 'southeast'; + +% -------------------------------------------------------------------- + +% Automatically detect a column with the increasing problem size. +% Then set the maximum x-axis value. 
+for psize_col = 1:3 + if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col ) + break; + end +end +x_axis( :, 1 ) = data_blissup( :, psize_col ); + +% Compute the number of data points we have in the x-axis. Note that we +% only use half the data points for the m = n = k column of graphs. +%if mod(theid-1,cols) == 6 +% np = size( data_blissup, 1 ) / 2; +%else +% np = size( data_blissup, 1 ); +%end +np = size( data_blissup, 1 ); + +% Grab the last x-axis value. +x_end = data_blissup( np, psize_col ); + +%data_peak( 1, 1:2 ) = [ 0 max_perf_core ]; +%data_peak( 2, 1:2 ) = [ x_end max_perf_core ]; + +if show_plot == 1 +blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ... + 'Color',color_blissup, 'LineStyle',lines_blissup, ... + 'LineWidth',linesize ); +blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ... + 'Color',color_blislpab, 'LineStyle',lines_blislpab, ... + 'LineWidth',linesize ); +eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ... + 'Color',color_eigen, 'LineStyle',lines_eigen, ... + 'LineWidth',linesize ); +open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ... + 'Color',color_open, 'LineStyle',lines_open, ... + 'LineWidth',linesize ); +vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ... + 'Color',color_vend, 'LineStyle',lines_vend, ... + 'LineWidth',linesize ); +elseif theid == legend_plot_id +blissup_ln = line( nan, nan, ... + 'Color',color_blissup, 'LineStyle',lines_blissup, ... + 'LineWidth',linesize ); +blislpab_ln = line( nan, nan, ... + 'Color',color_blislpab, 'LineStyle',lines_blislpab, ... + 'LineWidth',linesize ); +eigen_ln = line( nan, nan, ... + 'Color',color_eigen, 'LineStyle',lines_eigen, ... + 'LineWidth',linesize ); +open_ln = line( nan, nan, ... + 'Color',color_open, 'LineStyle',lines_open, ... + 'LineWidth',linesize ); +vend_ln = line( nan, nan, ... + 'Color',color_vend, 'LineStyle',lines_vend, ... 
+ 'LineWidth',linesize ); +end + + +xlim( ax1, [x_begin x_end] ); +ylim( ax1, [y_begin y_end] ); + +if mod(theid-1,cols) == 3 || mod(theid-1,cols) == 4 || mod(theid-1,cols) == 5 + if nth == 12 + ylim( ax1, [y_begin y_end/2] ); + elseif nth > 12 + ylim( ax1, [y_begin y_end/6] ); + end +end + +if 10000 <= x_end && x_end < 15000 + x_tick2 = x_end - 2000; + x_tick1 = x_tick2/2; + %xticks( ax1, [ x_tick1 x_tick2 ] ); + xticks( ax1, [ 4000 8000 12000 ] ); +elseif 6000 <= x_end && x_end < 10000 + x_tick2 = x_end - 2000; + x_tick1 = x_tick2/2; + %xticks( ax1, [ x_tick1 x_tick2 ] ); + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 4000 <= x_end && x_end < 6000 + x_tick2 = x_end - 1000; + x_tick1 = x_tick2/2; + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 2000 <= x_end && x_end < 3000 + x_tick2 = x_end - 400; + x_tick1 = x_tick2/2; + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 500 <= x_end && x_end < 1000 + x_tick3 = x_end*(3/4); + x_tick2 = x_end*(2/4); + x_tick1 = x_end*(1/4); + xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] ); +end + +if show_plot == 1 || theid == legend_plot_id + if theid == legend_plot_id + leg = legend( ... + [ ... + blissup_ln ... + blislpab_ln ... + eigen_ln ... + open_ln ... + vend_ln ... + ], ... + blissup_legend, ... + blislpab_legend, ... + eigen_legend, ... + open_legend, ... + vend_legend, ... + 'Location', legend_loc ); + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + if impl == 'octave' + set( leg,'FontSize',fontsize ); + %set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl) + set( leg,'Position',[18.80 10.60 1.9 0.95 ] ); % (1,4tl) + else + set( leg,'FontSize',fontsize-1 ); + set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl) + end + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + % xpos ypos + %set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl) + end +end + +set( ax1,'FontSize',fontsize ); +set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1. 
+box( ax1, 'on' ); + +titl = title( titlename ); +set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'. + +% The default is to align the plot title across whole figure, not the box. +% This is a hack to nudge the title back to the center of the box. +if impl == 'octave' + tpos = get( titl, 'Position' ); + % For some reason, the titles in the graphs in the last column start + % off in a different relative position than the graphs in the other + % columns. Here, we manually account for that. + if mod(theid-1,cols) == 6 + tpos(1) = tpos(1) + -10; + else + tpos(1) = tpos(1) + -40; + end + set( titl, 'Position', tpos ); + set( titl, 'FontSize', fontsize ); +else % impl == 'matlab' + tpos = get( titl, 'Position' ); + tpos(1) = tpos(1) + 90; + set( titl, 'Position', tpos ); +end + +if theid > (rows-1)*cols + %xlab = xlabel( ax1,xaxisname ); + %tpos = get( xlab, 'Position' ) + %tpos(2) = tpos(2) + 10; + %set( xlab, 'Position', tpos ); + if theid == rows*cols - 6 + xlab = xlabel( ax1, 'm = 6; n = k' ); + elseif theid == rows*cols - 5 + xlab = xlabel( ax1, 'n = 8; m = k' ); + elseif theid == rows*cols - 4 + xlab = xlabel( ax1, 'k = 10; m = n' ); + elseif theid == rows*cols - 3 + xlab = xlabel( ax1, 'm; n = 8, k = 10' ); + elseif theid == rows*cols - 2 + xlab = xlabel( ax1, 'n; m = 6, k = 10' ); + elseif theid == rows*cols - 1 + xlab = xlabel( ax1, 'k; m = 6, n = 8' ); + elseif theid == rows*cols - 0 + xlab = xlabel( ax1, 'm = n = k' ); + end +end + +if mod(theid-1,cols) == 0 + ylab = ylabel( ax1,yaxisname ); +end + +r_val = 0; + diff --git a/test/supmt/octave/plot_panel_trxsh.m b/test/sup/octave_mt/plot_panel_trxsh.m similarity index 97% rename from test/supmt/octave/plot_panel_trxsh.m rename to test/sup/octave_mt/plot_panel_trxsh.m index cf5e860a5..b9fac8ff9 100644 --- a/test/supmt/octave/plot_panel_trxsh.m +++ b/test/sup/octave_mt/plot_panel_trxsh.m @@ -54,7 +54,7 @@ if 1 == 1 set(gcf,'PaperPosition', [0 0 11.5 20.4]); 
set(gcf,'PaperPositionMode','manual'); else % impl == 'octave' % octave 4.x - set(gcf,'PaperSize', [12 21.5]); + set(gcf,'PaperSize', [12 22.0]); set(gcf,'PaperPositionMode','auto'); end set(gcf,'PaperOrientation','landscape'); @@ -145,8 +145,8 @@ outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth ); %print(gcf, 'gemm_md','-fillpage','-dpdf'); %print(gcf, outfile,'-bestfit','-dpdf'); if impl == 'octave' -print(gcf, outfile); + print(gcf, outfile); else % if impl == 'matlab' -print(gcf, outfile,'-bestfit','-dpdf'); + print(gcf, outfile,'-bestfit','-dpdf'); end diff --git a/test/sup/octave_mt/runthese.m b/test/sup/octave_mt/runthese.m new file mode 100644 index 000000000..3d9afdfe4 --- /dev/null +++ b/test/sup/octave_mt/runthese.m @@ -0,0 +1,8 @@ +% kabylake +plot_panel_trxsh(3.80,16,4,'mt','d','rrr',[ 6 8 10 ],'../results/kabylake/20200302/mnkt100000_mt4','kbl','MKL','octave'); close; clear all; + +% haswell +plot_panel_trxsh(3.1,16,12,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20200302/mnkt100000_mt12','has','MKL','octave'); close; clear all; + +% epyc +plot_panel_trxsh(2.55,8,32,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20200302/mnkt100000_mt32','epyc','MKL','octave'); close; clear all; diff --git a/test/sup/octave/gen_opsupnames.m b/test/sup/octave_st/gen_opsupnames.m similarity index 99% rename from test/sup/octave/gen_opsupnames.m rename to test/sup/octave_st/gen_opsupnames.m index a1a226a76..b70c8a12a 100644 --- a/test/sup/octave/gen_opsupnames.m +++ b/test/sup/octave_st/gen_opsupnames.m @@ -34,4 +34,3 @@ end r_val1 = opsupnames; r_val2 = opnames; -end diff --git a/test/sup/octave_st/plot_l3sup_perf.m b/test/sup/octave_st/plot_l3sup_perf.m new file mode 100644 index 000000000..8a615ada5 --- /dev/null +++ b/test/sup/octave_st/plot_l3sup_perf.m @@ -0,0 +1,328 @@ +function r_val = plot_l3sup_perf( opname, ... + data_blissup, ... + data_blislpab, ... + data_eigen, ... + data_open, ... + data_bfeo, ... + data_xsmm, ... 
+ data_vend, vend_str, ... + nth, ... + rows, cols, ... + cfreq, ... + dfps, ... + theid, impl ) + +%if ... %mod(theid-1,cols) == 2 || ... +% ... %mod(theid-1,cols) == 3 || ... +% ... %mod(theid-1,cols) == 4 || ... +% 0 == 1 ... %theid >= 19 +% show_plot = 0; +%else + show_plot = 1; +%end + +%legend_plot_id = 11; +legend_plot_id = 2*cols + 1*5; + +if 1 + ax1 = subplot( rows, cols, theid ); + hold( ax1, 'on' ); +end + +% Set line properties. +color_blissup = 'k'; lines_blissup = '-'; markr_blissup = ''; +color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = ''; +color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o'; +color_open = 'r'; lines_open = '--'; markr_open = 'o'; +color_bfeo = 'c'; lines_bfeo = '-'; markr_bfeo = 'o'; +color_xsmm = 'g'; lines_xsmm = '-'; markr_xsmm = 'o'; +color_vend = 'b'; lines_vend = '-.'; markr_vend = '.'; + +% Compute the peak performance in terms of the number of double flops +% executable per cycle and the clock rate. +if opname(1) == 's' || opname(1) == 'c' + flopspercycle = dfps * 2; +else + flopspercycle = dfps; +end +max_perf_core = (flopspercycle * cfreq) * 1; + +% Escape underscores in the title. +title_opname = strrep( opname, '_', '\_' ); + +% Print the title to a string. +titlename = '%s'; +titlename = sprintf( titlename, title_opname ); + +% Set the legend strings. +blissup_legend = sprintf( 'BLIS sup' ); +blislpab_legend = sprintf( 'BLIS conv' ); +eigen_legend = sprintf( 'Eigen' ); +open_legend = sprintf( 'OpenBLAS' ); +bfeo_legend = sprintf( 'BLASFEO' ); +xsmm_legend = sprintf( 'libxsmm' ); +%vend_legend = sprintf( 'MKL' ); +%vend_legend = sprintf( 'ARMPL' ); +vend_legend = vend_str; + +% Set axes range values. +y_scale = 1.00; +x_begin = 0; +%x_end is set below. +y_begin = 0; +y_end = max_perf_core * y_scale; + +% Set axes names. 
+if nth == 1 + yaxisname = 'GFLOPS'; +else + yaxisname = 'GFLOPS/core'; +end + + +%flopscol = 4; +flopscol = size( data_blissup, 2 ); +msize = 5; +if 1 + fontsize = 12; +else + fontsize = 16; +end +linesize = 0.5; +legend_loc = 'southeast'; + +% -------------------------------------------------------------------- + +% Automatically detect a column with the increasing problem size. +% Then set the maximum x-axis value. +for psize_col = 1:3 + if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col ) + break; + end +end +x_axis( :, 1 ) = data_blissup( :, psize_col ); + +% Compute the number of data points we have in the x-axis. Note that we +% only use half the data points for the m = n = k column of graphs. +%if mod(theid-1,cols) == 6 +% np = size( data_blissup, 1 ) / 2; +%else +% np = size( data_blissup, 1 ); +%end +np = size( data_blissup, 1 ); + +has_xsmm = 1; +if data_xsmm( 1, flopscol ) == 0.0 + has_xsmm = 0; +end + +% Grab the last x-axis value. +x_end = data_blissup( np, psize_col ); + +%data_peak( 1, 1:2 ) = [ 0 max_perf_core ]; +%data_peak( 2, 1:2 ) = [ x_end max_perf_core ]; + +if show_plot == 1 +blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ... + 'Color',color_blissup, 'LineStyle',lines_blissup, ... + 'LineWidth',linesize ); +blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ... + 'Color',color_blislpab, 'LineStyle',lines_blislpab, ... + 'LineWidth',linesize ); +eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ... + 'Color',color_eigen, 'LineStyle',lines_eigen, ... + 'LineWidth',linesize ); +open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ... + 'Color',color_open, 'LineStyle',lines_open, ... + 'LineWidth',linesize ); +bfeo_ln = line( x_axis( 1:np, 1 ), data_bfeo( 1:np, flopscol ) / nth, ... + 'Color',color_bfeo, 'LineStyle',lines_bfeo, ... 
+ 'LineWidth',linesize ); +if has_xsmm == 1 +xsmm_ln = line( x_axis( 1:np, 1 ), data_xsmm( 1:np, flopscol ) / nth, ... + 'Color',color_xsmm, 'LineStyle',lines_xsmm, ... + 'LineWidth',linesize ); +else +xsmm_ln = line( nan, nan, ... + 'Color',color_xsmm, 'LineStyle',lines_xsmm, ... + 'LineWidth',linesize ); +end +vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ... + 'Color',color_vend, 'LineStyle',lines_vend, ... + 'LineWidth',linesize ); +elseif theid == legend_plot_id +blissup_ln = line( nan, nan, ... + 'Color',color_blissup, 'LineStyle',lines_blissup, ... + 'LineWidth',linesize ); +blislpab_ln = line( nan, nan, ... + 'Color',color_blislpab, 'LineStyle',lines_blislpab, ... + 'LineWidth',linesize ); +eigen_ln = line( nan, nan, ... + 'Color',color_eigen, 'LineStyle',lines_eigen, ... + 'LineWidth',linesize ); +open_ln = line( nan, nan, ... + 'Color',color_open, 'LineStyle',lines_open, ... + 'LineWidth',linesize ); +bfeo_ln = line( nan, nan, ... + 'Color',color_bfeo, 'LineStyle',lines_bfeo, ... + 'LineWidth',linesize ); +xsmm_ln = line( nan, nan, ... + 'Color',color_xsmm, 'LineStyle',lines_xsmm, ... + 'LineWidth',linesize ); +vend_ln = line( nan, nan, ... + 'Color',color_vend, 'LineStyle',lines_vend, ... 
+ 'LineWidth',linesize ); +end + + +xlim( ax1, [x_begin x_end] ); +ylim( ax1, [y_begin y_end] ); + +if 10000 <= x_end && x_end < 15000 + x_tick2 = x_end - 2000; + x_tick1 = x_tick2/2; + %xticks( ax1, [ x_tick1 x_tick2 ] ); + xticks( ax1, [ 3000 6000 9000 12000 ] ); +elseif 6000 <= x_end && x_end < 10000 + x_tick2 = x_end - 2000; + x_tick1 = x_tick2/2; + %xticks( ax1, [ x_tick1 x_tick2 ] ); + xticks( ax1, [ 2000 4000 6000 8000 ] ); +elseif 4000 <= x_end && x_end < 6000 + x_tick2 = x_end - 1000; + x_tick1 = x_tick2/2; + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 2000 <= x_end && x_end < 3000 + x_tick2 = x_end - 400; + x_tick1 = x_tick2/2; + xticks( ax1, [ x_tick1 x_tick2 ] ); +elseif 500 <= x_end && x_end < 1000 + x_tick3 = x_end*(3/4); + x_tick2 = x_end*(2/4); + x_tick1 = x_end*(1/4); + xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] ); +end + +if show_plot == 1 || theid == legend_plot_id + if nth == 1 && theid == legend_plot_id + if has_xsmm == 1 + leg = legend( ... + [ ... + blissup_ln ... + blislpab_ln ... + eigen_ln ... + open_ln ... + bfeo_ln ... + xsmm_ln ... + vend_ln ... + ], ... + blissup_legend, ... + blislpab_legend, ... + eigen_legend, ... + open_legend, ... + bfeo_legend, ... + xsmm_legend, ... + vend_legend, ... + 'Location', legend_loc ); + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + if impl == 'octave' + set( leg,'FontSize',fontsize ); + set( leg,'Position',[15.40 4.75 1.9 1.20] ); % (1,4tl) + else + set( leg,'FontSize',fontsize-3 ); + set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl) + end + else + leg = legend( ... + [ ... + blissup_ln ... + blislpab_ln ... + eigen_ln ... + open_ln ... + bfeo_ln ... + vend_ln ... + ], ... + blissup_legend, ... + blislpab_legend, ... + eigen_legend, ... + open_legend, ... + bfeo_legend, ... + vend_legend, ... 
+ 'Location', legend_loc ); + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + if impl == 'octave' + set( leg,'FontSize',fontsize ); + set( leg,'Position',[15.40 7.65 1.9 1.10] ); % (1,4tl) + else + set( leg,'FontSize',fontsize-1 ); + set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl) + end + end + set( leg,'Box','off' ); + set( leg,'Color','none' ); + set( leg,'Units','inches' ); + % xpos ypos + %set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl) + elseif nth > 1 && theid == legend_plot_id + end +end + +set( ax1,'FontSize',fontsize ); +set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1. +box( ax1, 'on' ); + +titl = title( titlename ); +set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'. + +% The default is to align the plot title across whole figure, not the box. +% This is a hack to nudge the title back to the center of the box. +if impl == 'octave' + tpos = get( titl, 'Position' ); + % For some reason, the titles in the graphs in the last column start + % off in a different relative position than the graphs in the other + % columns. Here, we manually account for that. 
+ if mod(theid-1,cols) == 6 + tpos(1) = tpos(1) + -10; + else + tpos(1) = tpos(1) + -40; + end + set( titl, 'Position', tpos ); + set( titl, 'FontSize', fontsize ); +else % impl == 'matlab' + tpos = get( titl, 'Position' ); + tpos(1) = tpos(1) + 90; + set( titl, 'Position', tpos ); +end + +if theid > (rows-1)*cols + %xlab = xlabel( ax1,xaxisname ); + %tpos = get( xlab, 'Position' ) + %tpos(2) = tpos(2) + 10; + %set( xlab, 'Position', tpos ); + if theid == rows*cols - 6 + xlab = xlabel( ax1, 'm = 6; n = k' ); + elseif theid == rows*cols - 5 + xlab = xlabel( ax1, 'n = 8; m = k' ); + elseif theid == rows*cols - 4 + xlab = xlabel( ax1, 'k = 4; m = n' ); + elseif theid == rows*cols - 3 + xlab = xlabel( ax1, 'm; n = 8, k = 4' ); + elseif theid == rows*cols - 2 + xlab = xlabel( ax1, 'n; m = 6, k = 4' ); + elseif theid == rows*cols - 1 + xlab = xlabel( ax1, 'k; m = 6, n = 8' ); + elseif theid == rows*cols - 0 + xlab = xlabel( ax1, 'm = n = k' ); + end +end + +if mod(theid-1,cols) == 0 + ylab = ylabel( ax1,yaxisname ); +end + +r_val = 0; + diff --git a/test/sup/octave/plot_panel_trxsh.m b/test/sup/octave_st/plot_panel_trxsh.m similarity index 97% rename from test/sup/octave/plot_panel_trxsh.m rename to test/sup/octave_st/plot_panel_trxsh.m index ebc216e3b..8ba709257 100644 --- a/test/sup/octave/plot_panel_trxsh.m +++ b/test/sup/octave_st/plot_panel_trxsh.m @@ -48,7 +48,7 @@ n_opsupnames = size( opsupnames, 1 ); if 1 == 1 %fig = figure('Position', [100, 100, 2400, 1500]); - fig = figure('Position', [100, 100, 2800, 1500]); + fig = figure('Position', [100, 100, 2400, 1200]); orient( fig, 'portrait' ); set(gcf,'PaperUnits', 'inches'); if impl == 'matlab' @@ -56,7 +56,7 @@ if 1 == 1 set(gcf,'PaperPosition', [0 0 11.5 20.4]); set(gcf,'PaperPositionMode','manual'); else % impl == 'octave' % octave 4.x - set(gcf,'PaperSize', [10 17.5]); + set(gcf,'PaperSize', [12 22.0]); set(gcf,'PaperPositionMode','auto'); end set(gcf,'PaperOrientation','landscape'); @@ -165,8 +165,8 @@ outfile = 
sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth ); %print(gcf, 'gemm_md','-fillpage','-dpdf'); %print(gcf, outfile,'-bestfit','-dpdf'); if impl == 'octave' -print(gcf, outfile); + print(gcf, outfile); else % if impl == 'matlab' -print(gcf, outfile,'-bestfit','-dpdf'); + print(gcf, outfile,'-bestfit','-dpdf'); end diff --git a/test/sup/octave/runme.m b/test/sup/octave_st/runme.m similarity index 100% rename from test/sup/octave/runme.m rename to test/sup/octave_st/runme.m diff --git a/test/sup/octave_st/runthese.m b/test/sup/octave_st/runthese.m new file mode 100644 index 000000000..86b738b2e --- /dev/null +++ b/test/sup/octave_st/runthese.m @@ -0,0 +1,8 @@ +% kabylake +plot_panel_trxsh(3.80,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20200302/mnkt100000_st','kbl','MKL','octave'); close; clear all; + +% haswell +plot_panel_trxsh(3.5,16,1,'st','d','rrr',[ 6 8 4 ],'../results/haswell/20200302/mnkt100000_st','has','MKL','octave'); close; clear all; + +% epyc +plot_panel_trxsh(3.00, 8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20200302/mnkt100000_st','epyc','MKL','octave'); close; clear all; diff --git a/test/supmt/Makefile b/test/sup/old/supmt/Makefile similarity index 79% rename from test/supmt/Makefile rename to test/sup/old/supmt/Makefile index 2ed93565d..5004aff77 100644 --- a/test/supmt/Makefile +++ b/test/sup/old/supmt/Makefile @@ -45,12 +45,13 @@ # --- Makefile PHONY target definitions ---------------------------------------- # -.PHONY: all all-st all-mt \ - blis blis-st blis-mt \ +.PHONY: all \ + st mt \ + blissup-st blislpab-st eigen-st openblas-st vendor-st blasfeo-st libxsmm-st \ + blissup-mt blislpab-mt eigen-mt openblas-mt vendor-mt \ clean cleanx - # # --- Determine makefile fragment location ------------------------------------- # @@ -71,7 +72,6 @@ SHARE_PATH := ../.. 
endif - # # --- Include common makefile definitions -------------------------------------- # @@ -80,7 +80,6 @@ endif -include $(SHARE_PATH)/common.mk - # # --- BLAS and LAPACK implementations ------------------------------------------ # @@ -141,15 +140,40 @@ VENDORP_LIB := $(MKLP_LIB) # --- Problem size definitions ------------------------------------------------- # -# Single core -PS_BEGIN := 4 -PS_MAX := 800 -PS_INC := 4 +# The problem size range specification is done separately for single-threaded +# and multithreaded execution. Within each threadedness scenario, we allow for +# separate range specifications for cases with: +# - 3L: three large/variable dimensions and no small/constant dimensions +# - 2L: two large/variable dimensions and one small/constant dimension +# - 1L: one large/variable dimension and two small/constant dimensions -# Multicore -P1_BEGIN := 8 -P1_MAX := 1600 -P1_INC := 8 +# -- Single-threaded -- + +PS_BEGIN_3L := 2 +PS_MAX_3L := 400 +PS_INC_3L := 2 + +PS_BEGIN_2L := 4 +PS_MAX_2L := 800 +PS_INC_2L := 4 + +PS_BEGIN_1L := 32 +PS_MAX_1L := 6400 +PS_INC_1L := 32 + +# -- Multithreaded -- + +P1_BEGIN_3L := 4 +P1_MAX_3L := 800 +P1_INC_3L := 4 + +P1_BEGIN_2L := 8 +P1_MAX_2L := 1600 +P1_INC_2L := 8 + +P1_BEGIN_1L := 64 +P1_MAX_1L := 12800 +P1_INC_1L := 64 # @@ -186,21 +210,21 @@ CXXFLAGS += -I$(EIGEN_INC) CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS)) CXXFLAGS_MT := -march=native $(CXXFLAGS) -# Single or multithreaded string +# Single or multithreaded string. STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Number of trials per problem size. N_TRIALS := -DN_TRIALS=3 -# Problem size specification -PDEF_ST := -DP_BEGIN=$(PS_BEGIN) \ - -DP_MAX=$(PS_MAX) \ - -DP_INC=$(PS_INC) +# Problem size specification. 
+PDEF_ST_1L := -DP_BEGIN=$(PS_BEGIN_1L) -DP_MAX=$(PS_MAX_1L) -DP_INC=$(PS_INC_1L) +PDEF_ST_2L := -DP_BEGIN=$(PS_BEGIN_2L) -DP_MAX=$(PS_MAX_2L) -DP_INC=$(PS_INC_2L) +PDEF_ST_3L := -DP_BEGIN=$(PS_BEGIN_3L) -DP_MAX=$(PS_MAX_3L) -DP_INC=$(PS_INC_3L) -PDEF_MT := -DP_BEGIN=$(P1_BEGIN) \ - -DP_MAX=$(P1_MAX) \ - -DP_INC=$(P1_INC) +PDEF_MT_1L := -DP_BEGIN=$(P1_BEGIN_1L) -DP_MAX=$(P1_MAX_1L) -DP_INC=$(P1_INC_1L) +PDEF_MT_2L := -DP_BEGIN=$(P1_BEGIN_2L) -DP_MAX=$(P1_MAX_2L) -DP_INC=$(P1_INC_2L) +PDEF_MT_3L := -DP_BEGIN=$(P1_BEGIN_3L) -DP_MAX=$(P1_MAX_3L) -DP_INC=$(P1_INC_3L) ifeq ($(E),1) ERRCHK := -DERROR_CHECK @@ -241,9 +265,15 @@ SHAPES := l_l_s \ l_s_s \ l_l_l -SMS := 6 -SNS := 8 -SKS := 10 +# Define the small/constant m, n, and k dimensions for single core and multicore +# experiments. +SMS_ST := 6 +SNS_ST := 8 +SKS_ST := 4 + +SMS_MT := 6 +SNS_MT := 8 +SKS_MT := 10 # @@ -262,6 +292,15 @@ get-1of3 = $(word 1,$(subst _, ,$(1))) get-2of3 = $(word 2,$(subst _, ,$(1))) get-3of3 = $(word 3,$(subst _, ,$(1))) +# A function to return the correct PDEFS_ST variable given the shape string. +get-pdefs = $(strip $(subst l_l_l,$(PDEF_MT_3L), \ + $(subst l_l_s,$(PDEF_MT_2L), \ + $(subst l_s_l,$(PDEF_MT_2L), \ + $(subst s_l_l,$(PDEF_MT_2L), \ + $(subst s_s_l,$(PDEF_MT_1L), \ + $(subst s_l_s,$(PDEF_MT_1L), \ + $(subst l_s_s,$(PDEF_MT_1L),$(1))))))))) + # Datatype defs. get-dt-cpp = $(strip \ $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\ @@ -338,29 +377,31 @@ XSTORS0 = ccc # --- Object and binary file definitons ---------------------------------------- # +# -- Single-threaded -- + get-st-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_st.o))))))) # Build a list of object files and binaries for each single-threaded # implementation using the get-st-objs() function defined above. 
-BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup) +BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),blissup) BLISSUP_ST_BINS := $(patsubst %.o,%.x,$(BLISSUP_ST_OBJS)) -BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab) +BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),blislpab) BLISLPAB_ST_BINS := $(patsubst %.o,%.x,$(BLISLPAB_ST_OBJS)) -EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen) +EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),eigen) EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS)) -OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas) +OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),openblas) OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS)) -BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo) +BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),blasfeo) BLASFEO_ST_BINS := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS)) -LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm) +LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),libxsmm) LIBXSMM_ST_BINS := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS)) -VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor) +VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_ST),$(SNS_ST),$(SKS_ST),vendor) VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) # Mark the object files as intermediate so 
that make will remove them @@ -373,23 +414,25 @@ VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) $(LIBXSMM_ST_OBJS) \ $(VENDOR_ST_OBJS) +# -- Multithreaded -- + get-mt-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_mt.o))))))) # Build a list of object files and binaries for each multithreaded # implementation using the get-st-objs() function defined above. -BLISSUP_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup) +BLISSUP_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),blissup) BLISSUP_MT_BINS := $(patsubst %.o,%.x,$(BLISSUP_MT_OBJS)) -BLISLPAB_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab) +BLISLPAB_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),blislpab) BLISLPAB_MT_BINS := $(patsubst %.o,%.x,$(BLISLPAB_MT_OBJS)) -EIGEN_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen) +EIGEN_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),eigen) EIGEN_MT_BINS := $(patsubst %.o,%.x,$(EIGEN_MT_OBJS)) -OPENBLAS_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas) +OPENBLAS_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),openblas) OPENBLAS_MT_BINS := $(patsubst %.o,%.x,$(OPENBLAS_MT_OBJS)) -VENDOR_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor) +VENDOR_MT_OBJS := $(call get-mt-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS_MT),$(SNS_MT),$(SKS_MT),vendor) VENDOR_MT_BINS := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS)) #$(error "objs = $(EIGEN_ST_BINS)" ) @@ -404,20 +447,21 @@ 
VENDOR_MT_BINS := $(patsubst %.o,%.x,$(VENDOR_MT_OBJS)) # -# --- Targets/rules ------------------------------------------------------------ +# --- High-level targets/rules ------------------------------------------------- # all: st -blis: blissup-st blislpab-st +#blis: blissup-st blislpab-st +#blissup: blissup-st +#blislpab: blislpab-st +#eigen: eigen-st +#openblas: openblas-st +#blasfeo: blasfeo-st +#libxsmm: libxsmm-st +#vendor: vendor-st -blissup: blissup-st -blislpab: blislpab-st -eigen: eigen-st -openblas: openblas-st -blasfeo: blasfeo-st -libxsmm: libxsmm-st -vendor: vendor-st +# -- Single-threaded -- st: blissup-st blislpab-st \ eigen-st openblas-st blasfeo-st libxsmm-st vendor-st @@ -430,6 +474,8 @@ blasfeo-st: $(BLASFEO_ST_BINS) libxsmm-st: $(LIBXSMM_ST_BINS) vendor-st: $(VENDOR_ST_BINS) +# -- Multithreaded -- + mt: blissup-mt blislpab-mt \ eigen-mt openblas-mt vendor-mt @@ -440,14 +486,15 @@ openblas-mt: $(OPENBLAS_MT_BINS) vendor-mt: $(VENDOR_MT_BINS) - -# --Object file rules -- +# --- Object file rules -------------------------------------------------------- # Define the implementations for which we will instantiate compilation rules. BIMPLS_ST := blissup blislpab openblas blasfeo libxsmm vendor BIMPLS_MT := blissup blislpab openblas vendor EIMPLS := eigen +# -- Single-threaded BLAS -- + # 1 2 3 4 567 8 # test_dgemm_nn_rrr_mpn6kp_blissup_st.x @@ -455,7 +502,7 @@ EIMPLS := eigen # for the various single-threaded implementations. 
define make-st-rule test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile - $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ + $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ endef # Instantiate the rule function make-st-rule() for each BLIS/BLAS/CBLAS @@ -464,17 +511,19 @@ $(foreach dt,$(DTS), \ $(foreach tr,$(TRANS), \ $(foreach st,$(STORS), \ $(foreach sh,$(SHAPES), \ -$(foreach sm,$(SMS), \ -$(foreach sn,$(SNS), \ -$(foreach sk,$(SKS), \ +$(foreach sm,$(SMS_ST), \ +$(foreach sn,$(SNS_ST), \ +$(foreach sk,$(SKS_ST), \ $(foreach impl,$(BIMPLS_ST), \ $(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) +# -- Multithreaded BLAS -- + # Define the function that will be used to instantiate compilation rules # for the various multithreaded implementations. 
define make-mt-rule test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile - $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ + $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ endef # Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS @@ -483,17 +532,19 @@ $(foreach dt,$(DTS), \ $(foreach tr,$(TRANS), \ $(foreach st,$(STORS), \ $(foreach sh,$(SHAPES), \ -$(foreach sm,$(SMS), \ -$(foreach sn,$(SNS), \ -$(foreach sk,$(SKS), \ +$(foreach sm,$(SMS_MT), \ +$(foreach sn,$(SNS_MT), \ +$(foreach sk,$(SKS_MT), \ $(foreach impl,$(BIMPLS_MT), \ $(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) +# -- Single-threaded Eigen -- + # Define the function that will be used to instantiate compilation rules # for the single-threaded Eigen implementation. define make-eigst-rule test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile - $(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ + $(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ endef # Instantiate the rule function make-st-rule() for each Eigen implementation. 
@@ -501,17 +552,19 @@ $(foreach dt,$(DTS), \ $(foreach tr,$(TRANS), \ $(foreach st,$(STORS), \ $(foreach sh,$(SHAPES), \ -$(foreach sm,$(SMS), \ -$(foreach sn,$(SNS), \ -$(foreach sk,$(SKS), \ +$(foreach sm,$(SMS_ST), \ +$(foreach sn,$(SNS_ST), \ +$(foreach sk,$(SKS_ST), \ $(foreach impl,$(EIMPLS), \ $(eval $(call make-eigst-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) +# -- Multithreaded Eigen -- + # Define the function that will be used to instantiate compilation rules # for the multithreaded Eigen implementation. define make-eigmt-rule test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile - $(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(PDEF_MT) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ + $(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@ endef # Instantiate the rule function make-st-rule() for each Eigen implementation. @@ -519,20 +572,22 @@ $(foreach dt,$(DTS), \ $(foreach tr,$(TRANS), \ $(foreach st,$(STORS), \ $(foreach sh,$(SHAPES), \ -$(foreach sm,$(SMS), \ -$(foreach sn,$(SNS), \ -$(foreach sk,$(SKS), \ +$(foreach sm,$(SMS_MT), \ +$(foreach sn,$(SNS_MT), \ +$(foreach sk,$(SKS_MT), \ $(foreach impl,$(EIMPLS), \ $(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) -# -- Executable file rules -- +# --- Executable file rules ---------------------------------------------------- # NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS # on the link command line in case BLIS was configured with the BLAS # compatibility layer. 
This prevents BLIS from inadvertently getting called # for the BLAS routines we are trying to test with. +# -- Single-threaded -- + test_%_blissup_st.x: test_%_blissup_st.o $(LIBBLIS_LINK) $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) @@ -554,6 +609,7 @@ test_%_libxsmm_st.x: test_%_libxsmm_st.o $(LIBBLIS_LINK) test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK) $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) +# -- Multithreaded -- test_%_blissup_mt.x: test_%_blissup_mt.o $(LIBBLIS_LINK) $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) diff --git a/test/sup/old/supmt/octave/gen_opsupnames.m b/test/sup/old/supmt/octave/gen_opsupnames.m new file mode 100644 index 000000000..a87c06cc2 --- /dev/null +++ b/test/sup/old/supmt/octave/gen_opsupnames.m @@ -0,0 +1,55 @@ +function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims ) + +nops = size( ops, 1 ); + +smallm = smalldims( 1 ); +smalln = smalldims( 2 ); +smallk = smalldims( 3 ); + +i = 1; + +for io = 1:nops + + op = ops( io, : ); + + % NOTE: This way of sprintf'ing doesn't work when the string lengths + % vary, as they would if any of the constant dimensions is greater + % than 9. 
+ %opsupnames( i+0, : ) = sprintf( '%s_%s_m%dnpkp ', op, stor, smallm ) + %opsupnames( i+1, : ) = sprintf( '%s_%s_mpn%dkp ', op, stor, smalln ) + %opsupnames( i+2, : ) = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ) + %opsupnames( i+3, : ) = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk ) + %opsupnames( i+4, : ) = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk ) + %opsupnames( i+5, : ) = sprintf( '%s_%s_m%dn%dkp ', op, stor, smallm, smalln ) + %opsupnames( i+6, : ) = sprintf( '%s_%s_mpnpkp ', op, stor ) + + str0 = sprintf( '%s_%s_m%dnpkp', op, stor, smallm ); + str1 = sprintf( '%s_%s_mpn%dkp', op, stor, smalln ); + str2 = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ); + str3 = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk ); + str4 = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk ); + str5 = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln ); + str6 = sprintf( '%s_%s_mpnpkp', op, stor ); + + opsupnames( i+0, : ) = sprintf( '%-22s', str0 ); + opsupnames( i+1, : ) = sprintf( '%-22s', str1 ); + opsupnames( i+2, : ) = sprintf( '%-22s', str2 ); + opsupnames( i+3, : ) = sprintf( '%-22s', str3 ); + opsupnames( i+4, : ) = sprintf( '%-22s', str4 ); + opsupnames( i+5, : ) = sprintf( '%-22s', str5 ); + opsupnames( i+6, : ) = sprintf( '%-22s', str6 ); + + opnames( i+0, : ) = sprintf( '%s', op ); + opnames( i+1, : ) = sprintf( '%s', op ); + opnames( i+2, : ) = sprintf( '%s', op ); + opnames( i+3, : ) = sprintf( '%s', op ); + opnames( i+4, : ) = sprintf( '%s', op ); + opnames( i+5, : ) = sprintf( '%s', op ); + opnames( i+6, : ) = sprintf( '%s', op ); + + i = i + 7; +end + +r_val1 = opsupnames; +r_val2 = opnames; + diff --git a/test/supmt/octave/plot_l3sup_perf.m b/test/sup/old/supmt/octave/plot_l3sup_perf.m similarity index 85% rename from test/supmt/octave/plot_l3sup_perf.m rename to test/sup/old/supmt/octave/plot_l3sup_perf.m index 28056e25a..d9ecf593f 100644 --- a/test/supmt/octave/plot_l3sup_perf.m +++ b/test/sup/old/supmt/octave/plot_l3sup_perf.m @@ 
-9,6 +9,7 @@ function r_val = plot_l3sup_perf( opname, ... cfreq, ... dfps, ... theid, impl ) + %if ... %mod(theid-1,cols) == 2 || ... % ... %mod(theid-1,cols) == 3 || ... % ... %mod(theid-1,cols) == 4 || ... @@ -19,11 +20,11 @@ function r_val = plot_l3sup_perf( opname, ... %end %legend_plot_id = 11; -legend_plot_id = 1*cols + 1*5; +legend_plot_id = 0*cols + 1*4; if 1 -ax1 = subplot( rows, cols, theid ); -hold( ax1, 'on' ); + ax1 = subplot( rows, cols, theid ); + hold( ax1, 'on' ); end % Set line properties. @@ -77,9 +78,9 @@ end flopscol = size( data_blissup, 2 ); msize = 5; if 1 -fontsize = 11; + fontsize = 12; else -fontsize = 16; + fontsize = 16; end linesize = 0.5; legend_loc = 'southeast'; @@ -125,8 +126,7 @@ open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ... vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ... 'Color',color_vend, 'LineStyle',lines_vend, ... 'LineWidth',linesize ); -else -if theid == legend_plot_id +elseif theid == legend_plot_id blissup_ln = line( nan, nan, ... 'Color',color_blissup, 'LineStyle',lines_blissup, ... 'LineWidth',linesize ); @@ -143,7 +143,6 @@ vend_ln = line( nan, nan, ... 'Color',color_vend, 'LineStyle',lines_vend, ... 
'LineWidth',linesize ); end -end xlim( ax1, [x_begin x_end] ); @@ -188,11 +187,11 @@ if show_plot == 1 || theid == legend_plot_id set( leg,'Color','none' ); set( leg,'Units','inches' ); if impl == 'octave' - set( leg,'FontSize',fontsize ); - set( leg,'Position',[12.50 10.35 1.5 0.9 ] ); % (1,4tl) + set( leg,'FontSize',fontsize ); + set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl) else - set( leg,'FontSize',fontsize-1 ); - set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl) + set( leg,'FontSize',fontsize-1 ); + set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl) end set( leg,'Box','off' ); set( leg,'Color','none' ); @@ -209,17 +208,31 @@ box( ax1, 'on' ); titl = title( titlename ); set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'. +% The default is to align the plot title across whole figure, not the box. +% This is a hack to nudge the title back to the center of the box. if impl == 'octave' -tpos = get( titl, 'Position' ); % default is to align across whole figure, not box. -tpos(1) = tpos(1) + -40; -set( titl, 'Position', tpos ); % here we nudge it back to centered with box. + tpos = get( titl, 'Position' ); + % For some reason, the titles in the graphs in the last column start + % off in a different relative position than the graphs in the other + % columns. Here, we manually account for that. 
+ if mod(theid-1,cols) == 6 + tpos(1) = tpos(1) + -10; + else + tpos(1) = tpos(1) + -40; + end + set( titl, 'Position', tpos ); + set( titl, 'FontSize', fontsize ); +else % impl == 'matlab' + tpos = get( titl, 'Position' ); + tpos(1) = tpos(1) + 90; + set( titl, 'Position', tpos ); end if theid > (rows-1)*cols -%xlab = xlabel( ax1,xaxisname ); -%tpos = get( xlab, 'Position' ) -%tpos(2) = tpos(2) + 10; -%set( xlab, 'Position', tpos ); + %xlab = xlabel( ax1,xaxisname ); + %tpos = get( xlab, 'Position' ) + %tpos(2) = tpos(2) + 10; + %set( xlab, 'Position', tpos ); if theid == rows*cols - 6 xlab = xlabel( ax1, 'm = 6; n = k' ); elseif theid == rows*cols - 5 @@ -238,13 +251,8 @@ if theid > (rows-1)*cols end if mod(theid-1,cols) == 0 -ylab = ylabel( ax1,yaxisname ); + ylab = ylabel( ax1,yaxisname ); end -%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' ); -%saveas( fig, filename_png ); - -%hold( ax1, 'off' ); - r_val = 0; diff --git a/test/sup/old/supmt/octave/plot_panel_trxsh.m b/test/sup/old/supmt/octave/plot_panel_trxsh.m new file mode 100644 index 000000000..b9fac8ff9 --- /dev/null +++ b/test/sup/old/supmt/octave/plot_panel_trxsh.m @@ -0,0 +1,152 @@ +function r_val = plot_panel_trxsh ... + ( ... + cfreq, ... + dflopspercycle, ... + nth, ... + thr_str, ... + dt_ch, ... + stor_str, ... + smalldims, ... + dirpath, ... + arch_str, ... + vend_str, ... + impl ... + ) + +%cfreq = 1.8; +%dflopspercycle = 32; + +% Create filename "templates" for the files that contain the performance +% results. +filetemp_blissup = '%s/output_%s_%s_blissup.m'; +filetemp_blislpab = '%s/output_%s_%s_blislpab.m'; +filetemp_eigen = '%s/output_%s_%s_eigen.m'; +filetemp_open = '%s/output_%s_%s_openblas.m'; +filetemp_vend = '%s/output_%s_%s_vendor.m'; + +% Create a variable name "template" for the variables contained in the +% files outlined above. +vartemp = 'data_%s_%s_%s( :, : )'; + +% Define the datatypes and operations we will be plotting. 
+oproot = sprintf( '%cgemm', dt_ch ); +ops( 1, : ) = sprintf( '%s_nn', oproot ); +ops( 2, : ) = sprintf( '%s_nt', oproot ); +ops( 3, : ) = sprintf( '%s_tn', oproot ); +ops( 4, : ) = sprintf( '%s_tt', oproot ); + +% Generate datatype-specific operation names from the set of operations +% and datatypes. +[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims ); +n_opsupnames = size( opsupnames, 1 ); + +%opsupnames +%opnames +%return + +if 1 == 1 + %fig = figure('Position', [100, 100, 2400, 1500]); + fig = figure('Position', [100, 100, 2400, 1200]); + orient( fig, 'portrait' ); + set(gcf,'PaperUnits', 'inches'); + if impl == 'matlab' + set(gcf,'PaperSize', [11.5 20.4]); + set(gcf,'PaperPosition', [0 0 11.5 20.4]); + set(gcf,'PaperPositionMode','manual'); + else % impl == 'octave' % octave 4.x + set(gcf,'PaperSize', [12 22.0]); + set(gcf,'PaperPositionMode','auto'); + end + set(gcf,'PaperOrientation','landscape'); +end + + +% Iterate over the list of datatype-specific operation names. +for opi = 1:n_opsupnames +%for opi = 1:1 + + % Grab the current datatype combination. + opsupname = opsupnames( opi, : ); + opname = opnames( opi, : ); + + opsupname = strtrim( opsupname ); + opname = strtrim( opname ); + + str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str); + + % Construct filenames for the data files from templates. + file_blissup = sprintf( filetemp_blissup, dirpath, thr_str, opsupname ); + file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname ); + file_eigen = sprintf( filetemp_eigen, dirpath, thr_str, opsupname ); + file_open = sprintf( filetemp_open, dirpath, thr_str, opsupname ); + file_vend = sprintf( filetemp_vend, dirpath, thr_str, opsupname ); + + % Load the data files. + %str = sprintf( ' Loading %s', file_blissup ); disp(str); + run( file_blissup ) + run( file_blislpab ) + run( file_eigen ) + run( file_open ) + run( file_vend ) + + % Construct variable names for the variables in the data files. 
+ var_blissup = sprintf( vartemp, thr_str, opname, 'blissup' ); + var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' ); + var_eigen = sprintf( vartemp, thr_str, opname, 'eigen' ); + var_open = sprintf( vartemp, thr_str, opname, 'openblas' ); + var_vend = sprintf( vartemp, thr_str, opname, 'vendor' ); + + % Use eval() to instantiate the variable names constructed above, + % copying each to a simplified name. + data_blissup = eval( var_blissup ); % e.g. data_st_dgemm_blissup( :, : ); + data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : ); + data_eigen = eval( var_eigen ); % e.g. data_st_dgemm_eigen( :, : ); + data_open = eval( var_open ); % e.g. data_st_dgemm_openblas( :, : ); + data_vend = eval( var_vend ); % e.g. data_st_dgemm_vendor( :, : ); + + %str = sprintf( ' Reading %s', var_blissup ); disp(str); + %str = sprintf( ' Reading %s', var_blislpab ); disp(str); + %str = sprintf( ' Reading %s', var_eigen ); disp(str); + %str = sprintf( ' Reading %s', var_open ); disp(str); + %str = sprintf( ' Reading %s', var_bfeo ); disp(str); + %str = sprintf( ' Reading %s', var_xsmm ); disp(str); + %str = sprintf( ' Reading %s', var_vend ); disp(str); + + % Plot one result in an m x n grid of plots, via the subplot() + % function. + if 1 == 1 + plot_l3sup_perf( opsupname, ... + data_blissup, ... + data_blislpab, ... + data_eigen, ... + data_open, ... + data_vend, vend_str, ... + nth, ... + 4, 7, ... + cfreq, ... + dflopspercycle, ... + opi, impl ); + + clear data_mt_*gemm_*; + clear data_blissup; + clear data_blislpab; + clear data_eigen; + clear data_open; + clear data_vend; + + end + +end + +% Construct the name of the file to which we will output the graph. +outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth ); + +% Output the graph to pdf format. 
+%print(gcf, 'gemm_md','-fillpage','-dpdf'); +%print(gcf, outfile,'-bestfit','-dpdf'); +if impl == 'octave' + print(gcf, outfile); +else % if impl == 'matlab' + print(gcf, outfile,'-bestfit','-dpdf'); +end + diff --git a/test/sup/old/supmt/octave/runthese.m b/test/sup/old/supmt/octave/runthese.m new file mode 100644 index 000000000..e11f8b173 --- /dev/null +++ b/test/sup/old/supmt/octave/runthese.m @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + +% kabylake +plot_panel_trxsh(3.80,16,4,'mt','d','rrr',[ 6 8 10 ],'../../sup/results/kabylake/20200302/mnkt100000_mt4','kbl','MKL','octave'); close; clear all; + +% haswell +plot_panel_trxsh(3.1,16,12,'mt','d','rrr',[ 6 8 10 ],'../../sup/results/haswell/20200302/mnkt100000_mt12','has','MKL','octave'); close; clear all; + +% epyc +plot_panel_trxsh(2.55,8,32,'mt','d','rrr',[ 6 8 10 ],'../../sup/results/epyc/20200302/mnkt100000_mt32','epyc','MKL','octave'); close; clear all; diff --git a/test/supmt/runme.sh b/test/sup/old/supmt/runme.sh similarity index 77% rename from test/supmt/runme.sh rename to test/sup/old/supmt/runme.sh index e878d76b0..911fbbaa4 100755 --- a/test/supmt/runme.sh +++ b/test/sup/old/supmt/runme.sh @@ -17,19 +17,19 @@ if [ ${sys} = "blis" ]; then elif [ ${sys} = "lonestar5" ]; then export GOMP_CPU_AFFINITY="0-23" - nt=24 + nt=12 elif [ ${sys} = "ul252" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" export GOMP_CPU_AFFINITY="0-51" - nt=52 + nt=26 elif [ ${sys} = "ul264" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" export GOMP_CPU_AFFINITY="0-63" - nt=64 + nt=32 fi @@ -37,7 +37,8 @@ fi delay=0.02 # Threadedness to test. -threads="mt" +#threads="st mt" +threads="st mt" # Datatypes to test. 
#dts="d s" @@ -96,17 +97,32 @@ for th in ${threads}; do for im in ${impls}; do - if [ "${im:0:4}" = "blis" ]; then - unset OMP_NUM_THREADS - export BLIS_NUM_THREADS=${nt} - elif [ "${im}" = "openblas" ]; then - unset OMP_NUM_THREADS - export OPENBLAS_NUM_THREADS=${nt} - elif [ "${im}" = "eigen" ]; then - export OMP_NUM_THREADS=${nt} - elif [ "${im}" = "vendor" ]; then - unset OMP_NUM_THREADS - export MKL_NUM_THREADS=${nt} + if [ "${th}" = "mt" ]; then + + # Specify the multithreading depending on which + # implementation is about to be tested. + if [ "${im:0:4}" = "blis" ]; then + unset OMP_NUM_THREADS + export BLIS_NUM_THREADS=${nt} + elif [ "${im}" = "openblas" ]; then + unset OMP_NUM_THREADS + export OPENBLAS_NUM_THREADS=${nt} + elif [ "${im}" = "eigen" ]; then + export OMP_NUM_THREADS=${nt} + elif [ "${im}" = "vendor" ]; then + unset OMP_NUM_THREADS + export MKL_NUM_THREADS=${nt} + fi + export nt_use=${nt} + + else # if [ "${th}" = "st" ]; + + # Use single-threaded execution. + export OMP_NUM_THREADS=1 + export BLIS_NUM_THREADS=1 + export OPENBLAS_NUM_THREADS=1 + export MKL_NUM_THREADS=1 + export nt_use=1 fi # Multithreaded OpenBLAS seems to have a problem @@ -168,7 +184,7 @@ for th in ${threads}; do # Construct the name of the output file. out_file="${out_root}_${th}_${dt}${op}_${tr}_${st}_${shstr}_${im}.m" - echo "Running (nt = ${nt}) ./${exec_name} > ${out_file}" + echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}" # Run executable. ./${exec_name} > ${out_file} diff --git a/test/sup/old/supmt/test_gemm.c b/test/sup/old/supmt/test_gemm.c new file mode 100644 index 000000000..23cc56440 --- /dev/null +++ b/test/sup/old/supmt/test_gemm.c @@ -0,0 +1,597 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Advanced Micro Devices, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#ifdef EIGEN + #define BLIS_DISABLE_BLAS_DEFS + #include "blis.h" + #include + //#include + using namespace Eigen; +#else + #include "blis.h" +#endif + +//#define PRINT + +int main( int argc, char** argv ) +{ + rntm_t rntm_g; + + bli_init(); + + // Copy the global rntm_t object so that we can use it later when disabling + // sup. 
Starting with a copy of the global rntm_t is actually necessary; + // if we start off with a locally-initialized rntm_t, it will not contain + // the ways of parallelism that were conveyed via environment variables, + // which is necessary when running this driver with multiple BLIS threads. + bli_rntm_init_from_global( &rntm_g ); + +#ifndef ERROR_CHECK + bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); +#endif + + + dim_t n_trials = N_TRIALS; + + num_t dt = DT; + +#if 1 + dim_t p_begin = P_BEGIN; + dim_t p_max = P_MAX; + dim_t p_inc = P_INC; +#else + dim_t p_begin = 4; + dim_t p_max = 40; + dim_t p_inc = 4; +#endif + +#if 1 + dim_t m_input = M_DIM; + dim_t n_input = N_DIM; + dim_t k_input = K_DIM; +#else + p_begin = p_inc = 32; + dim_t m_input = 6; + dim_t n_input = -1; + dim_t k_input = -1; +#endif + +#if 1 + trans_t transa = TRANSA; + trans_t transb = TRANSB; +#else + trans_t transa = BLIS_NO_TRANSPOSE; + trans_t transb = BLIS_NO_TRANSPOSE; +#endif + +#if 1 + stor3_t sc = STOR3; +#else + stor3_t sc = BLIS_RRR; +#endif + + + inc_t rs_c, cs_c; + inc_t rs_a, cs_a; + inc_t rs_b, cs_b; + + if ( sc == BLIS_RRR ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = -1; } + else if ( sc == BLIS_RRC ) { rs_c = cs_c = -1; rs_a = cs_a = -1; rs_b = cs_b = 0; } + else if ( sc == BLIS_RCR ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = -1; } + else if ( sc == BLIS_RCC ) { rs_c = cs_c = -1; rs_a = cs_a = 0; rs_b = cs_b = 0; } + else if ( sc == BLIS_CRR ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = -1; } + else if ( sc == BLIS_CRC ) { rs_c = cs_c = 0; rs_a = cs_a = -1; rs_b = cs_b = 0; } + else if ( sc == BLIS_CCR ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = -1; } + else if ( sc == BLIS_CCC ) { rs_c = cs_c = 0; rs_a = cs_a = 0; rs_b = cs_b = 0; } + else { bli_abort(); } + + f77_int cbla_storage; + + if ( sc == BLIS_RRR ) cbla_storage = CblasRowMajor; + else if ( sc == BLIS_CCC ) cbla_storage = CblasColMajor; + else cbla_storage = -1; + + ( void 
)cbla_storage; + + + char dt_ch; + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + + f77_char f77_transa; + f77_char f77_transb; + char transal, transbl; + + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); + + transal = tolower( f77_transa ); + transbl = tolower( f77_transb ); + + f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans ); + f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans ); + + ( void )cbla_transa; + ( void )cbla_transb; + + dim_t p; + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; + + printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch, + transal, transbl, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) + { + obj_t a, b, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, n, k; + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + bli_obj_create( dt, m, n, rs_c, cs_c, &c ); + bli_obj_create( dt, m, n, rs_c, cs_c, &c_save ); + + if ( bli_does_notrans( transa ) ) + bli_obj_create( dt, m, k, rs_a, cs_a, &a ); + else + bli_obj_create( dt, k, m, rs_a, cs_a, &a ); + + if ( bli_does_notrans( transb ) ) + bli_obj_create( dt, k, n, rs_b, cs_b, &b ); + 
else + bli_obj_create( dt, n, k, rs_b, cs_b, &b ); + + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); + + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + bli_setsc( (1.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &beta ); + + bli_copym( &c, &c_save ); + +#ifdef EIGEN + double alpha_r, alpha_i; + + bli_getsc( &alpha, &alpha_r, &alpha_i ); + + void* ap = bli_obj_buffer_at_off( &a ); + void* bp = bli_obj_buffer_at_off( &b ); + void* cp = bli_obj_buffer_at_off( &c ); + + const int os_a = ( bli_obj_is_col_stored( &a ) ? bli_obj_col_stride( &a ) + : bli_obj_row_stride( &a ) ); + const int os_b = ( bli_obj_is_col_stored( &b ) ? bli_obj_col_stride( &b ) + : bli_obj_row_stride( &b ) ); + const int os_c = ( bli_obj_is_col_stored( &c ) ? bli_obj_col_stride( &c ) + : bli_obj_row_stride( &c ) ); + + Stride stride_a( os_a, 1 ); + Stride stride_b( os_b, 1 ); + Stride stride_c( os_c, 1 ); + + #if defined(IS_FLOAT) + #elif defined (IS_DOUBLE) + #ifdef A_STOR_R + typedef Matrix MatrixXd_A; + #else + typedef Matrix MatrixXd_A; + #endif + #ifdef B_STOR_R + typedef Matrix MatrixXd_B; + #else + typedef Matrix MatrixXd_B; + #endif + #ifdef C_STOR_R + typedef Matrix MatrixXd_C; + #else + typedef Matrix MatrixXd_C; + #endif + + #ifdef A_NOTRANS // A is not transposed + Map > A( ( double* )ap, m, k, stride_a ); + #else // A is transposed + Map > A( ( double* )ap, k, m, stride_a ); + #endif + + #ifdef B_NOTRANS // B is not transposed + Map > B( ( double* )bp, k, n, stride_b ); + #else // B is transposed + Map > B( ( double* )bp, n, k, stride_b ); + #endif + + Map > C( ( double* )cp, m, n, stride_c ); + #endif +#endif + + + double dtime_save = DBL_MAX; + + for ( dim_t r = 0; r < n_trials; ++r ) + { + bli_copym( &c_save, &c ); + + + double dtime = bli_clock(); + + +#ifdef EIGEN + + #ifdef A_NOTRANS + #ifdef B_NOTRANS + C.noalias() += alpha_r * A * B; + #else // B_TRANS + C.noalias() += alpha_r * A * B.transpose(); + #endif + #else // A_TRANS + 
#ifdef B_NOTRANS + C.noalias() += alpha_r * A.transpose() * B; + #else // B_TRANS + C.noalias() += alpha_r * A.transpose() * B.transpose(); + #endif + #endif + +#endif +#ifdef BLIS + #ifdef SUP + // Allow sup. + bli_gemm( &alpha, + &a, + &b, + &beta, + &c ); + #else + // NOTE: We can't use the static initializer and must instead + // initialize the rntm_t with the copy from the global rntm_t we + // made at the beginning of main(). Please see the comment there + // for more info on why BLIS_RNTM_INITIALIZER doesn't work here. + //rntm_t rntm = BLIS_RNTM_INITIALIZER; + rntm_t rntm = rntm_g; + + // Disable sup and use the expert interface. + bli_rntm_disable_l3_sup( &rntm ); + + bli_gemm_ex( &alpha, + &a, + &b, + &beta, + &c, NULL, &rntm ); + #endif +#endif +#ifdef BLAS + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = ( float* )bli_obj_buffer( &alpha ); + float* ap = ( float* )bli_obj_buffer( &a ); + float* bp = ( float* )bli_obj_buffer( &b ); + float* betap = ( float* )bli_obj_buffer( &beta ); + float* cp = ( float* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_sgemm( &f77_transa, + #else + sgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = ( double* )bli_obj_buffer( &alpha ); + double* ap = ( double* )bli_obj_buffer( &a ); + double* bp = ( double* )bli_obj_buffer( &b ); + double* betap = ( double* )bli_obj_buffer( &beta ); + double* cp = ( 
double* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_dgemm( &f77_transa, + #else + dgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha ); + scomplex* ap = ( scomplex* )bli_obj_buffer( &a ); + scomplex* bp = ( scomplex* )bli_obj_buffer( &b ); + scomplex* betap = ( scomplex* )bli_obj_buffer( &beta ); + scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_cgemm( &f77_transa, + #else + cgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha ); + dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a ); + dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b ); + dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta ); + dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); + + #ifdef XSMM + libxsmm_zgemm( &f77_transa, + #else + zgemm_( &f77_transa, + #endif + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } +#endif +#ifdef CBLAS + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = 
bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + float* alphap = bli_obj_buffer( &alpha ); + float* ap = bli_obj_buffer( &a ); + float* bp = bli_obj_buffer( &b ); + float* betap = bli_obj_buffer( &beta ); + float* cp = bli_obj_buffer( &c ); + + cblas_sgemm( cbla_storage, + cbla_transa, + cbla_transb, + mm, + nn, + kk, + *alphap, + ap, lda, + bp, ldb, + *betap, + cp, ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + double* alphap = bli_obj_buffer( &alpha ); + double* ap = bli_obj_buffer( &a ); + double* bp = bli_obj_buffer( &b ); + double* betap = bli_obj_buffer( &beta ); + double* cp = bli_obj_buffer( &c ); + + cblas_dgemm( cbla_storage, + cbla_transa, + cbla_transb, + mm, + nn, + kk, + *alphap, + ap, lda, + bp, ldb, + *betap, + cp, ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + scomplex* alphap = bli_obj_buffer( &alpha ); + scomplex* ap = bli_obj_buffer( &a ); + scomplex* bp = bli_obj_buffer( &b ); + scomplex* betap = bli_obj_buffer( &beta ); + scomplex* cp = bli_obj_buffer( &c ); + + cblas_cgemm( cbla_storage, + cbla_transa, + 
cbla_transb, + mm, + nn, + kk, + alphap, + ap, lda, + bp, ldb, + betap, + cp, ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + #ifdef C_STOR_R + f77_int lda = bli_obj_row_stride( &a ); + f77_int ldb = bli_obj_row_stride( &b ); + f77_int ldc = bli_obj_row_stride( &c ); + #else + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + #endif + dcomplex* alphap = bli_obj_buffer( &alpha ); + dcomplex* ap = bli_obj_buffer( &a ); + dcomplex* bp = bli_obj_buffer( &b ); + dcomplex* betap = bli_obj_buffer( &beta ); + dcomplex* cp = bli_obj_buffer( &c ); + + cblas_zgemm( cbla_storage, + cbla_transa, + cbla_transb, + mm, + nn, + kk, + alphap, + ap, lda, + bp, ldb, + betap, + cp, ldc ); + } +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + double gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + + printf( "data_%s_%cgemm_%c%c_%s", THR_STR, dt_ch, + transal, transbl, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )n, + ( unsigned long )k, gflops ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/sup/old/supst/Makefile b/test/sup/old/supst/Makefile new file mode 100644 index 000000000..6ab97b06f --- /dev/null +++ b/test/sup/old/supst/Makefile @@ -0,0 +1,496 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2019, Advanced Micro Devices, Inc. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# Makefile +# +# Field G. Van Zee +# +# Makefile for standalone BLIS test drivers. +# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all all-st all-mt \ + blis blis-st blis-mt \ + clean cleanx + + + +# +# --- Determine makefile fragment location ------------------------------------- +# + +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. 
+# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. +ifneq ($(strip $(BLIS_INSTALL_PATH)),) +LIB_PATH := $(BLIS_INSTALL_PATH)/lib +INC_PATH := $(BLIS_INSTALL_PATH)/include/blis +SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis +else +DIST_PATH := ../.. +LIB_PATH = ../../lib/$(CONFIG_NAME) +INC_PATH = ../../include/$(CONFIG_NAME) +SHARE_PATH := ../.. +endif + + + +# +# --- Include common makefile definitions -------------------------------------- +# + +# Include the common makefile fragment. +-include $(SHARE_PATH)/common.mk + + + +# +# --- BLAS and LAPACK implementations ------------------------------------------ +# + +# BLIS library and header path. This is simply wherever it was installed. +#BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib +#BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis + +# BLIS library. +#BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a + +# BLAS library path(s). This is where the BLAS libraries reside. +HOME_LIB_PATH := $(HOME)/flame/lib +MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 + +# netlib BLAS +NETLIB_LIB := $(HOME_LIB_PATH)/libblas.a + +# OpenBLAS +OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a +OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a + +# BLASFEO +BLASFEO_LIB := $(HOME_LIB_PATH)/libblasfeo.a + +# libxsmm +LIBXSMM_LIB := $(HOME_LIB_PATH)/libxsmm.a -ldl \ + $(NETLIB_LIB) -lgfortran + +# ATLAS +ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \ + $(HOME_LIB_PATH)/libatlas.a + +# Eigen +EIGEN_INC := $(HOME)/flame/eigen/include/eigen3 +EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a +EIGENP_LIB := $(EIGEN_LIB) + +# MKL +MKL_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_sequential \ + -lpthread -lm -ldl +MKLP_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_gnu_thread \ + -lpthread -lm -ldl -fopenmp + #-L$(ICC_LIB_PATH) \ + #-lgomp + +VENDOR_LIB := $(MKL_LIB) +VENDORP_LIB := $(MKLP_LIB) + + +# +# --- Problem size definitions 
------------------------------------------------- +# + +# Single core +PS_BEGIN_3L := 2 +PS_MAX_3L := 400 +PS_INC_3L := 2 + +PS_BEGIN_2L := 4 +PS_MAX_2L := 800 +PS_INC_2L := 4 + +PS_BEGIN_1L := 32 +PS_MAX_1L := 6400 +PS_INC_1L := 32 + + +# +# --- General build definitions ------------------------------------------------ +# + +TEST_SRC_PATH := . +TEST_OBJ_PATH := . + +# Gather all local object files. +TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \ + $(TEST_OBJ_PATH)/%.o, \ + $(wildcard $(TEST_SRC_PATH)/*.c))) + +# Override the value of CINCFLAGS so that the value of CFLAGS returned by +# get-user-cflags-for() is not cluttered up with include paths needed only +# while building BLIS. +CINCFLAGS := -I$(INC_PATH) + +# Use the "framework" CFLAGS for the configuration family. +CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME)) + +# Add local header paths to CFLAGS. +CFLAGS += -I$(TEST_SRC_PATH) + +# Locate the libblis library to which we will link. +LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L) + +# Define a set of CFLAGS for use with C++ and Eigen. +CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS)) +CXXFLAGS += -I$(EIGEN_INC) + +# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading. +CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS)) +CXXFLAGS_MT := -march=native $(CXXFLAGS) + +# Single or multithreaded string. +STR_ST := -DTHR_STR=\"st\" +STR_MT := -DTHR_STR=\"mt\" + +# Number of trials per problem size. +N_TRIALS := -DN_TRIALS=3 + +# Problem size specification. +PDEF_ST_1L := -DP_BEGIN=$(PS_BEGIN_1L) -DP_MAX=$(PS_MAX_1L) -DP_INC=$(PS_INC_1L) +PDEF_ST_2L := -DP_BEGIN=$(PS_BEGIN_2L) -DP_MAX=$(PS_MAX_2L) -DP_INC=$(PS_INC_2L) +PDEF_ST_3L := -DP_BEGIN=$(PS_BEGIN_3L) -DP_MAX=$(PS_MAX_3L) -DP_INC=$(PS_INC_3L) + +ifeq ($(E),1) +ERRCHK := -DERROR_CHECK +else +ERRCHK := -DNO_ERROR_CHECK +endif + +# Enumerate possible datatypes and computation precisions. 
+#dts := s d c z +DTS := d + +TRANS := n_n \ + n_t \ + t_n \ + t_t + +# While BLIS supports all combinations of row and column storage for matrices +# C, A, and B, the alternatives mostly only support CBLAS APIs, which inherently +# support only "all row-storage" or "all column-storage". Thus, we disable the +# building of those other drivers so that compilation/linking completes sooner. +#STORS := r_r_r \ +# r_r_c \ +# r_c_r \ +# r_c_c \ +# c_r_r \ +# c_r_c \ +# c_c_r \ +# c_c_c +STORS := r_r_r \ + c_c_c + + +SHAPES := l_l_s \ + l_s_l \ + s_l_l \ + s_s_l \ + s_l_s \ + l_s_s \ + l_l_l + +SMS := 6 +SNS := 8 +SKS := 4 + + +# +# --- Function definitions ----------------------------------------------------- +# + +# A function to strip the underscores from a list of strings. +stripu = $(subst _,,$(1)) + +# Various functions that help us construct the datatype combinations and then +# extract the needed datatype strings and C preprocessor define flags. +get-1of2 = $(word 1,$(subst _, ,$(1))) +get-2of2 = $(word 2,$(subst _, ,$(1))) + +get-1of3 = $(word 1,$(subst _, ,$(1))) +get-2of3 = $(word 2,$(subst _, ,$(1))) +get-3of3 = $(word 3,$(subst _, ,$(1))) + +# A function to return the correct PDEFS_ST variable given the shape string. +get-pdefs = $(strip $(subst l_l_l,$(PDEF_ST_3L), \ + $(subst l_l_s,$(PDEF_ST_2L), \ + $(subst l_s_l,$(PDEF_ST_2L), \ + $(subst s_l_l,$(PDEF_ST_2L), \ + $(subst s_s_l,$(PDEF_ST_1L), \ + $(subst s_l_s,$(PDEF_ST_1L), \ + $(subst l_s_s,$(PDEF_ST_1L),$(1))))))))) + +# Datatype defs. +get-dt-cpp = $(strip \ + $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\ + $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\ + $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\ + -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX)))) + +# Transpose defs. 
+get-tra-defs-a = $(strip $(subst n,-DTRANSA=BLIS_NO_TRANSPOSE -DA_NOTRANS, \ + $(subst t,-DTRANSA=BLIS_TRANSPOSE -DA_TRANS,$(call get-1of2,$(1))))) +get-tra-defs-b = $(strip $(subst n,-DTRANSB=BLIS_NO_TRANSPOSE -DB_NOTRANS, \ + $(subst t,-DTRANSB=BLIS_TRANSPOSE -DB_TRANS,$(call get-2of2,$(1))))) +get-tra-defs = $(call get-tra-defs-a,$(1)) $(call get-tra-defs-b,$(1)) + +# Storage defs. +get-sto-uch-a = $(strip $(subst r,R, \ + $(subst c,C,$(call get-1of3,$(1))))) +get-sto-uch-b = $(strip $(subst r,R, \ + $(subst c,C,$(call get-2of3,$(1))))) +get-sto-uch-c = $(strip $(subst r,R, \ + $(subst c,C,$(call get-3of3,$(1))))) +get-sto-defs = $(strip \ + -DSTOR3=BLIS_$(call get-sto-uch-a,$(1))$(call get-sto-uch-b,$(1))$(call get-sto-uch-c,$(1)) \ + -DA_STOR_$(call get-sto-uch-a,$(1)) \ + -DB_STOR_$(call get-sto-uch-b,$(1)) \ + -DC_STOR_$(call get-sto-uch-c,$(1))) + +# Dimension defs. +get-shape-defs-cm = $(if $(findstring l,$(1)),-DM_DIM=-1,-DM_DIM=$(2)) +get-shape-defs-cn = $(if $(findstring l,$(1)),-DN_DIM=-1,-DN_DIM=$(2)) +get-shape-defs-ck = $(if $(findstring l,$(1)),-DK_DIM=-1,-DK_DIM=$(2)) +get-shape-defs-m = $(call get-shape-defs-cm,$(call get-1of3,$(1)),$(2)) +get-shape-defs-n = $(call get-shape-defs-cn,$(call get-2of3,$(1)),$(2)) +get-shape-defs-k = $(call get-shape-defs-ck,$(call get-3of3,$(1)),$(2)) + +# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk +get-shape-defs = $(strip $(call get-shape-defs-m,$(1),$(2)) \ + $(call get-shape-defs-n,$(1),$(3)) \ + $(call get-shape-defs-k,$(1),$(4))) + +#$(error l_l_s 6 8 4 = $(call get-shape-defs,l_l_s,6,8,4)) + +# Shape-dimension string. 
+get-shape-str-ch = $(if $(findstring l,$(1)),p,$(2)) +get-shape-str-m = $(call get-shape-str-ch,$(call get-1of3,$(1)),$(2)) +get-shape-str-n = $(call get-shape-str-ch,$(call get-2of3,$(1)),$(2)) +get-shape-str-k = $(call get-shape-str-ch,$(call get-3of3,$(1)),$(2)) + +# arguments: 1: shape (w/ underscores) 2: smallm 3: smalln 4: smallk +get-shape-dim-str = m$(call get-shape-str-m,$(1),$(2))n$(call get-shape-str-n,$(1),$(3))k$(call get-shape-str-k,$(1),$(4)) + +# Implementation defs. +# Define a function to return the appropriate -DSTR= and -D[BLIS|BLAS] flags. +get-imp-defs = $(strip $(subst blissup,-DSTR=\"$(1)\" -DBLIS -DSUP, \ + $(subst blislpab,-DSTR=\"$(1)\" -DBLIS, \ + $(subst eigen,-DSTR=\"$(1)\" -DEIGEN, \ + $(subst openblas,-DSTR=\"$(1)\" -DCBLAS, \ + $(subst blasfeo,-DSTR=\"$(1)\" -DCBLAS, \ + $(subst libxsmm,-DSTR=\"$(1)\" -DBLAS -DXSMM, \ + $(subst vendor,-DSTR=\"$(1)\" -DCBLAS,$(1))))))))) + +TRANS0 = $(call stripu,$(TRANS)) +STORS0 = $(call stripu,$(STORS)) + +# Limit BLAS and Eigen to only using all row-stored, or all column-stored matrices. +# Also, limit libxsmm to using all column-stored matrices since it does not offer +# CBLAS interfaces. +BSTORS0 = rrr ccc +ESTORS0 = rrr ccc +XSTORS0 = ccc + + +# +# --- Object and binary file definitions --------------------------------------- +# + +get-st-objs = $(foreach dt,$(1),$(foreach tr,$(2),$(foreach st,$(3),$(foreach sh,$(4),$(foreach sm,$(5),$(foreach sn,$(6),$(foreach sk,$(7),test_$(dt)gemm_$(tr)_$(st)_$(call get-shape-dim-str,$(sh),$(sm),$(sn),$(sk))_$(8)_st.o))))))) + +# Build a list of object files and binaries for each single-threaded +# implementation using the get-st-objs() function defined above. 
+BLISSUP_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blissup) +BLISSUP_ST_BINS := $(patsubst %.o,%.x,$(BLISSUP_ST_OBJS)) + +BLISLPAB_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(STORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blislpab) +BLISLPAB_ST_BINS := $(patsubst %.o,%.x,$(BLISLPAB_ST_OBJS)) + +EIGEN_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(ESTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),eigen) +EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS)) + +OPENBLAS_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),openblas) +OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS)) + +BLASFEO_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),blasfeo) +BLASFEO_ST_BINS := $(patsubst %.o,%.x,$(BLASFEO_ST_OBJS)) + +LIBXSMM_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(XSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),libxsmm) +LIBXSMM_ST_BINS := $(patsubst %.o,%.x,$(LIBXSMM_ST_OBJS)) + +VENDOR_ST_OBJS := $(call get-st-objs,$(DTS),$(TRANS0),$(BSTORS0),$(SHAPES),$(SMS),$(SNS),$(SKS),vendor) +VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) + +#$(error "objs = $(EIGEN_ST_BINS)" ) + +# Mark the object files as intermediate so that make will remove them +# automatically after building the binaries on which they depend. 
+.INTERMEDIATE: $(BLISSUP_ST_OBJS) \ + $(BLISLPAB_ST_OBJS) \ + $(EIGEN_ST_OBJS) \ + $(OPENBLAS_ST_OBJS) \ + $(BLASFEO_ST_OBJS) \ + $(LIBXSMM_ST_OBJS) \ + $(VENDOR_ST_OBJS) + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +all: st + +blissup: blissup-st +blislpab: blislpab-st +eigen: eigen-st +openblas: openblas-st +blasfeo: blasfeo-st +libxsmm: libxsmm-st +vendor: vendor-st + +st: blissup-st blislpab-st \ + eigen-st openblas-st blasfeo-st libxsmm-st vendor-st +blis: blissup-st blislpab-st + +blissup-st: $(BLISSUP_ST_BINS) +blislpab-st: $(BLISLPAB_ST_BINS) +eigen-st: $(EIGEN_ST_BINS) +openblas-st: $(OPENBLAS_ST_BINS) +blasfeo-st: $(BLASFEO_ST_BINS) +libxsmm-st: $(LIBXSMM_ST_BINS) +vendor-st: $(VENDOR_ST_BINS) + + +# --Object file rules -- + +# Define the implementations for which we will instantiate compilation rules. +BIMPLS := blissup blislpab openblas blasfeo libxsmm vendor +EIMPLS := eigen + +# 1 2 3 4 567 8 +# test_dgemm_nn_rrr_mpn6kp_blissup_st.x + +# Define the function that will be used to instantiate compilation rules +# for the various implementations. +define make-st-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile + $(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ +endef + +# Instantiate the rule function make-st-rule() for each BLIS/BLAS/CBLAS +# implementation. 
+$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS), \ +$(foreach sn,$(SNS), \ +$(foreach sk,$(SKS), \ +$(foreach impl,$(BIMPLS), \ +$(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + +# Define the function that will be used to instantiate compilation rules +# for the various implementations. +define make-eigst-rule +test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_st.o: test_gemm.c Makefile + $(CXX) $(CXXFLAGS_ST) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_ST) -c $$< -o $$@ +endef + +# Instantiate the rule function make-eigst-rule() for each Eigen implementation. +$(foreach dt,$(DTS), \ +$(foreach tr,$(TRANS), \ +$(foreach st,$(STORS), \ +$(foreach sh,$(SHAPES), \ +$(foreach sm,$(SMS), \ +$(foreach sn,$(SNS), \ +$(foreach sk,$(SKS), \ +$(foreach impl,$(EIMPLS), \ +$(eval $(call make-eigst-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl))))))))))) + + +# -- Executable file rules -- + +# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS +# on the link command line in case BLIS was configured with the BLAS +# compatibility layer. This prevents BLIS from inadvertently getting called +# for the BLAS routines we are trying to test with. 
+ +test_%_blissup_st.x: test_%_blissup_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_blislpab_st.x: test_%_blislpab_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_eigen_st.x: test_%_eigen_st.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_blasfeo_st.x: test_%_blasfeo_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(BLASFEO_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_libxsmm_st.x: test_%_libxsmm_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBXSMM_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_vendor_st.x: test_%_vendor_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +# -- Clean rules -- + +clean: cleanx + +cleanx: + - $(RM_F) *.x *.o + diff --git a/test/sup/old/supst/octave/gen_opsupnames.m b/test/sup/old/supst/octave/gen_opsupnames.m new file mode 100644 index 000000000..b70c8a12a --- /dev/null +++ b/test/sup/old/supst/octave/gen_opsupnames.m @@ -0,0 +1,36 @@ +function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims ) + +nops = size( ops, 1 ); + +smallm = smalldims( 1 ); +smalln = smalldims( 2 ); +smallk = smalldims( 3 ); + +i = 1; + +for io = 1:nops + + op = ops( io, : ); + + opsupnames( i+0, : ) = sprintf( '%s_%s_m%dnpkp', op, stor, smallm ); + opsupnames( i+1, : ) = sprintf( '%s_%s_mpn%dkp', op, stor, smalln ); + opsupnames( i+2, : ) = sprintf( '%s_%s_mpnpk%d', op, stor, smallk ); + opsupnames( i+3, : ) = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk ); + opsupnames( i+4, : ) = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk ); + opsupnames( i+5, : ) = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln ); + opsupnames( i+6, : ) = sprintf( '%s_%s_mpnpkp', op, stor ); + + opnames( i+0, : ) = sprintf( '%s', op ); + opnames( i+1, : ) = sprintf( '%s', op ); + opnames( 
i+2, : ) = sprintf( '%s', op ); + opnames( i+3, : ) = sprintf( '%s', op ); + opnames( i+4, : ) = sprintf( '%s', op ); + opnames( i+5, : ) = sprintf( '%s', op ); + opnames( i+6, : ) = sprintf( '%s', op ); + + i = i + 7; +end + +r_val1 = opsupnames; +r_val2 = opnames; + diff --git a/test/sup/octave/plot_l3sup_perf.m b/test/sup/old/supst/octave/plot_l3sup_perf.m similarity index 86% rename from test/sup/octave/plot_l3sup_perf.m rename to test/sup/old/supst/octave/plot_l3sup_perf.m index bf2910878..ebc5d3000 100644 --- a/test/sup/octave/plot_l3sup_perf.m +++ b/test/sup/old/supst/octave/plot_l3sup_perf.m @@ -11,6 +11,7 @@ function r_val = plot_l3sup_perf( opname, ... cfreq, ... dfps, ... theid, impl ) + %if ... %mod(theid-1,cols) == 2 || ... % ... %mod(theid-1,cols) == 3 || ... % ... %mod(theid-1,cols) == 4 || ... @@ -21,11 +22,11 @@ function r_val = plot_l3sup_perf( opname, ... %end %legend_plot_id = 11; -legend_plot_id = 1*cols + 1*5; +legend_plot_id = 2*cols + 1*5; if 1 -ax1 = subplot( rows, cols, theid ); -hold( ax1, 'on' ); + ax1 = subplot( rows, cols, theid ); + hold( ax1, 'on' ); end % Set line properties. @@ -83,9 +84,9 @@ end flopscol = size( data_blissup, 2 ); msize = 5; if 1 -fontsize = 11; + fontsize = 12; else -fontsize = 16; + fontsize = 16; end linesize = 0.5; legend_loc = 'southeast'; @@ -148,8 +149,7 @@ end vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ... 'Color',color_vend, 'LineStyle',lines_vend, ... 'LineWidth',linesize ); -else -if theid == legend_plot_id +elseif theid == legend_plot_id blissup_ln = line( nan, nan, ... 'Color',color_blissup, 'LineStyle',lines_blissup, ... 'LineWidth',linesize ); @@ -172,7 +172,6 @@ vend_ln = line( nan, nan, ... 'Color',color_vend, 'LineStyle',lines_vend, ... 
'LineWidth',linesize ); end -end xlim( ax1, [x_begin x_end] ); @@ -222,11 +221,11 @@ if show_plot == 1 || theid == legend_plot_id set( leg,'Color','none' ); set( leg,'Units','inches' ); if impl == 'octave' - set( leg,'FontSize',fontsize ); - set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl) + set( leg,'FontSize',fontsize ); + set( leg,'Position',[15.40 4.75 1.9 1.20] ); % (1,4tl) else - set( leg,'FontSize',fontsize-3 ); - set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl) + set( leg,'FontSize',fontsize-3 ); + set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl) end else leg = legend( ... @@ -249,11 +248,11 @@ if show_plot == 1 || theid == legend_plot_id set( leg,'Color','none' ); set( leg,'Units','inches' ); if impl == 'octave' - set( leg,'FontSize',fontsize ); - set( leg,'Position',[11.92 6.54 1.15 0.7 ] ); % (1,4tl) + set( leg,'FontSize',fontsize ); + set( leg,'Position',[15.40 7.65 1.9 1.10] ); % (1,4tl) else - set( leg,'FontSize',fontsize-1 ); - set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl) + set( leg,'FontSize',fontsize-1 ); + set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl) end end set( leg,'Box','off' ); @@ -272,17 +271,31 @@ box( ax1, 'on' ); titl = title( titlename ); set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'. +% The default is to align the plot title across whole figure, not the box. +% This is a hack to nudge the title back to the center of the box. if impl == 'octave' -tpos = get( titl, 'Position' ); % default is to align across whole figure, not box. -tpos(1) = tpos(1) + -40; -set( titl, 'Position', tpos ); % here we nudge it back to centered with box. + tpos = get( titl, 'Position' ); + % For some reason, the titles in the graphs in the last column start + % off in a different relative position than the graphs in the other + % columns. Here, we manually account for that. 
+ if mod(theid-1,cols) == 6 + tpos(1) = tpos(1) + -10; + else + tpos(1) = tpos(1) + -40; + end + set( titl, 'Position', tpos ); + set( titl, 'FontSize', fontsize ); +else % impl == 'matlab' + tpos = get( titl, 'Position' ); + tpos(1) = tpos(1) + 90; + set( titl, 'Position', tpos ); end if theid > (rows-1)*cols -%xlab = xlabel( ax1,xaxisname ); -%tpos = get( xlab, 'Position' ) -%tpos(2) = tpos(2) + 10; -%set( xlab, 'Position', tpos ); + %xlab = xlabel( ax1,xaxisname ); + %tpos = get( xlab, 'Position' ) + %tpos(2) = tpos(2) + 10; + %set( xlab, 'Position', tpos ); if theid == rows*cols - 6 xlab = xlabel( ax1, 'm = 6; n = k' ); elseif theid == rows*cols - 5 @@ -301,14 +314,8 @@ if theid > (rows-1)*cols end if mod(theid-1,cols) == 0 -ylab = ylabel( ax1,yaxisname ); + ylab = ylabel( ax1,yaxisname ); end -%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' ); -%saveas( fig, filename_png ); - -%hold( ax1, 'off' ); - r_val = 0; -end diff --git a/test/sup/old/supst/octave/plot_panel_trxsh.m b/test/sup/old/supst/octave/plot_panel_trxsh.m new file mode 100644 index 000000000..8ba709257 --- /dev/null +++ b/test/sup/old/supst/octave/plot_panel_trxsh.m @@ -0,0 +1,172 @@ +function r_val = plot_panel_trxsh ... + ( ... + cfreq, ... + dflopspercycle, ... + nth, ... + thr_str, ... + dt_ch, ... + stor_str, ... + smalldims, ... + dirpath, ... + arch_str, ... + vend_str, ... + impl ... + ) + +%cfreq = 1.8; +%dflopspercycle = 32; + +% Create filename "templates" for the files that contain the performance +% results. +filetemp_blissup = '%s/output_%s_%s_blissup.m'; +filetemp_blislpab = '%s/output_%s_%s_blislpab.m'; +filetemp_eigen = '%s/output_%s_%s_eigen.m'; +filetemp_open = '%s/output_%s_%s_openblas.m'; +filetemp_bfeo = '%s/output_%s_%s_blasfeo.m'; +filetemp_xsmm = '%s/output_%s_%s_libxsmm.m'; +filetemp_vend = '%s/output_%s_%s_vendor.m'; + +% Create a variable name "template" for the variables contained in the +% files outlined above. 
+vartemp = 'data_%s_%s_%s( :, : )'; + +% Define the datatypes and operations we will be plotting. +oproot = sprintf( '%cgemm', dt_ch ); +ops( 1, : ) = sprintf( '%s_nn', oproot ); +ops( 2, : ) = sprintf( '%s_nt', oproot ); +ops( 3, : ) = sprintf( '%s_tn', oproot ); +ops( 4, : ) = sprintf( '%s_tt', oproot ); + +% Generate datatype-specific operation names from the set of operations +% and datatypes. +[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims ); +n_opsupnames = size( opsupnames, 1 ); + +%opsupnames +%opnames +%return + +if 1 == 1 + %fig = figure('Position', [100, 100, 2400, 1500]); + fig = figure('Position', [100, 100, 2400, 1200]); + orient( fig, 'portrait' ); + set(gcf,'PaperUnits', 'inches'); + if impl == 'matlab' + set(gcf,'PaperSize', [11.5 20.4]); + set(gcf,'PaperPosition', [0 0 11.5 20.4]); + set(gcf,'PaperPositionMode','manual'); + else % impl == 'octave' % octave 4.x + set(gcf,'PaperSize', [12 22.0]); + set(gcf,'PaperPositionMode','auto'); + end + set(gcf,'PaperOrientation','landscape'); +end + + +% Iterate over the list of datatype-specific operation names. +for opi = 1:n_opsupnames +%for opi = 1:1 + + % Grab the current datatype combination. + opsupname = opsupnames( opi, : ); + opname = opnames( opi, : ); + + str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str); + + % Construct filenames for the data files from templates. + file_blissup = sprintf( filetemp_blissup, dirpath, thr_str, opsupname ); + file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname ); + file_eigen = sprintf( filetemp_eigen, dirpath, thr_str, opsupname ); + file_open = sprintf( filetemp_open, dirpath, thr_str, opsupname ); + file_bfeo = sprintf( filetemp_bfeo, dirpath, thr_str, opsupname ); + file_vend = sprintf( filetemp_vend, dirpath, thr_str, opsupname ); + + % Load the data files. 
+ %str = sprintf( ' Loading %s', file_blissup ); disp(str); + run( file_blissup ) + run( file_blislpab ) + run( file_eigen ) + run( file_open ) + run( file_bfeo ) + run( file_vend ) + + % Construct variable names for the variables in the data files. + var_blissup = sprintf( vartemp, thr_str, opname, 'blissup' ); + var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' ); + var_eigen = sprintf( vartemp, thr_str, opname, 'eigen' ); + var_open = sprintf( vartemp, thr_str, opname, 'openblas' ); + var_bfeo = sprintf( vartemp, thr_str, opname, 'blasfeo' ); + var_vend = sprintf( vartemp, thr_str, opname, 'vendor' ); + + % Use eval() to instantiate the variable names constructed above, + % copying each to a simplified name. + data_blissup = eval( var_blissup ); % e.g. data_st_dgemm_blissup( :, : ); + data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : ); + data_eigen = eval( var_eigen ); % e.g. data_st_dgemm_eigen( :, : ); + data_open = eval( var_open ); % e.g. data_st_dgemm_openblas( :, : ); + data_bfeo = eval( var_bfeo ); % e.g. data_st_dgemm_blasfeo( :, : ); + data_vend = eval( var_vend ); % e.g. data_st_dgemm_vendor( :, : ); + + if stor_str == 'ccc' + % Only read xsmm data for the column storage case, since that's the + % only format that libxsmm supports. + file_xsmm = sprintf( filetemp_xsmm, dirpath, thr_str, opsupname ); + run( file_xsmm ) + var_xsmm = sprintf( vartemp, thr_str, opname, 'libxsmm' ); + data_xsmm = eval( var_xsmm ); % e.g. data_st_dgemm_libxsmm( :, : ); + else + % Set the data variable to zeros using the same dimensions as the other + % variables. + data_xsmm = zeros( size( data_blissup, 1 ), ... 
+ size( data_blissup, 2 ) ); + end + %str = sprintf( ' Reading %s', var_blissup ); disp(str); + %str = sprintf( ' Reading %s', var_blislpab ); disp(str); + %str = sprintf( ' Reading %s', var_eigen ); disp(str); + %str = sprintf( ' Reading %s', var_open ); disp(str); + %str = sprintf( ' Reading %s', var_bfeo ); disp(str); + %str = sprintf( ' Reading %s', var_xsmm ); disp(str); + %str = sprintf( ' Reading %s', var_vend ); disp(str); + + % Plot one result in an m x n grid of plots, via the subplot() + % function. + if 1 == 1 + plot_l3sup_perf( opsupname, ... + data_blissup, ... + data_blislpab, ... + data_eigen, ... + data_open, ... + data_bfeo, ... + data_xsmm, ... + data_vend, vend_str, ... + nth, ... + 4, 7, ... + cfreq, ... + dflopspercycle, ... + opi, impl ); + + clear data_st_*gemm_*; + clear data_blissup; + clear data_blislpab; + clear data_eigen; + clear data_open; + clear data_bfeo; + clear data_xsmm; + clear data_vend; + + end + +end + +% Construct the name of the file to which we will output the graph. +outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth ); + +% Output the graph to pdf format. 
+%print(gcf, 'gemm_md','-fillpage','-dpdf'); +%print(gcf, outfile,'-bestfit','-dpdf'); +if impl == 'octave' + print(gcf, outfile); +else % if impl == 'matlab' + print(gcf, outfile,'-bestfit','-dpdf'); +end + diff --git a/test/sup/old/supst/octave/runthese.m b/test/sup/old/supst/octave/runthese.m new file mode 100644 index 000000000..30e3865d1 --- /dev/null +++ b/test/sup/old/supst/octave/runthese.m @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + +% kabylake +plot_panel_trxsh(3.80,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20200302/mnkt100000_st','kbl','MKL','octave'); close; clear all; + +% haswell +plot_panel_trxsh(3.5,16,1,'st','d','rrr',[ 6 8 4 ],'../results/haswell/20200302/mnkt100000_st','has','MKL','octave'); close; clear all; + +% epyc +plot_panel_trxsh(3.00, 8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20200302/mnkt100000_st','epyc','MKL','octave'); close; clear all; diff --git a/test/sup/old/supst/runme.sh b/test/sup/old/supst/runme.sh new file mode 100755 index 000000000..48dacfa3a --- /dev/null +++ b/test/sup/old/supst/runme.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# File pefixes. +exec_root="test" +out_root="output" + +# Placeholder until we add multithreading. +nt=1 + +# Delay between test cases. +delay=0.02 + +# Threadedness to test. +threads="st" + +# Datatypes to test. +#dts="d s" +dts="d" + +# Operations to test. +ops="gemm" + +# Transpose combintions to test. +trans="nn nt tn tt" + +# Storage combinations to test. +#stors="rrr rrc rcr rcc crr crc ccr ccc" +stors="rrr ccc" + +# Problem shapes to test. +shapes="sll lsl lls lss sls ssl lll" + +# FGVZ: figure out how to probe what's in the directory and +# execute everything that's there? +sms="6" +sns="8" +sks="4" + +# Implementations to test. 
+impls="vendor blissup blislpab openblas eigen libxsmm blasfeo" +#impls="vendor" +#impls="blissup" +#impls="blislpab" +#impls="openblas" +#impls="eigen" +#impls="libxsmm" +#impls="blasfeo" + +# Example: test_dgemm_nn_rrc_m6npkp_blissup_st.x + +for th in ${threads}; do + + for dt in ${dts}; do + + for op in ${ops}; do + + for tr in ${trans}; do + + for st in ${stors}; do + + for sh in ${shapes}; do + + for sm in ${sms}; do + + for sn in ${sns}; do + + for sk in ${sks}; do + + for im in ${impls}; do + + # Limit execution of non-BLIS implementations to + # rrr/ccc storage cases. + if [ "${im:0:4}" != "blis" ] && \ + [ "${st}" != "rrr" ] && \ + [ "${st}" != "ccc" ]; then + continue; + fi + + # Further limit execution of libxsmm to + # ccc storage cases. + if [ "${im:0:7}" = "libxsmm" ] && \ + [ "${st}" != "ccc" ]; then + continue; + fi + + # Extract the shape chars for m, n, k. + chm=${sh:0:1} + chn=${sh:1:1} + chk=${sh:2:1} + + # Construct the shape substring (e.g. m6npkp) + shstr="" + + if [ ${chm} = "s" ]; then + shstr="${shstr}m${sm}" + else + shstr="${shstr}mp" + fi + + if [ ${chn} = "s" ]; then + shstr="${shstr}n${sn}" + else + shstr="${shstr}np" + fi + + if [ ${chk} = "s" ]; then + shstr="${shstr}k${sk}" + else + shstr="${shstr}kp" + fi + + # Ex: test_dgemm_nn_rrc_m6npkp_blissup_st.x + + # Construct the name of the test executable. + exec_name="${exec_root}_${dt}${op}_${tr}_${st}_${shstr}_${im}_${th}.x" + + # Construct the name of the output file. + out_file="${out_root}_${th}_${dt}${op}_${tr}_${st}_${shstr}_${im}.m" + + echo "Running (nt = ${nt}) ./${exec_name} > ${out_file}" + + # Run executable. 
+ ./${exec_name} > ${out_file} + + sleep ${delay} + + done + done + done + done + done + done + done + done + done +done + diff --git a/test/supmt/test_gemm.c b/test/sup/old/supst/test_gemm.c similarity index 98% rename from test/supmt/test_gemm.c rename to test/sup/old/supst/test_gemm.c index 95e9d45b2..7f611b554 100644 --- a/test/supmt/test_gemm.c +++ b/test/sup/old/supst/test_gemm.c @@ -48,14 +48,9 @@ int main( int argc, char** argv ) { - rntm_t rntm_g; bli_init(); - // Copy the global rntm_t object in case we need it later when disabling - // sup. - bli_rntm_init_from_global( &rntm_g ); - #ifndef ERROR_CHECK bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); #endif @@ -298,8 +293,7 @@ int main( int argc, char** argv ) &c ); #else // Disable sup and use the expert interface. - //rntm_t rntm = BLIS_RNTM_INITIALIZER; - rntm_t rntm = rntm_g; + rntm_t rntm = BLIS_RNTM_INITIALIZER; bli_rntm_disable_l3_sup( &rntm ); bli_gemm_ex( &alpha, diff --git a/test/sup/runme.sh b/test/sup/runme.sh index 48dacfa3a..911fbbaa4 100755 --- a/test/sup/runme.sh +++ b/test/sup/runme.sh @@ -4,14 +4,41 @@ exec_root="test" out_root="output" -# Placeholder until we add multithreading. -nt=1 +sys="blis" +#sys="lonestar5" +#sys="ul252" +#sys="ul264" + +if [ ${sys} = "blis" ]; then + + export GOMP_CPU_AFFINITY="0-3" + nt=4 + +elif [ ${sys} = "lonestar5" ]; then + + export GOMP_CPU_AFFINITY="0-23" + nt=12 + +elif [ ${sys} = "ul252" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0-51" + nt=26 + +elif [ ${sys} = "ul264" ]; then + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64" + export GOMP_CPU_AFFINITY="0-63" + nt=32 + +fi # Delay between test cases. delay=0.02 # Threadedness to test. -threads="st" +#threads="st mt" +threads="st mt" # Datatypes to test. #dts="d s" @@ -34,17 +61,19 @@ shapes="sll lsl lls lss sls ssl lll" # execute everything that's there? 
sms="6" sns="8" -sks="4" +sks="10" # Implementations to test. -impls="vendor blissup blislpab openblas eigen libxsmm blasfeo" +impls="vendor blissup blislpab openblas eigen" #impls="vendor" #impls="blissup" #impls="blislpab" #impls="openblas" #impls="eigen" -#impls="libxsmm" -#impls="blasfeo" + +# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can +# restore the value. +GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY} # Example: test_dgemm_nn_rrc_m6npkp_blissup_st.x @@ -68,6 +97,44 @@ for th in ${threads}; do for im in ${impls}; do + if [ "${th}" = "mt" ]; then + + # Specify the multithreading depending on which + # implementation is about to be tested. + if [ "${im:0:4}" = "blis" ]; then + unset OMP_NUM_THREADS + export BLIS_NUM_THREADS=${nt} + elif [ "${im}" = "openblas" ]; then + unset OMP_NUM_THREADS + export OPENBLAS_NUM_THREADS=${nt} + elif [ "${im}" = "eigen" ]; then + export OMP_NUM_THREADS=${nt} + elif [ "${im}" = "vendor" ]; then + unset OMP_NUM_THREADS + export MKL_NUM_THREADS=${nt} + fi + export nt_use=${nt} + + else # if [ "${th}" = "st" ]; + + # Use single-threaded execution. + export OMP_NUM_THREADS=1 + export BLIS_NUM_THREADS=1 + export OPENBLAS_NUM_THREADS=1 + export MKL_NUM_THREADS=1 + export nt_use=1 + fi + + # Multithreaded OpenBLAS seems to have a problem + # running properly if GOMP_CPU_AFFINITY is set. + # So we temporarily unset it here if we are about + # to execute OpenBLAS, but otherwise restore it. + if [ ${im} = "openblas" ]; then + unset GOMP_CPU_AFFINITY + else + export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}" + fi + # Limit execution of non-BLIS implementations to # rrr/ccc storage cases. if [ "${im:0:4}" != "blis" ] && \ @@ -117,7 +184,7 @@ for th in ${threads}; do # Construct the name of the output file. out_file="${out_root}_${th}_${dt}${op}_${tr}_${st}_${shstr}_${im}.m" - echo "Running (nt = ${nt}) ./${exec_name} > ${out_file}" + echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}" # Run executable. 
./${exec_name} > ${out_file} diff --git a/test/sup/test_gemm.c b/test/sup/test_gemm.c index 7f611b554..23cc56440 100644 --- a/test/sup/test_gemm.c +++ b/test/sup/test_gemm.c @@ -48,9 +48,17 @@ int main( int argc, char** argv ) { + rntm_t rntm_g; bli_init(); + // Copy the global rntm_t object so that we can use it later when disabling + // sup. Starting with a copy of the global rntm_t is actually necessary; + // if we start off with a locally-initialized rntm_t, it will not contain + // the ways of parallelism that were conveyed via environment variables, + // which is necessary when running this driver with multiple BLIS threads. + bli_rntm_init_from_global( &rntm_g ); + #ifndef ERROR_CHECK bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); #endif @@ -292,8 +300,14 @@ int main( int argc, char** argv ) &beta, &c ); #else + // NOTE: We can't use the static initializer and must instead + // initialize the rntm_t with the copy from the global rntm_t we + // made at the beginning of main(). Please see the comment there + // for more info on why BLIS_RNTM_INITIALIZER doesn't work here. + //rntm_t rntm = BLIS_RNTM_INITIALIZER; + rntm_t rntm = rntm_g; + // Disable sup and use the expert interface. 
- rntm_t rntm = BLIS_RNTM_INITIALIZER; bli_rntm_disable_l3_sup( &rntm ); bli_gemm_ex( &alpha, diff --git a/test/supmt/octave/runthese.m b/test/supmt/octave/runthese.m deleted file mode 100644 index 5946d4796..000000000 --- a/test/supmt/octave/runthese.m +++ /dev/null @@ -1,12 +0,0 @@ - -% haswell -plot_panel_trxsh(3.25,16,1,'mt','d','ccc',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.25,16,1,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20190823/4_800_4_mt201','has','MKL','matlab'); close; clear all; - -% kabylake -plot_panel_trxsh(3.80,16,1,'mt','d','rrr',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.80,16,1,'mt','d','ccc',[ 6 8 10 ],'..','kbl','MKL','matlab'); close; clear all; - -% epyc -plot_panel_trxsh(3.00, 8,1,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all; -plot_panel_trxsh(3.00, 8,1,'mt','d','ccc',[ 6 8 10 ],'../results/epyc/20190826/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;