From b426f9e04e5499c6f9c752e49c33800bfaadda4c Mon Sep 17 00:00:00 2001 From: Nicholai Tukanov Date: Fri, 1 Nov 2019 17:57:03 -0500 Subject: [PATCH] POWER9 DGEMM (#355) Implemented and registered power9 dgemm ukernel. Details: - Implemented 12x6 dgemm microkernel for power9. This microkernel assumes that elements of B have been duplicated/broadcast during the packing step. The microkernel uses a column orientation for its microtile vector registers and thus implements column storage and general stride IO cases. (A row storage IO case via in-register transposition may be added at a future date.) It should be noted that we recommend using this microkernel with gcc and *not* xlc, as issues with the latter cropped up during development, including but not limited to slightly incompatible vector register mnemonics in the GNU extended inline assembly clobber list. --- config/power9/bli_cntx_init_power9.c | 71 +- config/power9/bli_family_power9.h | 32 +- config/power9/make_defs.mk | 24 +- config_registry | 2 +- frame/3/gemm/bli_gemm_front.c | 4 + frame/3/gemm/bli_gemm_ker_var2.c | 2 +- frame/base/bli_arch.c | 8 +- frame/base/bli_cntx.h | 11 - frame/include/bli_arch_config.h | 3 + kernels/haswell/bli_kernels_haswell.h | 2 +- kernels/power9/3/bli_gemm_power9_asm_d12x6.c | 201 +++ kernels/power9/3/bli_pwr9_asm_macros_12x6.h | 1574 ++++++++++++++++++ kernels/power9/bli_kernels_power9.h | 47 + test/3/Makefile | 10 +- test/3/Makefile_cpy1 | 464 ++++++ test/3/runme.sh | 9 +- test/3/test_gemm.c | 836 +++++----- test/Makefile | 37 +- test/output_gemm_blis.m | 0 test/runme.sh | 14 +- testsuite/input.general | 14 +- testsuite/input.operations | 6 +- testsuite/jobscripts/cfig.out | 106 ++ testsuite/jobscripts/cfig.sh | 5 + testsuite/jobscripts/jb-cfig.sh | 26 + testsuite/jobscripts/jb-mk.sh | 26 + testsuite/jobscripts/jb-runtest.sh | 26 + testsuite/jobscripts/mk.out | 9 + testsuite/jobscripts/mk.sh | 6 + testsuite/jobscripts/runtest.sh | 8 + testsuite/src/test_gemm.c | 27 +- 31 files changed, 3049 insertions(+), 561 deletions(-) create mode 100644 kernels/power9/3/bli_gemm_power9_asm_d12x6.c create mode 100644 kernels/power9/3/bli_pwr9_asm_macros_12x6.h create mode 100644 kernels/power9/bli_kernels_power9.h create mode 100644 test/3/Makefile_cpy1 create mode 100644 test/output_gemm_blis.m create mode 100644 testsuite/jobscripts/cfig.out create mode 100755 testsuite/jobscripts/cfig.sh create mode 100644 testsuite/jobscripts/jb-cfig.sh create mode 100644 testsuite/jobscripts/jb-mk.sh create mode 100644 testsuite/jobscripts/jb-runtest.sh create mode 100644 testsuite/jobscripts/mk.out create mode 100755 testsuite/jobscripts/mk.sh create mode 100755 testsuite/jobscripts/runtest.sh diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 410569611..d596ba3ae 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -34,44 +34,55 @@ #include "blis.h" +// Instantiate prototypes for packm kernels. +PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref ) + +// Instantiate prototypes for level-3 kernels. +//GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) + + void bli_cntx_init_power9( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_power9_ref( cntx ); - - // ------------------------------------------------------------------------- - + // Update the context with optimized native gemm micro-kernels and - // their storage preferences. -// bli_cntx_set_l3_nat_ukrs -// ( -// 1, -// BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE, -// cntx -// ); -/* - // Initialize level-3 blocksize objects with architecture-specific values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 4, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 64, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 256, 0, 0 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 4096, 0, 0 ); + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 1, + //BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, + cntx + ); + + // Update the context with optimized packm kernels. + bli_cntx_set_packm_kers + ( + 1, + BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, + cntx + ); + + bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1, + -1, 12, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 ); - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs - ( - BLIS_NAT, 5, - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); -*/ + ( + BLIS_NAT, 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + cntx + ); } diff --git a/config/power9/bli_family_power9.h b/config/power9/bli_family_power9.h index 202f1f854..702e6ad5b 100644 --- a/config/power9/bli_family_power9.h +++ b/config/power9/bli_family_power9.h @@ -32,34 +32,10 @@ */ -//#ifndef BLIS_FAMILY_H -//#define BLIS_FAMILY_H +#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096 +#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096 -//#define BLIS_SIMD_NUM_REGISTERS 32 -//#define BLIS_SIMD_SIZE 64 -// -//#ifdef BLIS_NO_HBWMALLOC -// #include -// #define BLIS_MALLOC_POOL malloc -// #define BLIS_FREE_POOL free -//#else -// #include -// #define BLIS_MALLOC_POOL hbw_malloc -// #define BLIS_FREE_POOL hbw_free -//#endif +#define BLIS_POOL_ADDR_OFFSET_SIZE_A 192 +#define BLIS_POOL_ADDR_OFFSET_SIZE_B 152 -#if 0 -// -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- - -#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4 -#define BLIS_DEFAULT_MR_D 8 -#define BLIS_DEFAULT_NR_D 4 -#define BLIS_DEFAULT_MC_D 64 -#define BLIS_DEFAULT_KC_D 256 -#define BLIS_DEFAULT_NC_D 4096 -#endif - - -//#endif - diff --git a/config/power9/make_defs.mk b/config/power9/make_defs.mk index b2c78b16a..1130f9d94 100644 --- a/config/power9/make_defs.mk +++ b/config/power9/make_defs.mk @@ -1,3 +1,4 @@ + # # # BLIS @@ -45,8 +46,8 @@ THIS_CONFIG := power9 # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. -CPPROCFLAGS := -CMISCFLAGS := -mcpu=power9 +CPPROCFLAGS := +CMISCFLAGS := CPICFLAGS := CWARNFLAGS := @@ -57,28 +58,25 @@ endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else -COPTFLAGS := -O3 -funroll-loops +COPTFLAGS := -O3 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) -CKVECFLAGS := +CKVECFLAGS := -mcpu=power9 -mtune=power9 -DXLC=0 else -$(error gcc is required for this configuration.) +ifeq ($(CC_VENDOR),IBM) +CKVECFLAGS := -qarch=pwr9 -qtune=pwr9 -DXLC=1 +else +$(info $(CC_VENDOR)) +$(error gcc/xlc is required for this configuration.) +endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) -ifeq ($(CC_VENDOR),gcc) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast -else -ifeq ($(CC_VENDOR),clang) -CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast -else CRVECFLAGS := $(CKVECFLAGS) -endif -endif # Store all of the variables here to new variables containing the # configuration name. diff --git a/config_registry b/config_registry index ed859cd3c..acd49f622 100644 --- a/config_registry +++ b/config_registry @@ -39,7 +39,7 @@ cortexa15: cortexa15/armv7a cortexa9: cortexa9/armv7a # IBM architectures. -power9: power9/generic +power9: power9 bgq: bgq # Generic architectures. diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index c3b2c528d..7aebd1ac0 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -66,6 +66,7 @@ void bli_gemm_front #endif #endif + // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c, cntx ); @@ -82,6 +83,7 @@ void bli_gemm_front bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); + #ifdef BLIS_ENABLE_GEMM_MD cntx_t cntx_local; @@ -148,6 +150,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. + if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); @@ -275,6 +278,7 @@ void bli_gemm_front cntl ); + #ifdef BLIS_ENABLE_GEMM_MD #ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM // If we created a temporary matrix conformal to C for whatever reason, diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index e9b43f2ac..1ecb5d714 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -167,7 +167,7 @@ void bli_gemm_ker_var2 // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; + f = ftypes[dt_exec]; // Invoke the function. f( schema_a, diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 09388da0d..5489b22a2 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -141,15 +141,15 @@ void bli_arch_set_id( void ) #endif // IBM microarchitectures. +#ifdef BLIS_FAMILY_POWER9 + id = BLIS_ARCH_POWER9; +#endif #ifdef BLIS_FAMILY_POWER7 id = BLIS_ARCH_POWER7; #endif #ifdef BLIS_FAMILY_BGQ id = BLIS_ARCH_BGQ; #endif -#ifdef BLIS_FAMILY_POWER9 - id = BLIS_ARCH_POWER9; -#endif // Generic microarchitecture. #ifdef BLIS_FAMILY_GENERIC @@ -188,9 +188,9 @@ static char* config_name[ BLIS_NUM_ARCHS ] = "cortexa15", "cortexa9", + "power9", "power7", "bgq", - "power9", "generic" }; diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 798d23000..44bad36e3 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -1,13 +1,10 @@ /* - BLIS An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2019, Advanced Micro Devices, Inc. - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -19,7 +16,6 @@ - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -31,7 +27,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ #ifndef BLIS_CNTX_H @@ -45,28 +40,22 @@ typedef struct cntx_s { blksz_t* blkszs; bszid_t* bmults; - func_t* l3_vir_ukrs; func_t* l3_nat_ukrs; mbool_t* l3_nat_ukrs_prefs; - blksz_t* l3_sup_thresh; void** l3_sup_handlers; blksz_t* l3_sup_blkszs; func_t* l3_sup_kers; mbool_t* l3_sup_kers_prefs; - func_t* l1f_kers; func_t* l1v_kers; - func_t* packm_kers; func_t* unpackm_kers; - ind_t method; pack_t schema_a; pack_t schema_b; pack_t schema_c; - } cntx_t; */ diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 5aced8886..b303dbb14 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -268,6 +268,9 @@ CNTX_INIT_PROTS( generic ) // -- IBM BG/Q -- +#ifdef BLIS_KERNELS_POWER9 +#include "bli_kernels_power9.h" +#endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" #endif diff --git a/kernels/haswell/bli_kernels_haswell.h b/kernels/haswell/bli_kernels_haswell.h index df49a77dd..6a88d8056 100644 --- a/kernels/haswell/bli_kernels_haswell.h +++ b/kernels/haswell/bli_kernels_haswell.h @@ -56,7 +56,7 @@ GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) -// gemm (asm d8x6) +// gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c new file mode 100644 index 000000000..187182a09 --- /dev/null +++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c @@ -0,0 +1,201 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "bli_pwr9_asm_macros_12x6.h" + +void bli_dgemm_power9_asm_12x6 + ( + dim_t k0, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + + uint64_t k_iter = k0 / 16; + uint64_t k_left = k0 % 16; + + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + + __asm__ volatile + ( + " \n\t" + "ld %%r7, %2 \n\t" // load ptr of A + "ld %%r8, %3 \n\t" // load ptr of B + "ld %%r16, %6 \n\t" // load ptr of C + " \n\t" + "ld %%r28, %4 \n\t" // load ptr for alpha + "ld %%r29, %5 \n\t" // load ptr for beta + " \n\t" + "ld %%r11, %0 \n\t" // load k_iter + "ld %%r12, %1 \n\t" // load k_left + " \n\t" + "ld %%r10, %8 \n\t" // load cs_c + "slwi %%r10, %%r10, 3 \n\t" // mul by size of elem + " \n\t" + "ld %%r9, %7 \n\t" // load rs_c + "slwi %%r9, %%r9, 3 \n\t" // mul by size of elem + " \n\t" + "ld %%r26, 0(%%r29) \n\t" // load val of beta + " \n\t" + "lxvdsx %%vs62, 0, %%r28 \n\t" // splat alpha + "lxvdsx %%vs63, 0, %%r29 \n\t" // splat beta + " \n\t" + "add %%r17, %%r16, %%r10 \n\t" // addr of col 1 of C + "add %%r18, %%r17, %%r10 \n\t" // col 2 of C + "add %%r19, %%r18, %%r10 \n\t" // col 3 of C + "add %%r20, %%r19, %%r10 \n\t" // col 4 of C + "add %%r21, %%r20, %%r10 \n\t" // col 5 of C + " \n\t" + DZERO_OUT_VREG + " \n\t" + DPRELOAD + " \n\t" + "addi %%r8, %%r8, 96 \n\t" // move to next col/row of A/B + "addi %%r7, %%r7, 96 \n\t" + " \n\t" + DPREFETCH + " \n\t" + "cmpwi %%r0, %%r11, 0 \n\t" // if k_iter == 0, + "beq %%r0, DCONSIDERKLEFT \n\t" // then jmp to k_left + "mtctr %%r11 \n\t" // else, do k_iter loop + " \n\t" + "DLOOPKITER: \n\t" // k_iter loop + " \n\t" + A_B_PRODUCT_16 // compute A*B + " \n\t" + "bdnz DLOOPKITER \n\t" + " \n\t" + "DCONSIDERKLEFT: \n\t" + " \n\t" + "cmpwi %%r0, %%r12, 0 \n\t" // if k_left == 0, + "beq %%r0, DPOSTACCUM \n\t" // then jmp to post accum + "mtctr %%r12 \n\t" // else, do k_left loop + " \n\t" + "DLOOPKLEFT: \n\t" // k_left loop + " \n\t" + A_B_PRODUCT_1 + " \n\t" + "bdnz DLOOPKLEFT \n\t" + " \n\t" + "DPOSTACCUM: \n\t" + " \n\t" + DSCALE_ALPHA + " \n\t" + "cmpdi %%r0, %%r26, 0 \n\t" // if beta == 0, + "beq %%r0, DBETAZERO \n\t" // then jmp to BZ + " \n\t" + "cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8 + "beq DCOLSTOREDBNZ \n\t" // then jmp to col store + " \n\t" + "DGENSTOREDBNZ: \n\t" // BNZ gen stored case + " \n\t" + DGEN_LOAD_OFS_C + " \n\t" + DGEN_SCALE_BETA + " \n\t" + "b DGENSTORED \n\t" + " \n\t" + "DCOLSTOREDBNZ: \n\t" // BNZ col stored case + " \n\t" + DCOL_SCALE_BETA + " \n\t" + "b DCOLSTORED \n\t" + " \n\t" + "DBETAZERO: \n\t" // BZ case + " \n\t" + "cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8, + "beq DCOLSTORED \n\t" // C is col stored + " \n\t" + "DGENSTORED: \n\t" // BZ gen stored case + " \n\t" + DGEN_LOAD_OFS_C + " \n\t" + DGEN_STORE + " \n\t" + "b DDONE \n\t" + " \n\t" + "DCOLSTORED: \n\t" // BZ col stored case + " \n\t" + DCOL_STORE + " \n\t" + "DDONE: \n\t" + " \n\t" + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + /* unclobberable regs: r2, r3, r4, r5, r6, r13, r14, r15, r30, r31 */ + "r0", "r7", "r8", "r9", + "r10", "r11", "r12", "r16", "r17", "r18", "r19", + "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29" + + #if XLC + ,"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9" + , "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19" + , "f20" ,"f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29" + , "f30" ,"f31" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" + , "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19" + , "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29" + , "v30", "v31" + #else + , "vs0", "vs1", "vs2", "vs3", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9" + , "vs10", "vs11", "vs12", "vs13", "vs14", "vs15", "vs16", "vs17", "vs18", "vs19" + , "vs20", "vs21", "vs22", "vs23", "vs24", "vs25", "vs26", "vs27", "vs28", "vs29" + , "vs30", "vs31", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39" + , "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49" + , "vs50", "vs51", "vs52", "vs53" + #endif + + ); +} diff --git a/kernels/power9/3/bli_pwr9_asm_macros_12x6.h b/kernels/power9/3/bli_pwr9_asm_macros_12x6.h new file mode 100644 index 000000000..f47f82952 --- /dev/null +++ b/kernels/power9/3/bli_pwr9_asm_macros_12x6.h @@ -0,0 +1,1574 @@ + +// MACROS for power9_asm_d12x6 + + +// zero out registers used to store result +#define DZERO_OUT_VREG \ +"xxlxor %%vs0, %%vs0, %%vs0 \n\t" \ +"xxlxor %%vs1, %%vs1, %%vs1 \n\t" \ +"xxlxor %%vs2, %%vs2, %%vs2 \n\t" \ +"xxlxor %%vs3, %%vs3, %%vs3 \n\t" \ +"xxlxor %%vs4, %%vs4, %%vs4 \n\t" \ +"xxlxor %%vs5, %%vs5, %%vs5 \n\t" \ +"xxlxor %%vs6, %%vs6, %%vs6 \n\t" \ +"xxlxor %%vs7, %%vs7, %%vs7 \n\t" \ +"xxlxor %%vs8, %%vs8, %%vs8 \n\t" \ +"xxlxor %%vs9, %%vs9, %%vs9 \n\t" \ +"xxlxor %%vs10, %%vs10, %%vs10 \n\t" \ +"xxlxor %%vs11, %%vs11, %%vs11 \n\t" \ +"xxlxor %%vs12, %%vs12, %%vs12 \n\t" \ +"xxlxor %%vs13, %%vs13, %%vs13 \n\t" \ +"xxlxor %%vs14, %%vs14, %%vs14 \n\t" \ +"xxlxor %%vs15, %%vs15, %%vs15 \n\t" \ +"xxlxor %%vs16, %%vs16, %%vs16 \n\t" \ +"xxlxor %%vs17, %%vs17, %%vs17 \n\t" \ +"xxlxor %%vs18, %%vs18, %%vs18 \n\t" \ +"xxlxor %%vs19, %%vs19, %%vs19 \n\t" \ +"xxlxor %%vs20, %%vs20, %%vs20 \n\t" \ +"xxlxor %%vs21, %%vs21, %%vs21 \n\t" \ +"xxlxor %%vs22, %%vs22, %%vs22 \n\t" \ +"xxlxor %%vs23, %%vs23, %%vs23 \n\t" \ +"xxlxor %%vs24, %%vs24, %%vs24 \n\t" \ +"xxlxor %%vs25, %%vs25, %%vs25 \n\t" \ +"xxlxor %%vs26, %%vs26, %%vs26 \n\t" \ +"xxlxor %%vs27, %%vs27, %%vs27 \n\t" \ +"xxlxor %%vs28, %%vs28, %%vs28 \n\t" \ +"xxlxor %%vs29, %%vs29, %%vs29 \n\t" \ +"xxlxor %%vs30, %%vs30, %%vs30 \n\t" \ +"xxlxor %%vs31, %%vs31, %%vs31 \n\t" \ +"xxlxor %%vs32, %%vs32, %%vs32 \n\t" \ +"xxlxor %%vs33, %%vs33, %%vs33 \n\t" \ +"xxlxor %%vs34, %%vs34, %%vs34 \n\t" \ +"xxlxor %%vs35, %%vs35, %%vs35 \n\t" + +#define DPREFETCH \ +"dcbt 0, %%r16 \n\t" \ +"dcbt 0, %%r17 \n\t" \ +"dcbt 0, %%r18 \n\t" \ +"dcbt 0, %%r19 \n\t" \ +"dcbt 0, %%r20 \n\t" \ +"dcbt 0, %%r21 \n\t" + +// preload col/row of A/B +#define DPRELOAD \ +"lxv %%vs36, 0(%%r7) \n\t" \ +"lxv %%vs37, 16(%%r7) \n\t" \ +"lxv %%vs38, 32(%%r7) \n\t" \ +"lxv %%vs39, 48(%%r7) \n\t" \ +"lxv %%vs40, 64(%%r7) \n\t" \ +"lxv %%vs41, 80(%%r7) \n\t" \ +" \n\t" \ +"lxv %%vs48, 0(%%r8) \n\t" \ +"lxv %%vs49, 16(%%r8) \n\t" \ +"lxv %%vs50, 32(%%r8) \n\t" \ +"lxv %%vs51, 48(%%r8) \n\t" \ +"lxv %%vs52, 64(%%r8) \n\t" \ +"lxv %%vs53, 80(%%r8) \n\t" + +// compute AB product +// unrolled by 16 +#define A_B_PRODUCT_16 \ +" \n\t" \ +"lxv %%vs42, 0(%%r7) \n\t" \ +"lxv %%vs43, 16(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 0(%%r8) \n\t" \ +"lxv %%vs55, 16(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 32(%%r7) \n\t" \ +"lxv %%vs45, 48(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 32(%%r8) \n\t" \ +"lxv %%vs57, 48(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 64(%%r7) \n\t" \ +"lxv %%vs47, 80(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 64(%%r8) \n\t" \ +"lxv %%vs59, 80(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs36, 96(%%r7) \n\t" \ +"lxv %%vs37, 112(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"lxv %%vs48, 96(%%r8) \n\t" \ +"lxv %%vs49, 112(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"lxv %%vs38, 128(%%r7) \n\t" \ +"lxv %%vs39, 144(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"lxv %%vs50, 128(%%r8) \n\t" \ +"lxv %%vs51, 144(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"lxv %%vs40, 160(%%r7) \n\t" \ +"lxv %%vs41, 176(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"lxv %%vs52, 160(%%r8) \n\t" \ +"lxv %%vs53, 176(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs42, 192(%%r7) \n\t" \ +"lxv %%vs43, 208(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 192(%%r8) \n\t" \ +"lxv %%vs55, 208(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 224(%%r7) \n\t" \ +"lxv %%vs45, 240(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 224(%%r8) \n\t" \ +"lxv %%vs57, 240(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 256(%%r7) \n\t" \ +"lxv %%vs47, 272(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 256(%%r8) \n\t" \ +"lxv %%vs59, 272(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs36, 288(%%r7) \n\t" \ +"lxv %%vs37, 304(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"lxv %%vs48, 288(%%r8) \n\t" \ +"lxv %%vs49, 304(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"lxv %%vs38, 320(%%r7) \n\t" \ +"lxv %%vs39, 336(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"lxv %%vs50, 320(%%r8) \n\t" \ +"lxv %%vs51, 336(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"lxv %%vs40, 352(%%r7) \n\t" \ +"lxv %%vs41, 368(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"lxv %%vs52, 352(%%r8) \n\t" \ +"lxv %%vs53, 368(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs42, 384(%%r7) \n\t" \ +"lxv %%vs43, 400(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 384(%%r8) \n\t" \ +"lxv %%vs55, 400(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 416(%%r7) \n\t" \ +"lxv %%vs45, 432(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 416(%%r8) \n\t" \ +"lxv %%vs57, 432(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 448(%%r7) \n\t" \ +"lxv %%vs47, 464(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 448(%%r8) \n\t" \ +"lxv %%vs59, 464(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs36, 480(%%r7) \n\t" \ +"lxv %%vs37, 496(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"lxv %%vs48, 480(%%r8) \n\t" \ +"lxv %%vs49, 496(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"lxv %%vs38, 512(%%r7) \n\t" \ +"lxv %%vs39, 528(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"lxv %%vs50, 512(%%r8) \n\t" \ +"lxv %%vs51, 528(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"lxv %%vs40, 544(%%r7) \n\t" \ +"lxv %%vs41, 560(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"lxv %%vs52, 544(%%r8) \n\t" \ +"lxv %%vs53, 560(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs42, 576(%%r7) \n\t" \ +"lxv %%vs43, 592(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 576(%%r8) \n\t" \ +"lxv %%vs55, 592(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 608(%%r7) \n\t" \ +"lxv %%vs45, 624(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 608(%%r8) \n\t" \ +"lxv %%vs57, 624(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 640(%%r7) \n\t" \ +"lxv %%vs47, 656(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 640(%%r8) \n\t" \ +"lxv %%vs59, 656(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs36, 672(%%r7) \n\t" \ +"lxv %%vs37, 688(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"lxv %%vs48, 672(%%r8) \n\t" \ +"lxv %%vs49, 688(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"lxv %%vs38, 704(%%r7) \n\t" \ +"lxv %%vs39, 720(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"lxv %%vs50, 704(%%r8) \n\t" \ +"lxv %%vs51, 720(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"lxv %%vs40, 736(%%r7) \n\t" \ +"lxv %%vs41, 752(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"lxv %%vs52, 736(%%r8) \n\t" \ +"lxv %%vs53, 752(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs42, 768(%%r7) \n\t" \ +"lxv %%vs43, 784(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 768(%%r8) \n\t" \ +"lxv %%vs55, 784(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 800(%%r7) \n\t" \ +"lxv %%vs45, 816(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 800(%%r8) \n\t" \ +"lxv %%vs57, 816(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 832(%%r7) \n\t" \ +"lxv %%vs47, 848(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 832(%%r8) \n\t" \ +"lxv %%vs59, 848(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs36, 864(%%r7) \n\t" \ +"lxv %%vs37, 880(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"lxv %%vs48, 864(%%r8) \n\t" \ +"lxv %%vs49, 880(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"lxv %%vs38, 896(%%r7) \n\t" \ +"lxv %%vs39, 912(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"lxv %%vs50, 896(%%r8) \n\t" \ +"lxv %%vs51, 912(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"lxv %%vs40, 928(%%r7) \n\t" \ +"lxv %%vs41, 944(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"lxv %%vs52, 928(%%r8) \n\t" \ +"lxv %%vs53, 944(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs42, 960(%%r7) \n\t" \ +"lxv %%vs43, 976(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 960(%%r8) \n\t" \ +"lxv %%vs55, 976(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 992(%%r7) \n\t" \ +"lxv %%vs45, 1008(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 992(%%r8) \n\t" \ +"lxv %%vs57, 1008(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 1024(%%r7) \n\t" \ +"lxv %%vs47, 1040(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 1024(%%r8) \n\t" \ +"lxv %%vs59, 1040(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs36, 1056(%%r7) \n\t" \ +"lxv %%vs37, 1072(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"lxv %%vs48, 1056(%%r8) \n\t" \ +"lxv %%vs49, 1072(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"lxv %%vs38, 1088(%%r7) \n\t" \ +"lxv %%vs39, 1104(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"lxv %%vs50, 1088(%%r8) \n\t" \ +"lxv %%vs51, 1104(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"lxv %%vs40, 1120(%%r7) \n\t" \ +"lxv %%vs41, 1136(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"lxv %%vs52, 1120(%%r8) \n\t" \ +"lxv %%vs53, 1136(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs42, 1152(%%r7) \n\t" \ +"lxv %%vs43, 1168(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 1152(%%r8) \n\t" \ +"lxv %%vs55, 1168(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 1184(%%r7) \n\t" \ +"lxv %%vs45, 1200(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 1184(%%r8) \n\t" \ +"lxv %%vs57, 1200(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 1216(%%r7) \n\t" \ +"lxv %%vs47, 1232(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 1216(%%r8) \n\t" \ +"lxv %%vs59, 1232(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs36, 1248(%%r7) \n\t" \ +"lxv %%vs37, 1264(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"lxv %%vs48, 1248(%%r8) \n\t" \ +"lxv %%vs49, 1264(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"lxv %%vs38, 1280(%%r7) \n\t" \ +"lxv %%vs39, 1296(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"lxv %%vs50, 1280(%%r8) \n\t" \ +"lxv %%vs51, 1296(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"lxv %%vs40, 1312(%%r7) \n\t" \ +"lxv %%vs41, 1328(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"lxv %%vs52, 1312(%%r8) \n\t" \ +"lxv %%vs53, 1328(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"lxv %%vs42, 1344(%%r7) \n\t" \ +"lxv %%vs43, 1360(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +"lxv %%vs54, 1344(%%r8) \n\t" \ +"lxv %%vs55, 1360(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +"lxv %%vs44, 1376(%%r7) \n\t" \ +"lxv %%vs45, 1392(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +"lxv %%vs56, 1376(%%r8) \n\t" \ +"lxv %%vs57, 1392(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +"lxv %%vs46, 1408(%%r7) \n\t" \ +"lxv %%vs47, 1424(%%r7) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +"lxv %%vs58, 1408(%%r8) \n\t" \ +"lxv %%vs59, 1424(%%r8) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs42, %%vs54 \n\t" \ +"xvmaddadp %%vs1, %%vs43, %%vs54 \n\t" \ +"xvmaddadp %%vs2, %%vs44, %%vs54 \n\t" \ +"xvmaddadp %%vs3, %%vs45, %%vs54 \n\t" \ +"xvmaddadp %%vs4, %%vs46, %%vs54 \n\t" \ +"xvmaddadp %%vs5, %%vs47, %%vs54 \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs55 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs55 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs55 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs55 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs55 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs55 \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs42, %%vs56 \n\t" \ +"xvmaddadp %%vs13, %%vs43, %%vs56 \n\t" \ +"xvmaddadp %%vs14, %%vs44, %%vs56 \n\t" \ +"xvmaddadp %%vs15, %%vs45, %%vs56 \n\t" \ +"xvmaddadp %%vs16, %%vs46, %%vs56 \n\t" \ +"xvmaddadp %%vs17, %%vs47, %%vs56 \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs42, %%vs57 \n\t" \ +"xvmaddadp %%vs19, %%vs43, %%vs57 \n\t" \ +"xvmaddadp %%vs20, %%vs44, %%vs57 \n\t" \ +"xvmaddadp %%vs21, %%vs45, %%vs57 \n\t" \ +"xvmaddadp %%vs22, %%vs46, %%vs57 \n\t" \ +"xvmaddadp %%vs23, %%vs47, %%vs57 \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs58 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs58 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs58 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs58 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs58 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs58 \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs42, %%vs59 \n\t" \ +"xvmaddadp %%vs31, %%vs43, %%vs59 \n\t" \ +"xvmaddadp %%vs32, %%vs44, %%vs59 \n\t" \ +"xvmaddadp %%vs33, %%vs45, %%vs59 \n\t" \ +"xvmaddadp %%vs34, %%vs46, %%vs59 \n\t" \ +"xvmaddadp %%vs35, %%vs47, %%vs59 \n\t" \ +" \n\t" \ +"lxv %%vs36, 1440(%%r7) \n\t" \ +"lxv %%vs37, 1456(%%r7) \n\t" \ +" \n\t" \ +"lxv %%vs48, 1440(%%r8) \n\t" \ +"lxv %%vs49, 1456(%%r8) \n\t" \ +" \n\t" \ +"lxv %%vs38, 1472(%%r7) \n\t" \ +"lxv %%vs39, 1488(%%r7) \n\t" \ +" \n\t" \ +"lxv %%vs50, 1472(%%r8) \n\t" \ +"lxv %%vs51, 1488(%%r8) \n\t" \ +" \n\t" \ +"lxv %%vs40, 1504(%%r7) \n\t" \ +"lxv %%vs41, 1520(%%r7) \n\t" \ +" \n\t" \ +"lxv %%vs52, 1504(%%r8) \n\t" \ +"lxv %%vs53, 1520(%%r8) \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +" \n\t" \ +"addi %%r8, %%r8, 1536 \n\t" \ +"addi %%r7, %%r7, 1536 \n\t" + +// compute AB product +// no unrolling +#define A_B_PRODUCT_1 \ +" \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs48 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs48 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs48 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs48 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs48 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs48 \n\t" \ +" \n\t" \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs49 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs49 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs49 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs49 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs49 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs49 \n\t" \ +" \n\t" \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs50 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs50 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs50 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs50 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs50 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs50 \n\t" \ +" \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs51 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs51 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs51 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs51 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs51 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs51 \n\t" \ +" \n\t" \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs52 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs52 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs52 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs52 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs52 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs52 \n\t" \ +" \n\t" \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs53 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs53 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs53 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs53 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs53 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs53 \n\t" \ +" \n\t" \ +"lxv %%vs48, 0(%%r8) \n\t" \ +"lxv %%vs49, 16(%%r8) \n\t" \ +"lxv %%vs50, 32(%%r8) \n\t" \ +"lxv %%vs51, 48(%%r8) \n\t" \ +"lxv %%vs52, 64(%%r8) \n\t" \ +"lxv %%vs53, 80(%%r8) \n\t" \ +" \n\t" \ +"lxv %%vs36, 0(%%r7) \n\t" \ +"lxv %%vs37, 16(%%r7) \n\t" \ +"lxv %%vs38, 32(%%r7) \n\t" \ +"lxv %%vs39, 48(%%r7) \n\t" \ +"lxv %%vs40, 64(%%r7) \n\t" \ +"lxv %%vs41, 80(%%r7) \n\t" \ +" \n\t" \ +"addi %%r8, %%r8, 96 \n\t" \ +"addi %%r7, %%r7, 96 \n\t" \ + +// scale AB product by alpha +#define DSCALE_ALPHA \ +"xvmuldp %%vs0, %%vs0, %%vs62 \n\t" \ +"xvmuldp %%vs1, %%vs1, %%vs62 \n\t" \ +"xvmuldp %%vs2, %%vs2, %%vs62 \n\t" \ +"xvmuldp %%vs3, %%vs3, %%vs62 \n\t" \ +"xvmuldp %%vs4, %%vs4, %%vs62 \n\t" \ +"xvmuldp %%vs5, %%vs5, %%vs62 \n\t" \ +"xvmuldp %%vs6, %%vs6, %%vs62 \n\t" \ +"xvmuldp %%vs7, %%vs7, %%vs62 \n\t" \ +"xvmuldp %%vs8, %%vs8, %%vs62 \n\t" \ +"xvmuldp %%vs9, %%vs9, %%vs62 \n\t" \ +"xvmuldp %%vs10, %%vs10, %%vs62 \n\t" \ +"xvmuldp %%vs11, %%vs11, %%vs62 \n\t" \ +"xvmuldp %%vs12, %%vs12, %%vs62 \n\t" \ +"xvmuldp %%vs13, %%vs13, %%vs62 \n\t" \ +"xvmuldp %%vs14, %%vs14, %%vs62 \n\t" \ +"xvmuldp %%vs15, %%vs15, %%vs62 \n\t" \ +"xvmuldp %%vs16, %%vs16, %%vs62 \n\t" \ +"xvmuldp %%vs17, %%vs17, %%vs62 \n\t" \ +"xvmuldp %%vs18, %%vs18, %%vs62 \n\t" \ +"xvmuldp %%vs19, %%vs19, %%vs62 \n\t" \ +"xvmuldp %%vs20, %%vs20, %%vs62 \n\t" \ +"xvmuldp %%vs21, %%vs21, %%vs62 \n\t" \ +"xvmuldp %%vs22, %%vs22, %%vs62 \n\t" \ +"xvmuldp %%vs23, %%vs23, %%vs62 \n\t" \ +"xvmuldp %%vs24, %%vs24, %%vs62 \n\t" \ +"xvmuldp %%vs25, %%vs25, %%vs62 \n\t" \ +"xvmuldp %%vs26, %%vs26, %%vs62 \n\t" \ +"xvmuldp %%vs27, %%vs27, %%vs62 \n\t" \ +"xvmuldp %%vs28, %%vs28, %%vs62 \n\t" \ +"xvmuldp %%vs29, %%vs29, %%vs62 \n\t" \ +"xvmuldp %%vs30, %%vs30, %%vs62 \n\t" \ +"xvmuldp %%vs31, %%vs31, %%vs62 \n\t" \ +"xvmuldp %%vs32, %%vs32, %%vs62 \n\t" \ +"xvmuldp %%vs33, %%vs33, %%vs62 \n\t" \ +"xvmuldp %%vs34, %%vs34, %%vs62 \n\t" \ +"xvmuldp %%vs35, %%vs35, %%vs62 \n\t" + +// initialize offset registers used for gen stored cases +#define DGEN_LOAD_OFS_C \ +"ld %%r22, %6 \n\t" \ +"slwi %%r12, %%r9, 1 \n\t" \ +"add %%r23, %%r22, %%r12 \n\t" \ +"add %%r24, %%r23, %%r12 \n\t" \ +"add %%r25, %%r24, %%r12 \n\t" \ +"add %%r26, %%r25, %%r12 \n\t" \ +"add %%r27, %%r26, %%r12 \n\t" + +// load C into registers +// assume C is gen stored +#define DGEN_LOAD_C \ +"lxsdx %%vs36, %%r9, %%r22 \n\t" \ +"lxsdx %%vs37, 0, %%r22 \n\t" \ +"xxpermdi %%vs36, %%vs36, %%vs37, 0 \n\t" \ +"lxsdx %%vs37, %%r9, %%r23 \n\t" \ +"lxsdx %%vs38, 0, %%r23 \n\t" \ +"xxpermdi %%vs37, %%vs37, %%vs38, 0 \n\t" \ +"lxsdx %%vs38, %%r9, %%r24 \n\t" \ +"lxsdx %%vs39, 0, %%r24 \n\t" \ +"xxpermdi %%vs38, %%vs38, %%vs39, 0 \n\t" \ +"lxsdx %%vs39, %%r9, %%r25 \n\t" \ +"lxsdx %%vs40, 0, %%r25 \n\t" \ +"xxpermdi %%vs39, %%vs39, %%vs40, 0 \n\t" \ +"lxsdx %%vs40, %%r9, %%r26 \n\t" \ +"lxsdx %%vs41, 0, %%r26 \n\t" \ +"xxpermdi %%vs40, %%vs40, %%vs41, 0 \n\t" \ +"lxsdx %%vs41, %%r9, %%r27 \n\t" \ +"lxsdx %%vs42, 0, %%r27 \n\t" \ +"xxpermdi %%vs41, %%vs41, %%vs42, 0 \n\t" + +// increment offset registers to the next col +#define DGEN_NEXT_COL_CMATRIX \ +"add %%r22, %%r22, %%r10 \n\t" \ +"add %%r23, %%r23, %%r10 \n\t" \ +"add %%r24, %%r24, %%r10 \n\t" \ +"add %%r25, %%r25, %%r10 \n\t" \ +"add %%r26, %%r26, %%r10 \n\t" \ +"add %%r27, %%r27, %%r10 \n\t" + +// load C into registers and move offset registers to next col +#define DGENLOAD_UPDATE \ +DGEN_LOAD_C \ +DGEN_NEXT_COL_CMATRIX + +// scale C by beta and add it to the AB product +// assume C is gen stored +#define DGEN_SCALE_BETA \ +DGENLOAD_UPDATE \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs63 \n\t" \ +" \n\t" \ +" \n\t" \ +DGENLOAD_UPDATE \ +" \n\t" \ +"xvmaddadp %%vs6, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs7, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs8, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs9, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs10, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs11, %%vs41, %%vs63 \n\t" \ +" \n\t" \ +" \n\t" \ +DGENLOAD_UPDATE \ +" \n\t" \ +"xvmaddadp %%vs12, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs13, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs14, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs15, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs16, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs17, %%vs41, %%vs63 \n\t" \ +" \n\t" \ +" \n\t" \ +DGENLOAD_UPDATE \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs63 \n\t" \ +" \n\t" \ +" \n\t" \ +DGENLOAD_UPDATE \ +" \n\t" \ +"xvmaddadp %%vs24, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs25, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs26, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs27, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs28, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs29, %%vs41, %%vs63 \n\t" \ +" \n\t" \ +" \n\t" \ +DGENLOAD_UPDATE \ +" \n\t" \ +"xvmaddadp %%vs30, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs31, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs32, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs33, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs34, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs35, %%vs41, %%vs63 \n\t" + +// scale C by beta and add it to the AB product +// assume C is col stored +#define DCOL_SCALE_BETA \ +"lxv %%vs36, 0(%%r16) \n\t" \ +"lxv %%vs42, 0(%%r17) \n\t" \ +"lxv %%vs48, 0(%%r18) \n\t" \ +"lxv %%vs41, 80(%%r16) \n\t" \ +"lxv %%vs47, 80(%%r17) \n\t" \ +"lxv %%vs53, 80(%%r18) \n\t" \ +"lxv %%vs37, 16(%%r16) \n\t" \ +"lxv %%vs38, 32(%%r16) \n\t" \ +"lxv %%vs39, 48(%%r16) \n\t" \ +"lxv %%vs40, 64(%%r16) \n\t" \ +"lxv %%vs43, 16(%%r17) \n\t" \ +"lxv %%vs44, 32(%%r17) \n\t" \ +"lxv %%vs45, 48(%%r17) \n\t" \ +"lxv %%vs46, 64(%%r17) \n\t" \ +"lxv %%vs49, 16(%%r18) \n\t" \ +"lxv %%vs50, 32(%%r18) \n\t" \ +"lxv %%vs51, 48(%%r18) \n\t" \ +"lxv %%vs52, 64(%%r18) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs0, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs6, %%vs42, %%vs63 \n\t" \ +"xvmaddadp %%vs12, %%vs48, %%vs63 \n\t" \ +"xvmaddadp %%vs5, %%vs41, %%vs63 \n\t" \ +"xvmaddadp %%vs11, %%vs47, %%vs63 \n\t" \ +"xvmaddadp %%vs17, %%vs53, %%vs63 \n\t" \ +"xvmaddadp %%vs1, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs2, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs3, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs4, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs7, %%vs43, %%vs63 \n\t" \ +"xvmaddadp %%vs8, %%vs44, %%vs63 \n\t" \ +"xvmaddadp %%vs9, %%vs45, %%vs63 \n\t" \ +"xvmaddadp %%vs10, %%vs46, %%vs63 \n\t" \ +"xvmaddadp %%vs13, %%vs49, %%vs63 \n\t" \ +"xvmaddadp %%vs14, %%vs50, %%vs63 \n\t" \ +"xvmaddadp %%vs15, %%vs51, %%vs63 \n\t" \ +"xvmaddadp %%vs16, %%vs52, %%vs63 \n\t" \ +" \n\t" \ +"lxv %%vs36, 0(%%r19) \n\t" \ +"lxv %%vs42, 0(%%r20) \n\t" \ +"lxv %%vs48, 0(%%r21) \n\t" \ +"lxv %%vs41, 80(%%r19) \n\t" \ +"lxv %%vs47, 80(%%r20) \n\t" \ +"lxv %%vs53, 80(%%r21) \n\t" \ +"lxv %%vs37, 16(%%r19) \n\t" \ +"lxv %%vs38, 32(%%r19) \n\t" \ +"lxv %%vs39, 48(%%r19) \n\t" \ +"lxv %%vs40, 64(%%r19) \n\t" \ +"lxv %%vs43, 16(%%r20) \n\t" \ +"lxv %%vs44, 32(%%r20) \n\t" \ +"lxv %%vs45, 48(%%r20) \n\t" \ +"lxv %%vs46, 64(%%r20) \n\t" \ +"lxv %%vs49, 16(%%r21) \n\t" \ +"lxv %%vs50, 32(%%r21) \n\t" \ +"lxv %%vs51, 48(%%r21) \n\t" \ +"lxv %%vs52, 64(%%r21) \n\t" \ +" \n\t" \ +"xvmaddadp %%vs18, %%vs36, %%vs63 \n\t" \ +"xvmaddadp %%vs24, %%vs42, %%vs63 \n\t" \ +"xvmaddadp %%vs30, %%vs48, %%vs63 \n\t" \ +"xvmaddadp %%vs23, %%vs41, %%vs63 \n\t" \ +"xvmaddadp %%vs29, %%vs47, %%vs63 \n\t" \ +"xvmaddadp %%vs35, %%vs53, %%vs63 \n\t" \ +"xvmaddadp %%vs19, %%vs37, %%vs63 \n\t" \ +"xvmaddadp %%vs20, %%vs38, %%vs63 \n\t" \ +"xvmaddadp %%vs21, %%vs39, %%vs63 \n\t" \ +"xvmaddadp %%vs22, %%vs40, %%vs63 \n\t" \ +"xvmaddadp %%vs25, %%vs43, %%vs63 \n\t" \ +"xvmaddadp %%vs26, %%vs44, %%vs63 \n\t" \ +"xvmaddadp %%vs27, %%vs45, %%vs63 \n\t" \ +"xvmaddadp %%vs28, %%vs46, %%vs63 \n\t" \ +"xvmaddadp %%vs31, %%vs49, %%vs63 \n\t" \ +"xvmaddadp %%vs32, %%vs50, %%vs63 \n\t" \ +"xvmaddadp %%vs33, %%vs51, %%vs63 \n\t" \ +"xvmaddadp %%vs34, %%vs52, %%vs63 \n\t" + +// store result into C's memory location +// assume C is gen stored +#define DGEN_STORE \ +" \n\t" \ +"stxsdx %%vs0, %%r9, %%r22 \n\t" \ +"xxswapd %%vs0, %%vs0 \n\t" \ +"stxsdx %%vs0, 0, %%r22 \n\t" \ +"stxsdx %%vs1, %%r9, %%r23 \n\t" \ +"xxswapd %%vs1, %%vs1 \n\t" \ +"stxsdx %%vs1, 0, %%r23 \n\t" \ +"stxsdx %%vs2, %%r9, %%r24 \n\t" \ +"xxswapd %%vs2, %%vs2 \n\t" \ +"stxsdx %%vs2, 0, %%r24 \n\t" \ +"stxsdx %%vs3, %%r9, %%r25 \n\t" \ +"xxswapd %%vs3, %%vs3 \n\t" \ +"stxsdx %%vs3, 0, %%r25 \n\t" \ +"stxsdx %%vs4, %%r9, %%r26 \n\t" \ +"xxswapd %%vs4, %%vs4 \n\t" \ +"stxsdx %%vs4, 0, %%r26 \n\t" \ +"stxsdx %%vs5, %%r9, %%r27 \n\t" \ +"xxswapd %%vs5, %%vs5 \n\t" \ +"stxsdx %%vs5, 0, %%r27 \n\t" \ +" \n\t" \ +DGEN_NEXT_COL_CMATRIX \ +" \n\t" \ +"stxsdx %%vs6, %%r9, %%r22 \n\t" \ +"xxswapd %%vs6, %%vs6 \n\t" \ +"stxsdx %%vs6, 0, %%r22 \n\t" \ +"stxsdx %%vs7, %%r9, %%r23 \n\t" \ +"xxswapd %%vs7, %%vs7 \n\t" \ +"stxsdx %%vs7, 0, %%r23 \n\t" \ +"stxsdx %%vs8, %%r9, %%r24 \n\t" \ +"xxswapd %%vs8, %%vs8 \n\t" \ +"stxsdx %%vs8, 0, %%r24 \n\t" \ +"stxsdx %%vs9, %%r9, %%r25 \n\t" \ +"xxswapd %%vs9, %%vs9 \n\t" \ +"stxsdx %%vs9, 0, %%r25 \n\t" \ +"stxsdx %%vs10, %%r9, %%r26 \n\t" \ +"xxswapd %%vs10, %%vs10 \n\t" \ +"stxsdx %%vs10, 0, %%r26 \n\t" \ +"stxsdx %%vs11, %%r9, %%r27 \n\t" \ +"xxswapd %%vs11, %%vs11 \n\t" \ +"stxsdx %%vs11, 0, %%r27 \n\t" \ +" \n\t" \ +DGEN_NEXT_COL_CMATRIX \ +" \n\t" \ +"stxsdx %%vs12, %%r9, %%r22 \n\t" \ +"xxswapd %%vs12, %%vs12 \n\t" \ +"stxsdx %%vs12, 0, %%r22 \n\t" \ +"stxsdx %%vs13, %%r9, %%r23 \n\t" \ +"xxswapd %%vs13, %%vs13 \n\t" \ +"stxsdx %%vs13, 0, %%r23 \n\t" \ +"stxsdx %%vs14, %%r9, %%r24 \n\t" \ +"xxswapd %%vs14, %%vs14 \n\t" \ +"stxsdx %%vs14, 0, %%r24 \n\t" \ +"stxsdx %%vs15, %%r9, %%r25 \n\t" \ +"xxswapd %%vs15, %%vs15 \n\t" \ +"stxsdx %%vs15, 0, %%r25 \n\t" \ +"stxsdx %%vs16, %%r9, %%r26 \n\t" \ +"xxswapd %%vs16, %%vs16 \n\t" \ +"stxsdx %%vs16, 0, %%r26 \n\t" \ +"stxsdx %%vs17, %%r9, %%r27 \n\t" \ +"xxswapd %%vs17, %%vs17 \n\t" \ +"stxsdx %%vs17, 0, %%r27 \n\t" \ +" \n\t" \ +DGEN_NEXT_COL_CMATRIX \ +" \n\t" \ +"stxsdx %%vs18, %%r9, %%r22 \n\t" \ +"xxswapd %%vs18, %%vs18 \n\t" \ +"stxsdx %%vs18, 0, %%r22 \n\t" \ +"stxsdx %%vs19, %%r9, %%r23 \n\t" \ +"xxswapd %%vs19, %%vs19 \n\t" \ +"stxsdx %%vs19, 0, %%r23 \n\t" \ +"stxsdx %%vs20, %%r9, %%r24 \n\t" \ +"xxswapd %%vs20, %%vs20 \n\t" \ +"stxsdx %%vs20, 0, %%r24 \n\t" \ +"stxsdx %%vs21, %%r9, %%r25 \n\t" \ +"xxswapd %%vs21, %%vs21 \n\t" \ +"stxsdx %%vs21, 0, %%r25 \n\t" \ +"stxsdx %%vs22, %%r9, %%r26 \n\t" \ +"xxswapd %%vs22, %%vs22 \n\t" \ +"stxsdx %%vs22, 0, %%r26 \n\t" \ +"stxsdx %%vs23, %%r9, %%r27 \n\t" \ +"xxswapd %%vs23, %%vs23 \n\t" \ +"stxsdx %%vs23, 0, %%r27 \n\t" \ +" \n\t" \ +DGEN_NEXT_COL_CMATRIX \ +" \n\t" \ +"stxsdx %%vs24, %%r9, %%r22 \n\t" \ +"xxswapd %%vs24, %%vs24 \n\t" \ +"stxsdx %%vs24, 0, %%r22 \n\t" \ +"stxsdx %%vs25, %%r9, %%r23 \n\t" \ +"xxswapd %%vs25, %%vs25 \n\t" \ +"stxsdx %%vs25, 0, %%r23 \n\t" \ +"stxsdx %%vs26, %%r9, %%r24 \n\t" \ +"xxswapd %%vs26, %%vs26 \n\t" \ +"stxsdx %%vs26, 0, %%r24 \n\t" \ +"stxsdx %%vs27, %%r9, %%r25 \n\t" \ +"xxswapd %%vs27, %%vs27 \n\t" \ +"stxsdx %%vs27, 0, %%r25 \n\t" \ +"stxsdx %%vs28, %%r9, %%r26 \n\t" \ +"xxswapd %%vs28, %%vs28 \n\t" \ +"stxsdx %%vs28, 0, %%r26 \n\t" \ +"stxsdx %%vs29, %%r9, %%r27 \n\t" \ +"xxswapd %%vs29, %%vs29 \n\t" \ +"stxsdx %%vs29, 0, %%r27 \n\t" \ +" \n\t" \ +DGEN_NEXT_COL_CMATRIX \ +" \n\t" \ +"stxsdx %%vs30, %%r9, %%r22 \n\t" \ +"xxswapd %%vs30, %%vs30 \n\t" \ +"stxsdx %%vs30, 0, %%r22 \n\t" \ +"stxsdx %%vs31, %%r9, %%r23 \n\t" \ +"xxswapd %%vs31, %%vs31 \n\t" \ +"stxsdx %%vs31, 0, %%r23 \n\t" \ +"stxsdx %%vs32, %%r9, %%r24 \n\t" \ +"xxswapd %%vs32, %%vs32 \n\t" \ +"stxsdx %%vs32, 0, %%r24 \n\t" \ +"stxsdx %%vs33, %%r9, %%r25 \n\t" \ +"xxswapd %%vs33, %%vs33 \n\t" \ +"stxsdx %%vs33, 0, %%r25 \n\t" \ +"stxsdx %%vs34, %%r9, %%r26 \n\t" \ +"xxswapd %%vs34, %%vs34 \n\t" \ +"stxsdx %%vs34, 0, %%r26 \n\t" \ +"stxsdx %%vs35, %%r9, %%r27 \n\t" \ +"xxswapd %%vs35, %%vs35 \n\t" \ +"stxsdx %%vs35, 0, %%r27 \n\t" + +// store result into C's memory location +// assume C is col stored +#define DCOL_STORE \ +"stxv %%vs0, 0(%%r16) \n\t" \ +"stxv %%vs1, 16(%%r16) \n\t" \ +"stxv %%vs2, 32(%%r16) \n\t" \ +"stxv %%vs3, 48(%%r16) \n\t" \ +"stxv %%vs4, 64(%%r16) \n\t" \ +"stxv %%vs5, 80(%%r16) \n\t" \ +"stxv %%vs6, 0(%%r17) \n\t" \ +"stxv %%vs7, 16(%%r17) \n\t" \ +"stxv %%vs8, 32(%%r17) \n\t" \ +"stxv %%vs9, 48(%%r17) \n\t" \ +"stxv %%vs10, 64(%%r17) \n\t" \ +"stxv %%vs11, 80(%%r17) \n\t" \ +"stxv %%vs12, 0(%%r18) \n\t" \ +"stxv %%vs13, 16(%%r18) \n\t" \ +"stxv %%vs14, 32(%%r18) \n\t" \ +"stxv %%vs15, 48(%%r18) \n\t" \ +"stxv %%vs16, 64(%%r18) \n\t" \ +"stxv %%vs17, 80(%%r18) \n\t" \ +"stxv %%vs18, 0(%%r19) \n\t" \ +"stxv %%vs19, 16(%%r19) \n\t" \ +"stxv %%vs20, 32(%%r19) \n\t" \ +"stxv %%vs21, 48(%%r19) \n\t" \ +"stxv %%vs22, 64(%%r19) \n\t" \ +"stxv %%vs23, 80(%%r19) \n\t" \ +"stxv %%vs24, 0(%%r20) \n\t" \ +"stxv %%vs25, 16(%%r20) \n\t" \ +"stxv %%vs26, 32(%%r20) \n\t" \ +"stxv %%vs27, 48(%%r20) \n\t" \ +"stxv %%vs28, 64(%%r20) \n\t" \ +"stxv %%vs29, 80(%%r20) \n\t" \ +"stxv %%vs30, 0(%%r21) \n\t" \ +"stxv %%vs31, 16(%%r21) \n\t" \ +"stxv %%vs32, 32(%%r21) \n\t" \ +"stxv %%vs33, 48(%%r21) \n\t" \ +"stxv %%vs34, 64(%%r21) \n\t" \ +"stxv %%vs35, 80(%%r21) \n\t" + diff --git a/kernels/power9/bli_kernels_power9.h b/kernels/power9/bli_kernels_power9.h new file mode 100644 index 000000000..72391a837 --- /dev/null +++ b/kernels/power9/bli_kernels_power9.h @@ -0,0 +1,47 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// -- level-3 -- + +// gemm (asm d12x6) +GEMM_UKR_PROT( double, d, gemm_power9_asm_12x6 ) + +// gemm (asm d18x4) +GEMM_UKR_PROT( double, d, gemm_power9_asm_18x4 ) + +// gemm (asm d4x16) +GEMM_UKR_PROT( double, d, gemm_power9_asm_16x4 ) + +// gemm (asm d4x16) +GEMM_UKR_PROT( double, d, gemm_power9_asm_4x16 ) \ No newline at end of file diff --git a/test/3/Makefile b/test/3/Makefile index 38d915721..90ba6d901 100644 --- a/test/3/Makefile +++ b/test/3/Makefile @@ -130,9 +130,9 @@ VENDORP_LIB := $(MKLP_LIB) # # Single core (single-threaded) -PS_BEGIN := 48 -PS_MAX := 2400 -PS_INC := 48 +PS_BEGIN := 100 +PS_MAX := 1000 +PS_INC := 100 # Single-socket (multithreaded) P1_BEGIN := 96 @@ -242,8 +242,8 @@ blis-2s: blis-nat-2s blis-nat: blis-nat-st blis-nat-1s blis-nat-2s # Define the datatypes, operations, and implementations. -DTS := s d c z -OPS := gemm hemm herk trmm trsm +DTS := d # s d c z +OPS := gemm # hemm herk trmm trsm BIMPLS := asm_blis openblas vendor EIMPLS := eigen diff --git a/test/3/Makefile_cpy1 b/test/3/Makefile_cpy1 new file mode 100644 index 000000000..cfbce1abd --- /dev/null +++ b/test/3/Makefile_cpy1 @@ -0,0 +1,464 @@ +#!/bin/bash +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2014, The University of Texas at Austin +# Copyright (C) 2018, Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +# +# Makefile +# +# Field G. Van Zee +# +# Makefile for standalone BLIS test drivers. +# + +# +# --- Makefile PHONY target definitions ---------------------------------------- +# + +.PHONY: all \ + clean cleanx + + + +# +# --- Determine makefile fragment location ------------------------------------- +# + +# Comments: +# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given. +# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in +# the second case because CONFIG_NAME is not yet set. +ifneq ($(strip $(BLIS_INSTALL_PATH)),) +LIB_PATH := $(BLIS_INSTALL_PATH)/lib +INC_PATH := $(BLIS_INSTALL_PATH)/include/blis +SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis +else +DIST_PATH := ../.. +LIB_PATH = ../../lib/$(CONFIG_NAME) +INC_PATH = ../../include/$(CONFIG_NAME) +SHARE_PATH := ../.. +endif + + + +# +# --- Include common makefile definitions -------------------------------------- +# + +# Include the common makefile fragment. +-include $(SHARE_PATH)/common.mk + + + +# +# --- BLAS implementations ----------------------------------------------------- +# + +# BLAS library path(s). This is where the BLAS libraries reside. +HOME_LIB_PATH := $(HOME)/flame/lib +#VENDOR_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64 +#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +#VENDOR_LIB_PATH := ${MKLROOT}/lib/intel64 +#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 + +# OpenBLAS +OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a +# OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a + +# ATLAS +#ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \ +# $(HOME_LIB_PATH)/libatlas.a + +# Eigen +EIGEN_INC := $(HOME)/flame/eigen/include/eigen3 +EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a +EIGENP_LIB := $(EIGEN_LIB) + +# MKL + MKL_LIB := -L$(MKL_LIB_PATH) \ + -lmkl_intel_lp64 \ + -lmkl_core \ + -lmkl_sequential \ + -lpthread -lm -ldl +#MKLP_LIB := -L$(MKL_LIB_PATH) \ +# -lmkl_intel_thread \ +# -lmkl_core \ +# -lmkl_intel_ilp64 \ +# -L$(ICC_LIB_PATH) \ +# -liomp5 +# MKLP_LIB := -L$(MKL_LIB_PATH) \ +# -lmkl_intel_lp64 \ +# -lmkl_core \ +# -lmkl_gnu_thread \ +# -lpthread -lm -ldl -fopenmp +# #-L$(ICC_LIB_PATH) \ +# #-lgomp + +VENDOR_LIB := $(MKL_LIB) +VENDORP_LIB := $(MKLP_LIB) + + +# +# --- Problem size definitions ------------------------------------------------- +# + +# Single core (single-threaded) +PS_BEGIN := 100 +PS_MAX := 1000 +PS_INC := 100 + +# Single-socket (multithreaded) +P1_BEGIN := 120 +P1_MAX := 6000 +P1_INC := 120 + +# Dual-socket (multithreaded) +P2_BEGIN := 160 +P2_MAX := 8000 +P2_INC := 160 + + +# +# --- General build definitions ------------------------------------------------ +# + +TEST_SRC_PATH := . +TEST_OBJ_PATH := . + +# Gather all local object files. +TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \ + $(TEST_OBJ_PATH)/%.o, \ + $(wildcard $(TEST_SRC_PATH)/*.c))) + +# Override the value of CINCFLAGS so that the value of CFLAGS returned by +# get-user-cflags-for() is not cluttered up with include paths needed only +# while building BLIS. +CINCFLAGS := -I$(INC_PATH) + +# Use the "framework" CFLAGS for the configuration family. +CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME)) + +# Add local header paths to CFLAGS. +CFLAGS += -I$(TEST_SRC_PATH) + +# Locate the libblis library to which we will link. +#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L) + +# Define a set of CFLAGS for use with C++ and Eigen. +CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS)) +CXXFLAGS += -I$(EIGEN_INC) + +# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading. +CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS)) +CXXFLAGS_MT := -march=native $(CXXFLAGS) + + +# Which library? +BLI_DEF := -DBLIS +BLA_DEF := -DBLAS +EIG_DEF := -DEIGEN + +# Complex implementation type +D3MHW := -DIND=BLIS_3MH +D3M1 := -DIND=BLIS_3M1 +D4MHW := -DIND=BLIS_4MH +D4M1B := -DIND=BLIS_4M1B +D4M1A := -DIND=BLIS_4M1A +D1M := -DIND=BLIS_1M +DNAT := -DIND=BLIS_NAT + +# Implementation string +#STR_3MHW := -DSTR=\"3mhw\" +#STR_3M1 := -DSTR=\"3m1\" +#STR_4MHW := -DSTR=\"4mhw\" +#STR_4M1B := -DSTR=\"4m1b\" +#STR_4M1A := -DSTR=\"4m1a\" +#STR_1M := -DSTR=\"1m\" +STR_NAT := -DSTR=\"asm_blis\" +STR_OBL := -DSTR=\"openblas\" +STR_EIG := -DSTR=\"eigen\" +STR_VEN := -DSTR=\"vendor\" + +# Single or multithreaded string +STR_ST := -DTHR_STR=\"st\" +STR_1S := -DTHR_STR=\"1s\" +STR_2S := -DTHR_STR=\"2s\" + +# Problem size specification +PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX) +PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX) +PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX) + + + +# +# --- Targets/rules ------------------------------------------------------------ +# + +all: all-st all-1s all-2s +blis: blis-st blis-1s blis-2s +openblas: openblas-st openblas-1s openblas-2s +eigen: eigen-st eigen-1s eigen-2s +vendor: vendor-st vendor-1s vendor-2s +mkl: vendor +armpl: vendor + +all-st: blis-st openblas-st mkl-st +all-1s: blis-1s openblas-1s mkl-1s +all-2s: blis-2s openblas-2s mkl-2s + +blis-st: blis-nat-st +blis-1s: blis-nat-1s +blis-2s: blis-nat-2s + +#blis-ind: blis-ind-st blis-ind-mt +blis-nat: blis-nat-st blis-nat-1s blis-nat-2s + +# Define the datatypes, operations, and implementations. +DTS := d #s d c z +OPS := gemm #hemm herk trmm trsm +IMPLS := asm_blis openblas vendor + +# Define functions to construct object filenames from the datatypes and +# operations given an implementation. We define one function for single- +# threaded, single-socket, and dual-socket filenames. +get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o)) +get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o)) +get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o)) + +# Construct object and binary names for single-threaded, single-socket, and +# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL). +BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis) +BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS)) +BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis) +BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS)) +BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis) +BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS)) + +OPENBLAS_ST_OBJS := $(call get-st-objs,openblas) +OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS)) +OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas) +OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS)) +OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas) +OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS)) + +EIGEN_ST_OBJS := $(call get-st-objs,eigen) +EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS)) +EIGEN_1S_OBJS := $(call get-1s-objs,eigen) +EIGEN_1S_BINS := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS)) +EIGEN_2S_OBJS := $(call get-2s-objs,eigen) +EIGEN_2S_BINS := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS)) + +VENDOR_ST_OBJS := $(call get-st-objs,vendor) +VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS)) +VENDOR_1S_OBJS := $(call get-1s-objs,vendor) +VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS)) +VENDOR_2S_OBJS := $(call get-2s-objs,vendor) +VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS)) + +# Define some targets associated with the above object/binary files. +blis-nat-st: $(BLIS_NAT_ST_BINS) +blis-nat-1s: $(BLIS_NAT_1S_BINS) +blis-nat-2s: $(BLIS_NAT_2S_BINS) + +openblas-st: $(OPENBLAS_ST_BINS) +openblas-1s: $(OPENBLAS_1S_BINS) +openblas-2s: $(OPENBLAS_2S_BINS) + +eigen-st: $(EIGEN_ST_BINS) +eigen-1s: $(EIGEN_1S_BINS) +eigen-2s: $(EIGEN_2S_BINS) + +vendor-st: $(VENDOR_ST_BINS) +vendor-1s: $(VENDOR_1S_BINS) +vendor-2s: $(VENDOR_2S_BINS) + +mkl-st: vendor-st +mkl-1s: vendor-1s +mkl-2s: vendor-2s + +armpl-st: vendor-st +armpl-1s: vendor-1s +armpl-2s: vendor-2s + +# Mark the object files as intermediate so that make will remove them +# automatically after building the binaries on which they depend. +.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS) +.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS) +.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_1S_OBJS) $(EIGEN_2S_OBJS) +.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_1S_OBJS) $(VENDOR_2S_OBJS) + + +# --Object file rules -- + +#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c +# $(CC) $(CFLAGS) -c $< -o $@ + +# A function to return the datatype cpp macro def from the datatype +# character. +get-dt-cpp = $(strip \ + $(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\ + $(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\ + $(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\ + -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX)))) + +# A function to return other cpp macros that help the test driver +# identify the implementation. +#get-bl-cpp = $(strip \ +# $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\ +# $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\ +# $(if $(findstring eigen,$(1)),$(STR_EIG) $(EIG_DEF),\ +# $(STR_VEN) $(BLA_DEF))))) + +get-bl-cpp = $(strip \ + $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\ + $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\ + $(if $(and $(findstring eigen,$(1)),\ + $(findstring gemm,$(2))),\ + $(STR_EIG) $(EIG_DEF),\ + $(if $(findstring eigen,$(1)),\ + $(STR_EIG) $(BLA_DEF),\ + $(STR_VEN) $(BLA_DEF)))))) + + +# Rules for BLIS and BLAS libraries. +define make-st-rule +test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile + $(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@ +endef + +define make-1s-rule +test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile + $(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@ +endef + +define make-2s-rule +test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile + $(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@ +endef + +$(foreach dt,$(DTS), \ +$(foreach op,$(OPS), \ +$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im)))))) + +$(foreach dt,$(DTS), \ +$(foreach op,$(OPS), \ +$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im)))))) + +$(foreach dt,$(DTS), \ +$(foreach op,$(OPS), \ +$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im)))))) + +# Rules for Eigen. +define make-eigst-rule +test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile + $(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@ +endef + +define make-eig1s-rule +test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile + $(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@ +endef + +define make-eig2s-rule +test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile + $(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@ +endef + +$(foreach dt,$(DTS), \ +$(foreach op,$(OPS), \ +$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im)))))) + +$(foreach dt,$(DTS), \ +$(foreach op,$(OPS), \ +$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im)))))) + +$(foreach dt,$(DTS), \ +$(foreach op,$(OPS), \ +$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im)))))) + + +# -- Executable file rules -- + +# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS +# on the link command line in case BLIS was configured with the BLAS +# compatibility layer. This prevents BLIS from inadvertently getting called +# for the BLAS routines we are trying to test with. + +test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_eigen_st.x: test_%_$(PS_MAX)_eigen_st.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(EIGEN_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_eigen_1s.x: test_%_$(P1_MAX)_eigen_1s.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_eigen_2s.x: test_%_$(P2_MAX)_eigen_2s.o $(LIBBLIS_LINK) + $(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + +test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK) + $(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@) + + +# -- Clean rules -- + +clean: cleanx + +cleanx: + - $(RM_F) *.o *.x + diff --git a/test/3/runme.sh b/test/3/runme.sh index 9933dd1e5..ed917ffc1 100755 --- a/test/3/runme.sh +++ b/test/3/runme.sh @@ -65,16 +65,15 @@ elif [ ${sys} = "ul264" ]; then fi # Datatypes to test. -test_dts="d s z c" +test_dts="d " #s z c" # Operations to test. -test_ops="gemm hemm herk trmm trsm" +test_ops="gemm "#hemm herk trmm trsm" # Implementations to test. -#impls="blis" -#impls="other" -impls="eigen" #impls="all" +#impls="other" +impls="blis" if [ "${impls}" = "blis" ]; then diff --git a/test/3/test_gemm.c b/test/3/test_gemm.c index 5ff4c0c0f..18505065a 100644 --- a/test/3/test_gemm.c +++ b/test/3/test_gemm.c @@ -1,418 +1,418 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include -#ifdef EIGEN - #define BLIS_DISABLE_BLAS_DEFS - #include "blis.h" - #include - #include - using namespace Eigen; -#else - #include "blis.h" -#endif - -#define COL_STORAGE -//#define ROW_STORAGE - -//#define PRINT - -int main( int argc, char** argv ) -{ - obj_t a, b, c; - obj_t c_save; - obj_t alpha, beta; - dim_t m, n, k; - dim_t p; - dim_t p_begin, p_max, p_inc; - int m_input, n_input, k_input; - ind_t ind; - num_t dt; - char dt_ch; - int r, n_repeats; - trans_t transa; - trans_t transb; - f77_char f77_transa; - f77_char f77_transb; - - double dtime; - double dtime_save; - double gflops; - - //bli_init(); - - //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); - - n_repeats = 3; - - dt = DT; - - ind = IND; - -#if 1 - p_begin = P_BEGIN; - p_max = P_MAX; - p_inc = P_INC; - - m_input = -1; - n_input = -1; - k_input = -1; -#else - p_begin = 40; - p_max = 1000; - p_inc = 40; - - m_input = -1; - n_input = -1; - k_input = -1; -#endif - - - // Supress compiler warnings about unused variable 'ind'. - ( void )ind; - -#if 0 - - cntx_t* cntx; - - ind_t ind_mod = ind; - - // A hack to use 3m1 as 1mpb (with 1m as 1mbp). - if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; - - // Initialize a context for the current induced method and datatype. - cntx = bli_gks_query_ind_cntx( ind_mod, dt ); - - // Set k to the kc blocksize for the current datatype. - k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); - -#elif 1 - - //k_input = 256; - -#endif - - // Choose the char corresponding to the requested datatype. - if ( bli_is_float( dt ) ) dt_ch = 's'; - else if ( bli_is_double( dt ) ) dt_ch = 'd'; - else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; - else dt_ch = 'z'; - - transa = BLIS_NO_TRANSPOSE; - transb = BLIS_NO_TRANSPOSE; - - bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); - bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); - - // Begin with initializing the last entry to zero so that - // matlab allocates space for the entire array once up-front. - for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; - - printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin)/p_inc + 1, - ( unsigned long )0, - ( unsigned long )0, - ( unsigned long )0, 0.0 ); - - - //for ( p = p_begin; p <= p_max; p += p_inc ) - for ( p = p_max; p_begin <= p; p -= p_inc ) - { - - if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); - else m = ( dim_t ) m_input; - if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); - else n = ( dim_t ) n_input; - if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); - else k = ( dim_t ) k_input; - - bli_obj_create( dt, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt, 1, 1, 0, 0, &beta ); - - #ifdef COL_STORAGE - bli_obj_create( dt, m, k, 0, 0, &a ); - bli_obj_create( dt, k, n, 0, 0, &b ); - bli_obj_create( dt, m, n, 0, 0, &c ); - bli_obj_create( dt, m, n, 0, 0, &c_save ); - #else - bli_obj_create( dt, m, k, k, 1, &a ); - bli_obj_create( dt, k, n, n, 1, &b ); - bli_obj_create( dt, m, n, n, 1, &c ); - bli_obj_create( dt, m, n, n, 1, &c_save ); - #endif - - bli_randm( &a ); - bli_randm( &b ); - bli_randm( &c ); - - bli_obj_set_conjtrans( transa, &a ); - bli_obj_set_conjtrans( transb, &b ); - - bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_setsc( (1.0/1.0), 0.0, &beta ); - - bli_copym( &c, &c_save ); - -#if 0 //def BLIS - bli_ind_disable_all_dt( dt ); - bli_ind_enable_dt( ind, dt ); -#endif - -#ifdef EIGEN - double alpha_r, alpha_i; - - bli_getsc( &alpha, &alpha_r, &alpha_i ); - - void* ap = bli_obj_buffer_at_off( &a ); - void* bp = bli_obj_buffer_at_off( &b ); - void* cp = bli_obj_buffer_at_off( &c ); - - #ifdef COL_STORAGE - const int os_a = bli_obj_col_stride( &a ); - const int os_b = bli_obj_col_stride( &b ); - const int os_c = bli_obj_col_stride( &c ); - #else - const int os_a = bli_obj_row_stride( &a ); - const int os_b = bli_obj_row_stride( &b ); - const int os_c = bli_obj_row_stride( &c ); - #endif - - Stride stride_a( os_a, 1 ); - Stride stride_b( os_b, 1 ); - Stride stride_c( os_c, 1 ); - - #ifdef COL_STORAGE - #if defined(IS_FLOAT) - typedef Matrix MatrixXf_; - #elif defined (IS_DOUBLE) - typedef Matrix MatrixXd_; - #elif defined (IS_SCOMPLEX) - typedef Matrix, Dynamic, Dynamic, ColMajor> MatrixXcf_; - #elif defined (IS_DCOMPLEX) - typedef Matrix, Dynamic, Dynamic, ColMajor> MatrixXcd_; - #endif - #else - #if defined(IS_FLOAT) - typedef Matrix MatrixXf_; - #elif defined (IS_DOUBLE) - typedef Matrix MatrixXd_; - #elif defined (IS_SCOMPLEX) - typedef Matrix, Dynamic, Dynamic, RowMajor> MatrixXcf_; - #elif defined (IS_DCOMPLEX) - typedef Matrix, Dynamic, Dynamic, RowMajor> MatrixXcd_; - #endif - #endif - #if defined(IS_FLOAT) - Map > A( ( float* )ap, m, k, stride_a ); - Map > B( ( float* )bp, k, n, stride_b ); - Map > C( ( float* )cp, m, n, stride_c ); - #elif defined (IS_DOUBLE) - Map > A( ( double* )ap, m, k, stride_a ); - Map > B( ( double* )bp, k, n, stride_b ); - Map > C( ( double* )cp, m, n, stride_c ); - #elif defined (IS_SCOMPLEX) - Map > A( ( std::complex* )ap, m, k, stride_a ); - Map > B( ( std::complex* )bp, k, n, stride_b ); - Map > C( ( std::complex* )cp, m, n, stride_c ); - #elif defined (IS_DCOMPLEX) - Map > A( ( std::complex* )ap, m, k, stride_a ); - Map > B( ( std::complex* )bp, k, n, stride_b ); - Map > C( ( std::complex* )cp, m, n, stride_c ); - #endif -#endif - - dtime_save = DBL_MAX; - - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &c_save, &c ); - - dtime = bli_clock(); - -#ifdef PRINT - bli_printm( "a", &a, "%4.1f", "" ); - bli_printm( "b", &b, "%4.1f", "" ); - bli_printm( "c", &c, "%4.1f", "" ); -#endif - -#if defined(BLIS) - - bli_gemm( &alpha, - &a, - &b, - &beta, - &c ); - -#elif defined(EIGEN) - - C.noalias() += alpha_r * A * B; - -#else // if defined(BLAS) - - if ( bli_is_float( dt ) ) - { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - float* alphap = ( float* )bli_obj_buffer( &alpha ); - float* ap = ( float* )bli_obj_buffer( &a ); - float* bp = ( float* )bli_obj_buffer( &b ); - float* betap = ( float* )bli_obj_buffer( &beta ); - float* cp = ( float* )bli_obj_buffer( &c ); - - sgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); - } - else if ( bli_is_double( dt ) ) - { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - double* alphap = ( double* )bli_obj_buffer( &alpha ); - double* ap = ( double* )bli_obj_buffer( &a ); - double* bp = ( double* )bli_obj_buffer( &b ); - double* betap = ( double* )bli_obj_buffer( &beta ); - double* cp = ( double* )bli_obj_buffer( &c ); - - dgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); - } - else if ( bli_is_scomplex( dt ) ) - { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha ); - scomplex* ap = ( scomplex* )bli_obj_buffer( &a ); - scomplex* bp = ( scomplex* )bli_obj_buffer( &b ); - scomplex* betap = ( scomplex* )bli_obj_buffer( &beta ); - scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); - - cgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); - } - else if ( bli_is_dcomplex( dt ) ) - { - f77_int mm = bli_obj_length( &c ); - f77_int kk = bli_obj_width_after_trans( &a ); - f77_int nn = bli_obj_width( &c ); - f77_int lda = bli_obj_col_stride( &a ); - f77_int ldb = bli_obj_col_stride( &b ); - f77_int ldc = bli_obj_col_stride( &c ); - dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha ); - dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a ); - dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b ); - dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta ); - dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); - - zgemm_( &f77_transa, - &f77_transb, - &mm, - &nn, - &kk, - alphap, - ap, &lda, - bp, &ldb, - betap, - cp, &ldc ); - } -#endif - -#ifdef PRINT - bli_printm( "c after", &c, "%4.1f", "" ); - exit(1); -#endif - - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); - - if ( bli_is_complex( dt ) ) gflops *= 4.0; - - printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); - printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin)/p_inc + 1, - ( unsigned long )m, - ( unsigned long )k, - ( unsigned long )n, gflops ); - - bli_obj_free( &alpha ); - bli_obj_free( &beta ); - - bli_obj_free( &a ); - bli_obj_free( &b ); - bli_obj_free( &c ); - bli_obj_free( &c_save ); - } - - //bli_finalize(); - - return 0; -} - +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#ifdef EIGEN + #define BLIS_DISABLE_BLAS_DEFS + #include "blis.h" + #include + #include + using namespace Eigen; +#else + #include "blis.h" +#endif + +#define COL_STORAGE +//#define ROW_STORAGE + +//#define PRINT + +int main( int argc, char** argv ) +{ + obj_t a, b, c; + obj_t c_save; + obj_t alpha, beta; + dim_t m, n, k; + dim_t p; + dim_t p_begin, p_max, p_inc; + int m_input, n_input, k_input; + ind_t ind; + num_t dt; + char dt_ch; + int r, n_repeats; + trans_t transa; + trans_t transb; + f77_char f77_transa; + f77_char f77_transb; + + double dtime; + double dtime_save; + double gflops; + + //bli_init(); + + //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING ); + + n_repeats = 3; + + dt = DT; + + ind = IND; + +#if 1 + p_begin = P_BEGIN; + p_max = P_MAX; + p_inc = P_INC; + + m_input = -1; + n_input = -1; + k_input = -1; +#else + p_begin = 40; + p_max = 1000; + p_inc = 40; + + m_input = -1; + n_input = -1; + k_input = -1; +#endif + + + // Supress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 0 + + cntx_t* cntx; + + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + + // Initialize a context for the current induced method and datatype. + cntx = bli_gks_query_ind_cntx( ind_mod, dt ); + + // Set k to the kc blocksize for the current datatype. + k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + +#elif 1 + + //k_input = 256; + +#endif + + // Choose the char corresponding to the requested datatype. + if ( bli_is_float( dt ) ) dt_ch = 's'; + else if ( bli_is_double( dt ) ) dt_ch = 'd'; + else if ( bli_is_scomplex( dt ) ) dt_ch = 'c'; + else dt_ch = 'z'; + + transa = BLIS_NO_TRANSPOSE; + transb = BLIS_NO_TRANSPOSE; + + bli_param_map_blis_to_netlib_trans( transa, &f77_transa ); + bli_param_map_blis_to_netlib_trans( transb, &f77_transb ); + + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ; + + printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, + ( unsigned long )0, + ( unsigned long )0, 0.0 ); + + + //for ( p = p_begin; p <= p_max; p += p_inc ) + for ( p = p_max; p_begin <= p; p -= p_inc ) + { + + if ( m_input < 0 ) m = p / ( dim_t )abs(m_input); + else m = ( dim_t ) m_input; + if ( n_input < 0 ) n = p / ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; + if ( k_input < 0 ) k = p / ( dim_t )abs(k_input); + else k = ( dim_t ) k_input; + + bli_obj_create( dt, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt, 1, 1, 0, 0, &beta ); + + #ifdef COL_STORAGE + bli_obj_create( dt, m, k, 0, 0, &a ); + bli_obj_create( dt, k, n, 0, 0, &b ); + bli_obj_create( dt, m, n, 0, 0, &c ); + bli_obj_create( dt, m, n, 0, 0, &c_save ); + #else + bli_obj_create( dt, m, k, k, 1, &a ); + bli_obj_create( dt, k, n, n, 1, &b ); + bli_obj_create( dt, m, n, n, 1, &c ); + bli_obj_create( dt, m, n, n, 1, &c_save ); + #endif + + bli_randm( &a ); + bli_randm( &b ); + bli_randm( &c ); + + bli_obj_set_conjtrans( transa, &a ); + bli_obj_set_conjtrans( transb, &b ); + + bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( (1.0/1.0), 0.0, &beta ); + + bli_copym( &c, &c_save ); + +#if 0 //def BLIS + bli_ind_disable_all_dt( dt ); + bli_ind_enable_dt( ind, dt ); +#endif + +#ifdef EIGEN + double alpha_r, alpha_i; + + bli_getsc( &alpha, &alpha_r, &alpha_i ); + + void* ap = bli_obj_buffer_at_off( &a ); + void* bp = bli_obj_buffer_at_off( &b ); + void* cp = bli_obj_buffer_at_off( &c ); + + #ifdef COL_STORAGE + const int os_a = bli_obj_col_stride( &a ); + const int os_b = bli_obj_col_stride( &b ); + const int os_c = bli_obj_col_stride( &c ); + #else + const int os_a = bli_obj_row_stride( &a ); + const int os_b = bli_obj_row_stride( &b ); + const int os_c = bli_obj_row_stride( &c ); + #endif + + Stride stride_a( os_a, 1 ); + Stride stride_b( os_b, 1 ); + Stride stride_c( os_c, 1 ); + + #ifdef COL_STORAGE + #if defined(IS_FLOAT) + typedef Matrix MatrixXf_; + #elif defined (IS_DOUBLE) + typedef Matrix MatrixXd_; + #elif defined (IS_SCOMPLEX) + typedef Matrix, Dynamic, Dynamic, ColMajor> MatrixXcf_; + #elif defined (IS_DCOMPLEX) + typedef Matrix, Dynamic, Dynamic, ColMajor> MatrixXcd_; + #endif + #else + #if defined(IS_FLOAT) + typedef Matrix MatrixXf_; + #elif defined (IS_DOUBLE) + typedef Matrix MatrixXd_; + #elif defined (IS_SCOMPLEX) + typedef Matrix, Dynamic, Dynamic, RowMajor> MatrixXcf_; + #elif defined (IS_DCOMPLEX) + typedef Matrix, Dynamic, Dynamic, RowMajor> MatrixXcd_; + #endif + #endif + #if defined(IS_FLOAT) + Map > A( ( float* )ap, m, k, stride_a ); + Map > B( ( float* )bp, k, n, stride_b ); + Map > C( ( float* )cp, m, n, stride_c ); + #elif defined (IS_DOUBLE) + Map > A( ( double* )ap, m, k, stride_a ); + Map > B( ( double* )bp, k, n, stride_b ); + Map > C( ( double* )cp, m, n, stride_c ); + #elif defined (IS_SCOMPLEX) + Map > A( ( std::complex* )ap, m, k, stride_a ); + Map > B( ( std::complex* )bp, k, n, stride_b ); + Map > C( ( std::complex* )cp, m, n, stride_c ); + #elif defined (IS_DCOMPLEX) + Map > A( ( std::complex* )ap, m, k, stride_a ); + Map > B( ( std::complex* )bp, k, n, stride_b ); + Map > C( ( std::complex* )cp, m, n, stride_c ); + #endif +#endif + + dtime_save = DBL_MAX; + + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &c_save, &c ); + + dtime = bli_clock(); + +#ifdef PRINT + bli_printm( "a", &a, "%4.1f", "" ); + bli_printm( "b", &b, "%4.1f", "" ); + bli_printm( "c", &c, "%4.1f", "" ); +#endif + +#if defined(BLIS) + + bli_gemm( &alpha, + &a, + &b, + &beta, + &c ); + +#elif defined(EIGEN) + + C.noalias() += alpha_r * A * B; + +#else // if defined(BLAS) + + if ( bli_is_float( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + float* alphap = ( float* )bli_obj_buffer( &alpha ); + float* ap = ( float* )bli_obj_buffer( &a ); + float* bp = ( float* )bli_obj_buffer( &b ); + float* betap = ( float* )bli_obj_buffer( &beta ); + float* cp = ( float* )bli_obj_buffer( &c ); + + sgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_double( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + double* alphap = ( double* )bli_obj_buffer( &alpha ); + double* ap = ( double* )bli_obj_buffer( &a ); + double* bp = ( double* )bli_obj_buffer( &b ); + double* betap = ( double* )bli_obj_buffer( &beta ); + double* cp = ( double* )bli_obj_buffer( &c ); + + dgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha ); + scomplex* ap = ( scomplex* )bli_obj_buffer( &a ); + scomplex* bp = ( scomplex* )bli_obj_buffer( &b ); + scomplex* betap = ( scomplex* )bli_obj_buffer( &beta ); + scomplex* cp = ( scomplex* )bli_obj_buffer( &c ); + + cgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } + else if ( bli_is_dcomplex( dt ) ) + { + f77_int mm = bli_obj_length( &c ); + f77_int kk = bli_obj_width_after_trans( &a ); + f77_int nn = bli_obj_width( &c ); + f77_int lda = bli_obj_col_stride( &a ); + f77_int ldb = bli_obj_col_stride( &b ); + f77_int ldc = bli_obj_col_stride( &c ); + dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha ); + dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a ); + dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b ); + dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta ); + dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c ); + + zgemm_( &f77_transa, + &f77_transb, + &mm, + &nn, + &kk, + alphap, + ap, &lda, + bp, &ldb, + betap, + cp, &ldc ); + } +#endif + +#ifdef PRINT + bli_printm( "c after", &c, "%4.1f", "" ); + exit(1); +#endif + + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } + + gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 ); + + if ( bli_is_complex( dt ) ) gflops *= 4.0; + + printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR ); + printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )m, + ( unsigned long )k, + ( unsigned long )n, gflops ); + + bli_obj_free( &alpha ); + bli_obj_free( &beta ); + + bli_obj_free( &a ); + bli_obj_free( &b ); + bli_obj_free( &c ); + bli_obj_free( &c_save ); + } + + //bli_finalize(); + + return 0; +} + diff --git a/test/Makefile b/test/Makefile index 732ef0dd0..572c0dccb 100644 --- a/test/Makefile +++ b/test/Makefile @@ -96,7 +96,7 @@ endif BLAS_LIB_PATH := $(HOME)/flame/lib #MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64 #MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 -MKL_LIB_PATH := ${MKLROOT}/lib/intel64 +#MKL_LIB_PATH := ${MKLROOT}/lib/intel64 #ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme # OpenBLAS @@ -115,11 +115,11 @@ MKL_LIB := -L$(MKL_LIB_PATH) \ # ESSL # Note: ESSL is named differently for SMP and/or BG -ESSL_TYPE := # This is the 32b library on POWER +#ESSL_TYPE := # This is the 32b library on POWER #ESSL_TYPE := 6464 # This is the 64b library on POWER #ESSL_TYPE := bg # This is the 32b single-threaded library on Blue Gene #ESSL_TYPE := smpbg # This is the 32b multi-threaded library on Blue Gene -ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a +#ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a # Accelerate MAC_LIB := -framework Accelerate @@ -165,22 +165,21 @@ CFLAGS += -I$(TEST_SRC_PATH) #all: blis openblas atlas mkl all: blis openblas mkl -blis: test_dotv_blis.x \ - test_axpyv_blis.x \ - test_gemv_blis.x \ - test_ger_blis.x \ - test_hemv_blis.x \ - test_her_blis.x \ - test_her2_blis.x \ - test_trmv_blis.x \ - test_trsv_blis.x \ - \ - test_gemm_blis.x \ - test_hemm_blis.x \ - test_herk_blis.x \ - test_her2k_blis.x \ - test_trmm_blis.x \ - test_trsm_blis.x +blis: test_gemm_blis.x \ +# test_dotv_blis.x \ +# test_axpyv_blis.x \ +# test_gemv_blis.x \ +# test_ger_blis.x \ +# test_hemv_blis.x \ +# test_her_blis.x \ +# test_her2_blis.x \ +# test_trmv_blis.x \ +# test_trsv_blis.x \ + # test_hemm_blis.x \ + # test_herk_blis.x \ + # test_her2k_blis.x \ + # test_trmm_blis.x \ + # test_trsm_blis.x openblas: \ test_dotv_openblas.x \ diff --git a/test/output_gemm_blis.m b/test/output_gemm_blis.m new file mode 100644 index 000000000..e69de29bb diff --git a/test/runme.sh b/test/runme.sh index edef984cb..4e363ead5 100755 --- a/test/runme.sh +++ b/test/runme.sh @@ -5,12 +5,14 @@ out_root="output" #out_root="output_square" # Operations to test. -l2_ops="gemv ger hemv her her2 trmv trsv" -l3_ops="gemm hemm herk her2k trmm trsm" -test_ops="${l2_ops} ${l3_ops}" +# l2_ops="gemv ger hemv her her2 trmv trsv" +l3_ops="gemm" +# "hemm herk her2k trmm trsm" +test_ops=" ${l3_ops}" +# "${l2_ops}" -# Implementations to test -test_impls="openblas atlas mkl blis" +# Implementations to test | "openblas atlas mkl" +test_impls="blis" for im in ${test_impls}; do @@ -22,7 +24,7 @@ for im in ${test_impls}; do # Construct the name of the output file. out_file="${out_root}_${op}_${im}.m" - echo "Running ${exec_name} > ${out_file}" + echo " Running ${exec_name} > ${out_file} " # Run executable. ./${exec_name} > ${out_file} diff --git a/testsuite/input.general b/testsuite/input.general index 772840224..e64346f57 100644 --- a/testsuite/input.general +++ b/testsuite/input.general @@ -8,8 +8,8 @@ # accepted values. # -1 # Number of repeats per experiment (best result is reported) -rc # Matrix storage scheme(s) to test: +3 # Number of repeats per experiment (best result is reported) +c # Matrix storage scheme(s) to test: # 'c' = col-major storage; 'g' = general stride storage; # 'r' = row-major storage cj # Vector storage scheme(s) to test: @@ -22,14 +22,14 @@ cj # Vector storage scheme(s) to test: # '0' = real values on [-1,1]; # '1' = powers of 2 in narrow precision range 32 # General stride spacing (for cases when testing general stride) -sdcz # Datatype(s) to test: +d # Datatype(s) to test: # 's' = single real; 'c' = single complex; # 'd' = double real; 'z' = double complex 0 # Test gemm with mixed-domain operands? 0 # Test gemm with mixed-precision operands? -100 # Problem size: first to test -500 # Problem size: maximum to test -100 # Problem size: increment between experiments +2000 # Problem size: first to test +2000 # Problem size: maximum to test +200 # Problem size: increment between experiments # Complex level-3 implementations to test: 0 # 3mh ('1' = enable; '0' = disable) 0 # 3m1 ('1' = enable; '0' = disable) @@ -45,5 +45,5 @@ sdcz # Datatype(s) to test: # '0' = disable error checking; '1' = full error checking i # Reaction to test failure: # 'i' = ignore; 's' = sleep() and continue; 'a' = abort -0 # Output results in matlab/octave format? ('1' = yes; '0' = no) +1 # Output results in matlab/octave format? ('1' = yes; '0' = no) 0 # Output results to stdout AND files? ('1' = yes; '0' = no) diff --git a/testsuite/input.operations b/testsuite/input.operations index f35e2cd9b..3fe003155 100644 --- a/testsuite/input.operations +++ b/testsuite/input.operations @@ -276,9 +276,9 @@ # --- Level-3 -------------------------------------------------------------- -1 # gemm --1 -1 -1 # dimensions: m n k -?? # parameters: transa transb +2 # gemm +-1 -1 -1 # dimensions: m n k +nn # parameters: transa transb 1 # hemm -1 -1 # dimensions: m n diff --git a/testsuite/jobscripts/cfig.out b/testsuite/jobscripts/cfig.out new file mode 100644 index 000000000..f8d2707cb --- /dev/null +++ b/testsuite/jobscripts/cfig.out @@ -0,0 +1,106 @@ +configure: detected Linux kernel version 4.14.0-115.6.1.el7a.ppc64le. +configure: python interpeter search list is: python python3 python2. +configure: using 'python' python interpreter. +configure: found python version 2.7.5 (maj: 2, min: 7, rev: 5). +configure: python 2.7.5 appears to be supported. +configure: C compiler search list is: gcc clang cc. +configure: using 'gcc' C compiler. +configure: C++ compiler search list is: g++ clang++ c++. +configure: using 'g++' C++ compiler (for sandbox only). +configure: found gcc version 8.2.0 (maj: 8, min: 2, rev: 0). +configure: checking for blacklisted configurations due to gcc 8.2.0. +configure: found assembler ('as') version 2.27 (maj: 2, min: 27, rev: ). +configure: checking for blacklisted configurations due to as 2.27. +configure: warning: assembler ('as' 2.27) does not support 'bulldozer'; adding to blacklist. +configure: warning: assembler ('as' 2.27) does not support 'sandybridge'; adding to blacklist. +configure: warning: assembler ('as' 2.27) does not support 'haswell'; adding to blacklist. +configure: warning: assembler ('as' 2.27) does not support 'piledriver'; adding to blacklist. +configure: warning: assembler ('as' 2.27) does not support 'steamroller'; adding to blacklist. +configure: warning: assembler ('as' 2.27) does not support 'excavator'; adding to blacklist. +configure: warning: assembler ('as' 2.27) does not support 'skx'; adding to blacklist. +configure: warning: assembler ('as' 2.27) does not support 'knl'; adding to blacklist. +configure: configuration blacklist: +configure: bulldozer sandybridge haswell piledriver steamroller excavator skx knl +configure: reading configuration registry...done. +configure: determining default version string. +configure: found '.git' directory; assuming git clone. +configure: executing: git describe --tags. +configure: git returned an error: 'Unknown option: -C +usage: git [--version] [--help] [-c name=value] + [--exec-path[=]] [--html-path] [--man-path] [--info-path] + [-p|--paginate|--no-pager] [--no-replace-objects] [--bare] + [--git-dir=] [--work-tree=] [--namespace=] + []'. +configure: using string from unmodified version file. +configure: starting configuration of BLIS 0.6.0. +configure: configuring with official version string. +configure: found shared library .so version '2.0.0'. +configure: .so major version: 2 +configure: .so minor.build version: 0.0 +configure: manual configuration requested; configuring with 'power9'. +configure: checking configuration against contents of 'config_registry'. +configure: configuration 'power9' is registered. +configure: 'power9' is defined as having the following sub-configurations: +configure: power9 +configure: which collectively require the following kernels: +configure: power9 +configure: checking sub-configurations: +configure: 'power9' is registered...and exists. +configure: checking sub-configurations' requisite kernels: +configure: 'power9' kernels...exist. +configure: no install prefix option given; defaulting to '/usr/local'. +configure: no install exec_prefix option given; defaulting to PREFIX. +configure: no install libdir option given; defaulting to EXECPREFIX/lib. +configure: no install includedir option given; defaulting to PREFIX/include. +configure: no install sharedir option given; defaulting to PREFIX/share. +configure: final installation directories: +configure: prefix: /usr/local +configure: exec_prefix: ${prefix} +configure: libdir: ${exec_prefix}/lib +configure: includedir: ${prefix}/include +configure: sharedir: ${prefix}/share +configure: NOTE: the variables above can be overridden when running make. +configure: no preset CFLAGS detected. +configure: no preset LDFLAGS detected. +configure: debug symbols disabled. +configure: disabling verbose make output. (enable with 'make V=1'.) +configure: disabling ARG_MAX hack. +configure: building BLIS as both static and shared libraries. +configure: exporting only public symbols within shared library. +configure: threading is disabled. +configure: requesting slab threading in jr and ir loops. +configure: internal memory pools for packing blocks are enabled. +configure: internal memory pools for small blocks are enabled. +configure: memory tracing output is disabled. +configure: libmemkind not found; disabling. +configure: compiler appears to not support #pragma omp simd. +configure: the BLAS compatibility layer is enabled. +configure: the CBLAS compatibility layer is disabled. +configure: mixed datatype support is enabled. +configure: mixed datatype optimizations requiring extra memory are enabled. +configure: small matrix handling is enabled. +configure: the BLIS API integer size is automatically determined. +configure: the BLAS/CBLAS API integer size is 32-bit. +configure: configuring for conventional gemm implementation. +configure: creating ./config.mk from ./build/config.mk.in +configure: creating ./bli_config.h from ./build/bli_config.h.in +configure: creating ./obj/power9 +configure: creating ./obj/power9/config/power9 +configure: creating ./obj/power9/kernels/power9 +configure: creating ./obj/power9/ref_kernels/power9 +configure: creating ./obj/power9/frame +configure: creating ./obj/power9/blastest +configure: creating ./obj/power9/testsuite +configure: creating ./lib/power9 +configure: creating ./include/power9 +configure: mirroring ./config/power9 to ./obj/power9/config/power9 +configure: mirroring ./kernels/power9 to ./obj/power9/kernels/power9 +configure: mirroring ./ref_kernels to ./obj/power9/ref_kernels +configure: mirroring ./ref_kernels to ./obj/power9/ref_kernels/power9 +configure: mirroring ./frame to ./obj/power9/frame +configure: creating makefile fragments in ./obj/power9/config/power9 +configure: creating makefile fragments in ./obj/power9/kernels/power9 +configure: creating makefile fragments in ./obj/power9/ref_kernels +configure: creating makefile fragments in ./obj/power9/frame +configure: configured to build within top-level directory of source distribution. +CONFIGURE DONE diff --git a/testsuite/jobscripts/cfig.sh b/testsuite/jobscripts/cfig.sh new file mode 100755 index 000000000..b8927d7a6 --- /dev/null +++ b/testsuite/jobscripts/cfig.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd ~/blis +./configure power9 +echo "CONFIGURE DONE" diff --git a/testsuite/jobscripts/jb-cfig.sh b/testsuite/jobscripts/jb-cfig.sh new file mode 100644 index 000000000..493fb7e70 --- /dev/null +++ b/testsuite/jobscripts/jb-cfig.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# execute in the general partition +#SBATCH --partition=general + +# execute with 40 processes/tasks +#SBATCH --ntasks=1 + +# maximum time is 30 minutes +#SBATCH --time=00:30:00 + +# job name is my_job +#SBATCH --job-name=blis + +# send email for status updates +#SBATCH --mail-type=ALL,TIME_LIMIT +#SBATCH --mail-user=ntukanov + +# change default output file name +#SBATCH --output=cfig.out + +# load environment +module load gcc/8.2 + +# application execution +srun cfig.sh diff --git a/testsuite/jobscripts/jb-mk.sh b/testsuite/jobscripts/jb-mk.sh new file mode 100644 index 000000000..b2b56d665 --- /dev/null +++ b/testsuite/jobscripts/jb-mk.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# execute in the general partition +#SBATCH --partition=general + +# execute with 40 processes/tasks +#SBATCH --ntasks=1 + +# maximum time is 30 minutes +#SBATCH --time=00:30:00 + +# job name is my_job +#SBATCH --job-name=blis + +# send email for status updates +#SBATCH --mail-type=ALL,TIME_LIMIT +#SBATCH --mail-user=ntukanov + +# change default output file name +#SBATCH --output=mk.out + +# load environment +module load gcc/8.2 + +# application execution +srun mk.sh diff --git a/testsuite/jobscripts/jb-runtest.sh b/testsuite/jobscripts/jb-runtest.sh new file mode 100644 index 000000000..502b35fb6 --- /dev/null +++ b/testsuite/jobscripts/jb-runtest.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# execute in the general partition +#SBATCH --partition=general + +# execute with 40 processes/tasks +#SBATCH --ntasks=1 + +# maximum time is 30 minutes +#SBATCH --time=00:30:00 + +# job name is my_job +#SBATCH --job-name=blis + +# send email for status updates +#SBATCH --mail-type=ALL,TIME_LIMIT +#SBATCH --mail-user=ntukanov + +# change default output file name +#SBATCH --output=runtest.out + +# load environment +module load gcc/8.2 + +# application execution +srun runtest.sh diff --git a/testsuite/jobscripts/mk.out b/testsuite/jobscripts/mk.out new file mode 100644 index 000000000..df324b23e --- /dev/null +++ b/testsuite/jobscripts/mk.out @@ -0,0 +1,9 @@ +Removing flattened header files from include/power9 +Removing object files from ./obj/power9 +srun: Job step aborted: Waiting up to 32 seconds for job step to finish. +srun: got SIGCONT +slurmstepd: error: *** JOB 1155 ON lookout00 CANCELLED AT 2019-06-10T17:29:07 *** +srun: forcing job termination +slurmstepd: error: *** STEP 1155.0 ON lookout00 CANCELLED AT 2019-06-10T17:29:07 *** +make: *** [cleanlib] Terminated +srun: error: lookout00: task 0: Terminated diff --git a/testsuite/jobscripts/mk.sh b/testsuite/jobscripts/mk.sh new file mode 100755 index 000000000..186ed9f25 --- /dev/null +++ b/testsuite/jobscripts/mk.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +cd ~/blis +make clean +make +echo "MAKE DONE" diff --git a/testsuite/jobscripts/runtest.sh b/testsuite/jobscripts/runtest.sh new file mode 100755 index 000000000..1650d2b69 --- /dev/null +++ b/testsuite/jobscripts/runtest.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cd ~/blis/testsuite +rm -rf test_libblis.out +make clean +make -j +./test_libblis.x > test_libblis.out +echo "TEST DONE" diff --git a/testsuite/src/test_gemm.c b/testsuite/src/test_gemm.c index c214b392f..711caa0e1 100644 --- a/testsuite/src/test_gemm.c +++ b/testsuite/src/test_gemm.c @@ -272,11 +272,23 @@ void libblis_test_gemm_experiment { bli_copym( &c_save, &c ); +#if 0 +bli_printm( "alpha", &alpha, "%5.2f", "" ); +bli_printm( "beta", &beta, "%5.2f", "" ); +bli_printm( "a = [", &a, "%7.6f", "];" ); +bli_printm( "b = [", &b, "%7.6f", "];" ); +bli_printm( "c = [", &c, "%7.6f", "];" ); +#endif + time = bli_clock(); libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c ); time_min = bli_clock_min_diff( time_min, time ); +#if 0 +bli_printm( "c_after = [", &c, "%7.6f", "];" ); +#endif + } // Estimate the performance of the best experiment repeat. @@ -405,7 +417,6 @@ void libblis_test_gemm_md libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c ); - time_min = bli_clock_min_diff( time_min, time ); } // Estimate the performance of the best experiment repeat. @@ -442,20 +453,18 @@ void libblis_test_gemm_impl { case BLIS_TEST_SEQ_FRONT_END: #if 0 -//bli_printm( "alpha", alpha, "%5.2f", "" ); -//bli_printm( "beta", beta, "%5.2f", "" ); -bli_printm( "a", a, "%5.2f", "" ); -bli_printm( "b", b, "%5.2f", "" ); -bli_printm( "c", c, "%5.2f", "" ); +bli_printm( "alpha", alpha, "%5.2f", "" ); +bli_printm( "beta", beta, "%5.2f", "" ); +bli_printm( "a", a, "%6.3f", "" ); +bli_printm( "b", b, "%6.3f", "" ); +bli_printm( "c", c, "%6.3f", "" ); #endif //if ( bli_obj_length( b ) == 16 && // bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR ) //bli_printm( "c before", c, "%6.3f", "" ); bli_gemm( alpha, a, b, beta, c ); #if 0 -if ( bli_obj_length( c ) == 12 && - bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR ) -bli_printm( "c after", c, "%6.3f", "" ); +bli_printm( "c after", c, "%6.3f", ""); #endif break;