mirror of
https://github.com/amd/blis.git
synced 2026-03-28 05:07:22 +00:00
POWER9 DGEMM (#355)
Implemented and registered power9 dgemm ukernel. Details: - Implemented 12x6 dgemm microkernel for power9. This microkernel assumes that elements of B have been duplicated/broadcast during the packing step. The microkernel uses a column orientation for its microtile vector registers and thus implements column storage and general stride IO cases. (A row storage IO case via in-register transposition may be added at a future date.) It should be noted that we recommend using this microkernel with gcc and *not* xlc, as issues with the latter cropped up during development, including but not limited to slightly incompatible vector register mnemonics in the GNU extended inline assembly clobber list.
This commit is contained in:
committed by
Field G. Van Zee
parent
58102aeaa2
commit
b426f9e04e
@@ -34,44 +34,55 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Instantiate prototypes for packm kernels.
|
||||
PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref )
|
||||
|
||||
// Instantiate prototypes for level-3 kernels.
|
||||
//GEMM_UKR_PROT( double, d, gemmbb_power9_ref )
|
||||
|
||||
|
||||
void bli_cntx_init_power9( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_power9_ref( cntx );
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
|
||||
// bli_cntx_set_l3_nat_ukrs
|
||||
// (
|
||||
// 1,
|
||||
// BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE,
|
||||
// cntx
|
||||
// );
|
||||
/*
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 4, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 64, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 256, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 4096, 0, 0 );
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
1,
|
||||
//BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized packm kernels.
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
1,
|
||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
|
||||
cntx
|
||||
);
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
|
||||
-1, 12, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
cntx
|
||||
);
|
||||
*/
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -32,34 +32,10 @@
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096
|
||||
#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096
|
||||
|
||||
//#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
//#define BLIS_SIMD_SIZE 64
|
||||
//
|
||||
//#ifdef BLIS_NO_HBWMALLOC
|
||||
// #include <stdlib.h>
|
||||
// #define BLIS_MALLOC_POOL malloc
|
||||
// #define BLIS_FREE_POOL free
|
||||
//#else
|
||||
// #include <hbwmalloc.h>
|
||||
// #define BLIS_MALLOC_POOL hbw_malloc
|
||||
// #define BLIS_FREE_POOL hbw_free
|
||||
//#endif
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 192
|
||||
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 152
|
||||
|
||||
|
||||
#if 0
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4
|
||||
#define BLIS_DEFAULT_MR_D 8
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
#define BLIS_DEFAULT_MC_D 64
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
#endif
|
||||
|
||||
|
||||
//#endif
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
@@ -45,8 +46,8 @@ THIS_CONFIG := power9
|
||||
# NOTE: The build system will append these variables with various
|
||||
# general-purpose/configuration-agnostic flags in common.mk. You
|
||||
# may specify additional flags here as needed.
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS := -mcpu=power9
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS :=
|
||||
CPICFLAGS :=
|
||||
CWARNFLAGS :=
|
||||
|
||||
@@ -57,28 +58,25 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -funroll-loops
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS :=
|
||||
CKVECFLAGS := -mcpu=power9 -mtune=power9 -DXLC=0
|
||||
else
|
||||
$(error gcc is required for this configuration.)
|
||||
ifeq ($(CC_VENDOR),IBM)
|
||||
CKVECFLAGS := -qarch=pwr9 -qtune=pwr9 -DXLC=1
|
||||
else
|
||||
$(info $(CC_VENDOR))
|
||||
$(error gcc/xlc is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
|
||||
else
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
endif
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -39,7 +39,7 @@ cortexa15: cortexa15/armv7a
|
||||
cortexa9: cortexa9/armv7a
|
||||
|
||||
# IBM architectures.
|
||||
power9: power9/generic
|
||||
power9: power9
|
||||
bgq: bgq
|
||||
|
||||
# Generic architectures.
|
||||
|
||||
@@ -66,6 +66,7 @@ void bli_gemm_front
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
@@ -82,6 +83,7 @@ void bli_gemm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
cntx_t cntx_local;
|
||||
|
||||
@@ -148,6 +150,7 @@ void bli_gemm_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
@@ -275,6 +278,7 @@ void bli_gemm_front
|
||||
cntl
|
||||
);
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
|
||||
// If we created a temporary matrix conformal to C for whatever reason,
|
||||
|
||||
@@ -167,7 +167,7 @@ void bli_gemm_ker_var2
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( schema_a,
|
||||
|
||||
@@ -141,15 +141,15 @@ void bli_arch_set_id( void )
|
||||
#endif
|
||||
|
||||
// IBM microarchitectures.
|
||||
#ifdef BLIS_FAMILY_POWER9
|
||||
id = BLIS_ARCH_POWER9;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_POWER7
|
||||
id = BLIS_ARCH_POWER7;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_BGQ
|
||||
id = BLIS_ARCH_BGQ;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_POWER9
|
||||
id = BLIS_ARCH_POWER9;
|
||||
#endif
|
||||
|
||||
// Generic microarchitecture.
|
||||
#ifdef BLIS_FAMILY_GENERIC
|
||||
@@ -188,9 +188,9 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
|
||||
"cortexa15",
|
||||
"cortexa9",
|
||||
|
||||
"power9",
|
||||
"power7",
|
||||
"bgq",
|
||||
"power9",
|
||||
|
||||
"generic"
|
||||
};
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
@@ -19,7 +16,6 @@
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
@@ -31,7 +27,6 @@
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CNTX_H
|
||||
@@ -45,28 +40,22 @@ typedef struct cntx_s
|
||||
{
|
||||
blksz_t* blkszs;
|
||||
bszid_t* bmults;
|
||||
|
||||
func_t* l3_vir_ukrs;
|
||||
func_t* l3_nat_ukrs;
|
||||
mbool_t* l3_nat_ukrs_prefs;
|
||||
|
||||
blksz_t* l3_sup_thresh;
|
||||
void** l3_sup_handlers;
|
||||
blksz_t* l3_sup_blkszs;
|
||||
func_t* l3_sup_kers;
|
||||
mbool_t* l3_sup_kers_prefs;
|
||||
|
||||
func_t* l1f_kers;
|
||||
func_t* l1v_kers;
|
||||
|
||||
func_t* packm_kers;
|
||||
func_t* unpackm_kers;
|
||||
|
||||
ind_t method;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
} cntx_t;
|
||||
*/
|
||||
|
||||
|
||||
@@ -268,6 +268,9 @@ CNTX_INIT_PROTS( generic )
|
||||
|
||||
// -- IBM BG/Q --
|
||||
|
||||
#ifdef BLIS_KERNELS_POWER9
|
||||
#include "bli_kernels_power9.h"
|
||||
#endif
|
||||
#ifdef BLIS_KERNELS_POWER7
|
||||
#include "bli_kernels_power7.h"
|
||||
#endif
|
||||
|
||||
@@ -56,7 +56,7 @@ GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 )
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 )
|
||||
|
||||
|
||||
// gemm (asm d8x6)
|
||||
// gemm (asm d8x6)
|
||||
//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 )
|
||||
//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 )
|
||||
//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 )
|
||||
|
||||
201
kernels/power9/3/bli_gemm_power9_asm_d12x6.c
Normal file
201
kernels/power9/3/bli_gemm_power9_asm_d12x6.c
Normal file
@@ -0,0 +1,201 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "bli_pwr9_asm_macros_12x6.h"
|
||||
|
||||
void bli_dgemm_power9_asm_12x6
|
||||
(
|
||||
dim_t k0,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c0, inc_t cs_c0,
|
||||
auxinfo_t* restrict data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
// Typecast local copies of integers in case dim_t and inc_t are a
|
||||
// different size than is expected by load instructions.
|
||||
|
||||
uint64_t k_iter = k0 / 16;
|
||||
uint64_t k_left = k0 % 16;
|
||||
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" \n\t"
|
||||
"ld %%r7, %2 \n\t" // load ptr of A
|
||||
"ld %%r8, %3 \n\t" // load ptr of B
|
||||
"ld %%r16, %6 \n\t" // load ptr of C
|
||||
" \n\t"
|
||||
"ld %%r28, %4 \n\t" // load ptr for alpha
|
||||
"ld %%r29, %5 \n\t" // load ptr for beta
|
||||
" \n\t"
|
||||
"ld %%r11, %0 \n\t" // load k_iter
|
||||
"ld %%r12, %1 \n\t" // load k_left
|
||||
" \n\t"
|
||||
"ld %%r10, %8 \n\t" // load cs_c
|
||||
"slwi %%r10, %%r10, 3 \n\t" // mul by size of elem
|
||||
" \n\t"
|
||||
"ld %%r9, %7 \n\t" // load rs_c
|
||||
"slwi %%r9, %%r9, 3 \n\t" // mul by size of elem
|
||||
" \n\t"
|
||||
"ld %%r26, 0(%%r29) \n\t" // load val of beta
|
||||
" \n\t"
|
||||
"lxvdsx %%vs62, 0, %%r28 \n\t" // splat alpha
|
||||
"lxvdsx %%vs63, 0, %%r29 \n\t" // splat beta
|
||||
" \n\t"
|
||||
"add %%r17, %%r16, %%r10 \n\t" // addr of col 1 of C
|
||||
"add %%r18, %%r17, %%r10 \n\t" // col 2 of C
|
||||
"add %%r19, %%r18, %%r10 \n\t" // col 3 of C
|
||||
"add %%r20, %%r19, %%r10 \n\t" // col 4 of C
|
||||
"add %%r21, %%r20, %%r10 \n\t" // col 5 of C
|
||||
" \n\t"
|
||||
DZERO_OUT_VREG
|
||||
" \n\t"
|
||||
DPRELOAD
|
||||
" \n\t"
|
||||
"addi %%r8, %%r8, 96 \n\t" // move to next col/row of A/B
|
||||
"addi %%r7, %%r7, 96 \n\t"
|
||||
" \n\t"
|
||||
DPREFETCH
|
||||
" \n\t"
|
||||
"cmpwi %%r0, %%r11, 0 \n\t" // if k_iter == 0,
|
||||
"beq %%r0, DCONSIDERKLEFT \n\t" // then jmp to k_left
|
||||
"mtctr %%r11 \n\t" // else, do k_iter loop
|
||||
" \n\t"
|
||||
"DLOOPKITER: \n\t" // k_iter loop
|
||||
" \n\t"
|
||||
A_B_PRODUCT_16 // compute A*B
|
||||
" \n\t"
|
||||
"bdnz DLOOPKITER \n\t"
|
||||
" \n\t"
|
||||
"DCONSIDERKLEFT: \n\t"
|
||||
" \n\t"
|
||||
"cmpwi %%r0, %%r12, 0 \n\t" // if k_left == 0,
|
||||
"beq %%r0, DPOSTACCUM \n\t" // then jmp to post accum
|
||||
"mtctr %%r12 \n\t" // else, do k_left loop
|
||||
" \n\t"
|
||||
"DLOOPKLEFT: \n\t" // k_left loop
|
||||
" \n\t"
|
||||
A_B_PRODUCT_1
|
||||
" \n\t"
|
||||
"bdnz DLOOPKLEFT \n\t"
|
||||
" \n\t"
|
||||
"DPOSTACCUM: \n\t"
|
||||
" \n\t"
|
||||
DSCALE_ALPHA
|
||||
" \n\t"
|
||||
"cmpdi %%r0, %%r26, 0 \n\t" // if beta == 0,
|
||||
"beq %%r0, DBETAZERO \n\t" // then jmp to BZ
|
||||
" \n\t"
|
||||
"cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8
|
||||
"beq DCOLSTOREDBNZ \n\t" // then jmp to col store
|
||||
" \n\t"
|
||||
"DGENSTOREDBNZ: \n\t" // BNZ gen stored case
|
||||
" \n\t"
|
||||
DGEN_LOAD_OFS_C
|
||||
" \n\t"
|
||||
DGEN_SCALE_BETA
|
||||
" \n\t"
|
||||
"b DGENSTORED \n\t"
|
||||
" \n\t"
|
||||
"DCOLSTOREDBNZ: \n\t" // BNZ col stored case
|
||||
" \n\t"
|
||||
DCOL_SCALE_BETA
|
||||
" \n\t"
|
||||
"b DCOLSTORED \n\t"
|
||||
" \n\t"
|
||||
"DBETAZERO: \n\t" // BZ case
|
||||
" \n\t"
|
||||
"cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8,
|
||||
"beq DCOLSTORED \n\t" // C is col stored
|
||||
" \n\t"
|
||||
"DGENSTORED: \n\t" // BZ gen stored case
|
||||
" \n\t"
|
||||
DGEN_LOAD_OFS_C
|
||||
" \n\t"
|
||||
DGEN_STORE
|
||||
" \n\t"
|
||||
"b DDONE \n\t"
|
||||
" \n\t"
|
||||
"DCOLSTORED: \n\t" // BZ col stored case
|
||||
" \n\t"
|
||||
DCOL_STORE
|
||||
" \n\t"
|
||||
"DDONE: \n\t"
|
||||
" \n\t"
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
/* unclobberable regs: r2, r3, r4, r5, r6, r13, r14, r15, r30, r31 */
|
||||
"r0", "r7", "r8", "r9",
|
||||
"r10", "r11", "r12", "r16", "r17", "r18", "r19",
|
||||
"r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29"
|
||||
|
||||
#if XLC
|
||||
,"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9"
|
||||
, "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19"
|
||||
, "f20" ,"f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29"
|
||||
, "f30" ,"f31"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"
|
||||
, "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19"
|
||||
, "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"
|
||||
, "v30", "v31"
|
||||
#else
|
||||
, "vs0", "vs1", "vs2", "vs3", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9"
|
||||
, "vs10", "vs11", "vs12", "vs13", "vs14", "vs15", "vs16", "vs17", "vs18", "vs19"
|
||||
, "vs20", "vs21", "vs22", "vs23", "vs24", "vs25", "vs26", "vs27", "vs28", "vs29"
|
||||
, "vs30", "vs31", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39"
|
||||
, "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49"
|
||||
, "vs50", "vs51", "vs52", "vs53"
|
||||
#endif
|
||||
|
||||
);
|
||||
}
|
||||
1574
kernels/power9/3/bli_pwr9_asm_macros_12x6.h
Normal file
1574
kernels/power9/3/bli_pwr9_asm_macros_12x6.h
Normal file
File diff suppressed because it is too large
Load Diff
47
kernels/power9/bli_kernels_power9.h
Normal file
47
kernels/power9/bli_kernels_power9.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// -- level-3 --
|
||||
|
||||
// gemm (asm d12x6)
|
||||
GEMM_UKR_PROT( double, d, gemm_power9_asm_12x6 )
|
||||
|
||||
// gemm (asm d18x4)
|
||||
GEMM_UKR_PROT( double, d, gemm_power9_asm_18x4 )
|
||||
|
||||
// gemm (asm d4x16)
|
||||
GEMM_UKR_PROT( double, d, gemm_power9_asm_16x4 )
|
||||
|
||||
// gemm (asm d4x16)
|
||||
GEMM_UKR_PROT( double, d, gemm_power9_asm_4x16 )
|
||||
@@ -130,9 +130,9 @@ VENDORP_LIB := $(MKLP_LIB)
|
||||
#
|
||||
|
||||
# Single core (single-threaded)
|
||||
PS_BEGIN := 48
|
||||
PS_MAX := 2400
|
||||
PS_INC := 48
|
||||
PS_BEGIN := 100
|
||||
PS_MAX := 1000
|
||||
PS_INC := 100
|
||||
|
||||
# Single-socket (multithreaded)
|
||||
P1_BEGIN := 96
|
||||
@@ -242,8 +242,8 @@ blis-2s: blis-nat-2s
|
||||
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
|
||||
|
||||
# Define the datatypes, operations, and implementations.
|
||||
DTS := s d c z
|
||||
OPS := gemm hemm herk trmm trsm
|
||||
DTS := d # s d c z
|
||||
OPS := gemm # hemm herk trmm trsm
|
||||
BIMPLS := asm_blis openblas vendor
|
||||
EIMPLS := eigen
|
||||
|
||||
|
||||
464
test/3/Makefile_cpy1
Normal file
464
test/3/Makefile_cpy1
Normal file
@@ -0,0 +1,464 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# Makefile
|
||||
#
|
||||
# Field G. Van Zee
|
||||
#
|
||||
# Makefile for standalone BLIS test drivers.
|
||||
#
|
||||
|
||||
#
|
||||
# --- Makefile PHONY target definitions ----------------------------------------
|
||||
#
|
||||
|
||||
.PHONY: all \
|
||||
clean cleanx
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Determine makefile fragment location -------------------------------------
|
||||
#
|
||||
|
||||
# Comments:
|
||||
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
|
||||
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
|
||||
# the second case because CONFIG_NAME is not yet set.
|
||||
ifneq ($(strip $(BLIS_INSTALL_PATH)),)
|
||||
LIB_PATH := $(BLIS_INSTALL_PATH)/lib
|
||||
INC_PATH := $(BLIS_INSTALL_PATH)/include/blis
|
||||
SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
|
||||
else
|
||||
DIST_PATH := ../..
|
||||
LIB_PATH = ../../lib/$(CONFIG_NAME)
|
||||
INC_PATH = ../../include/$(CONFIG_NAME)
|
||||
SHARE_PATH := ../..
|
||||
endif
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Include common makefile definitions --------------------------------------
|
||||
#
|
||||
|
||||
# Include the common makefile fragment.
|
||||
-include $(SHARE_PATH)/common.mk
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- BLAS implementations -----------------------------------------------------
|
||||
#
|
||||
|
||||
# BLAS library path(s). This is where the BLAS libraries reside.
|
||||
HOME_LIB_PATH := $(HOME)/flame/lib
|
||||
#VENDOR_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
|
||||
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
|
||||
#VENDOR_LIB_PATH := ${MKLROOT}/lib/intel64
|
||||
#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
|
||||
|
||||
# OpenBLAS
|
||||
OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a
|
||||
# OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a
|
||||
|
||||
# ATLAS
|
||||
#ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
|
||||
# $(HOME_LIB_PATH)/libatlas.a
|
||||
|
||||
# Eigen
|
||||
EIGEN_INC := $(HOME)/flame/eigen/include/eigen3
|
||||
EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a
|
||||
EIGENP_LIB := $(EIGEN_LIB)
|
||||
|
||||
# MKL
|
||||
MKL_LIB := -L$(MKL_LIB_PATH) \
|
||||
-lmkl_intel_lp64 \
|
||||
-lmkl_core \
|
||||
-lmkl_sequential \
|
||||
-lpthread -lm -ldl
|
||||
#MKLP_LIB := -L$(MKL_LIB_PATH) \
|
||||
# -lmkl_intel_thread \
|
||||
# -lmkl_core \
|
||||
# -lmkl_intel_ilp64 \
|
||||
# -L$(ICC_LIB_PATH) \
|
||||
# -liomp5
|
||||
# MKLP_LIB := -L$(MKL_LIB_PATH) \
|
||||
# -lmkl_intel_lp64 \
|
||||
# -lmkl_core \
|
||||
# -lmkl_gnu_thread \
|
||||
# -lpthread -lm -ldl -fopenmp
|
||||
# #-L$(ICC_LIB_PATH) \
|
||||
# #-lgomp
|
||||
|
||||
VENDOR_LIB := $(MKL_LIB)
|
||||
VENDORP_LIB := $(MKLP_LIB)
|
||||
|
||||
|
||||
#
|
||||
# --- Problem size definitions -------------------------------------------------
|
||||
#
|
||||
|
||||
# Single core (single-threaded)
|
||||
PS_BEGIN := 100
|
||||
PS_MAX := 1000
|
||||
PS_INC := 100
|
||||
|
||||
# Single-socket (multithreaded)
|
||||
P1_BEGIN := 120
|
||||
P1_MAX := 6000
|
||||
P1_INC := 120
|
||||
|
||||
# Dual-socket (multithreaded)
|
||||
P2_BEGIN := 160
|
||||
P2_MAX := 8000
|
||||
P2_INC := 160
|
||||
|
||||
|
||||
#
|
||||
# --- General build definitions ------------------------------------------------
|
||||
#
|
||||
|
||||
TEST_SRC_PATH := .
|
||||
TEST_OBJ_PATH := .
|
||||
|
||||
# Gather all local object files.
|
||||
TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
|
||||
$(TEST_OBJ_PATH)/%.o, \
|
||||
$(wildcard $(TEST_SRC_PATH)/*.c)))
|
||||
|
||||
# Override the value of CINCFLAGS so that the value of CFLAGS returned by
|
||||
# get-user-cflags-for() is not cluttered up with include paths needed only
|
||||
# while building BLIS.
|
||||
CINCFLAGS := -I$(INC_PATH)
|
||||
|
||||
# Use the "framework" CFLAGS for the configuration family.
|
||||
CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
|
||||
|
||||
# Add local header paths to CFLAGS.
|
||||
CFLAGS += -I$(TEST_SRC_PATH)
|
||||
|
||||
# Locate the libblis library to which we will link.
|
||||
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
|
||||
# Define a set of CFLAGS for use with C++ and Eigen.
|
||||
CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS))
|
||||
CXXFLAGS += -I$(EIGEN_INC)
|
||||
|
||||
# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
|
||||
CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS))
|
||||
CXXFLAGS_MT := -march=native $(CXXFLAGS)
|
||||
|
||||
|
||||
# Which library?
|
||||
BLI_DEF := -DBLIS
|
||||
BLA_DEF := -DBLAS
|
||||
EIG_DEF := -DEIGEN
|
||||
|
||||
# Complex implementation type
|
||||
D3MHW := -DIND=BLIS_3MH
|
||||
D3M1 := -DIND=BLIS_3M1
|
||||
D4MHW := -DIND=BLIS_4MH
|
||||
D4M1B := -DIND=BLIS_4M1B
|
||||
D4M1A := -DIND=BLIS_4M1A
|
||||
D1M := -DIND=BLIS_1M
|
||||
DNAT := -DIND=BLIS_NAT
|
||||
|
||||
# Implementation string
|
||||
#STR_3MHW := -DSTR=\"3mhw\"
|
||||
#STR_3M1 := -DSTR=\"3m1\"
|
||||
#STR_4MHW := -DSTR=\"4mhw\"
|
||||
#STR_4M1B := -DSTR=\"4m1b\"
|
||||
#STR_4M1A := -DSTR=\"4m1a\"
|
||||
#STR_1M := -DSTR=\"1m\"
|
||||
STR_NAT := -DSTR=\"asm_blis\"
|
||||
STR_OBL := -DSTR=\"openblas\"
|
||||
STR_EIG := -DSTR=\"eigen\"
|
||||
STR_VEN := -DSTR=\"vendor\"
|
||||
|
||||
# Single or multithreaded string
|
||||
STR_ST := -DTHR_STR=\"st\"
|
||||
STR_1S := -DTHR_STR=\"1s\"
|
||||
STR_2S := -DTHR_STR=\"2s\"
|
||||
|
||||
# Problem size specification
|
||||
PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX)
|
||||
PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
|
||||
PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Targets/rules ------------------------------------------------------------
|
||||
#
|
||||
|
||||
all: all-st all-1s all-2s
|
||||
blis: blis-st blis-1s blis-2s
|
||||
openblas: openblas-st openblas-1s openblas-2s
|
||||
eigen: eigen-st eigen-1s eigen-2s
|
||||
vendor: vendor-st vendor-1s vendor-2s
|
||||
mkl: vendor
|
||||
armpl: vendor
|
||||
|
||||
all-st: blis-st openblas-st mkl-st
|
||||
all-1s: blis-1s openblas-1s mkl-1s
|
||||
all-2s: blis-2s openblas-2s mkl-2s
|
||||
|
||||
blis-st: blis-nat-st
|
||||
blis-1s: blis-nat-1s
|
||||
blis-2s: blis-nat-2s
|
||||
|
||||
#blis-ind: blis-ind-st blis-ind-mt
|
||||
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
|
||||
|
||||
# Define the datatypes, operations, and implementations.
|
||||
DTS := d #s d c z
|
||||
OPS := gemm #hemm herk trmm trsm
|
||||
IMPLS := asm_blis openblas vendor
|
||||
|
||||
# Define functions to construct object filenames from the datatypes and
|
||||
# operations given an implementation. We define one function for single-
|
||||
# threaded, single-socket, and dual-socket filenames.
|
||||
get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
|
||||
get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
|
||||
get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
|
||||
|
||||
# Construct object and binary names for single-threaded, single-socket, and
|
||||
# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
|
||||
BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
|
||||
BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
|
||||
BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
|
||||
BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
|
||||
BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
|
||||
BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
|
||||
|
||||
OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
|
||||
OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
|
||||
OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
|
||||
OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
|
||||
OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
|
||||
OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
|
||||
|
||||
EIGEN_ST_OBJS := $(call get-st-objs,eigen)
|
||||
EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
|
||||
EIGEN_1S_OBJS := $(call get-1s-objs,eigen)
|
||||
EIGEN_1S_BINS := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
|
||||
EIGEN_2S_OBJS := $(call get-2s-objs,eigen)
|
||||
EIGEN_2S_BINS := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
|
||||
|
||||
VENDOR_ST_OBJS := $(call get-st-objs,vendor)
|
||||
VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
|
||||
VENDOR_1S_OBJS := $(call get-1s-objs,vendor)
|
||||
VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
|
||||
VENDOR_2S_OBJS := $(call get-2s-objs,vendor)
|
||||
VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
|
||||
|
||||
# Define some targets associated with the above object/binary files.
|
||||
blis-nat-st: $(BLIS_NAT_ST_BINS)
|
||||
blis-nat-1s: $(BLIS_NAT_1S_BINS)
|
||||
blis-nat-2s: $(BLIS_NAT_2S_BINS)
|
||||
|
||||
openblas-st: $(OPENBLAS_ST_BINS)
|
||||
openblas-1s: $(OPENBLAS_1S_BINS)
|
||||
openblas-2s: $(OPENBLAS_2S_BINS)
|
||||
|
||||
eigen-st: $(EIGEN_ST_BINS)
|
||||
eigen-1s: $(EIGEN_1S_BINS)
|
||||
eigen-2s: $(EIGEN_2S_BINS)
|
||||
|
||||
vendor-st: $(VENDOR_ST_BINS)
|
||||
vendor-1s: $(VENDOR_1S_BINS)
|
||||
vendor-2s: $(VENDOR_2S_BINS)
|
||||
|
||||
mkl-st: vendor-st
|
||||
mkl-1s: vendor-1s
|
||||
mkl-2s: vendor-2s
|
||||
|
||||
armpl-st: vendor-st
|
||||
armpl-1s: vendor-1s
|
||||
armpl-2s: vendor-2s
|
||||
|
||||
# Mark the object files as intermediate so that make will remove them
|
||||
# automatically after building the binaries on which they depend.
|
||||
.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
|
||||
.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
|
||||
.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_1S_OBJS) $(EIGEN_2S_OBJS)
|
||||
.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_1S_OBJS) $(VENDOR_2S_OBJS)
|
||||
|
||||
|
||||
# --Object file rules --
|
||||
|
||||
#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
|
||||
# $(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
# A function to return the datatype cpp macro def from the datatype
|
||||
# character.
|
||||
get-dt-cpp = $(strip \
|
||||
$(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\
|
||||
$(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\
|
||||
$(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
|
||||
-DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
|
||||
|
||||
# A function to return other cpp macros that help the test driver
|
||||
# identify the implementation.
|
||||
#get-bl-cpp = $(strip \
|
||||
# $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
|
||||
# $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
|
||||
# $(if $(findstring eigen,$(1)),$(STR_EIG) $(EIG_DEF),\
|
||||
# $(STR_VEN) $(BLA_DEF)))))
|
||||
|
||||
get-bl-cpp = $(strip \
|
||||
$(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
|
||||
$(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
|
||||
$(if $(and $(findstring eigen,$(1)),\
|
||||
$(findstring gemm,$(2))),\
|
||||
$(STR_EIG) $(EIG_DEF),\
|
||||
$(if $(findstring eigen,$(1)),\
|
||||
$(STR_EIG) $(BLA_DEF),\
|
||||
$(STR_VEN) $(BLA_DEF))))))
|
||||
|
||||
|
||||
# Rules for BLIS and BLAS libraries.
|
||||
define make-st-rule
|
||||
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-1s-rule
|
||||
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-2s-rule
|
||||
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
# Rules for Eigen.
|
||||
define make-eigst-rule
|
||||
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
|
||||
$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-eig1s-rule
|
||||
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
|
||||
$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-eig2s-rule
|
||||
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
|
||||
$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
|
||||
# -- Executable file rules --
|
||||
|
||||
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
|
||||
# on the link command line in case BLIS was configured with the BLAS
|
||||
# compatibility layer. This prevents BLIS from inadvertently getting called
|
||||
# for the BLAS routines we are trying to test with.
|
||||
|
||||
test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
test_%_$(PS_MAX)_eigen_st.x: test_%_$(PS_MAX)_eigen_st.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(EIGEN_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_eigen_1s.x: test_%_$(P1_MAX)_eigen_1s.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_eigen_2s.x: test_%_$(P2_MAX)_eigen_2s.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
# -- Clean rules --
|
||||
|
||||
clean: cleanx
|
||||
|
||||
cleanx:
|
||||
- $(RM_F) *.o *.x
|
||||
|
||||
@@ -65,16 +65,15 @@ elif [ ${sys} = "ul264" ]; then
|
||||
fi
|
||||
|
||||
# Datatypes to test.
|
||||
test_dts="d s z c"
|
||||
test_dts="d " #s z c"
|
||||
|
||||
# Operations to test.
|
||||
test_ops="gemm hemm herk trmm trsm"
|
||||
test_ops="gemm "#hemm herk trmm trsm"
|
||||
|
||||
# Implementations to test.
|
||||
#impls="blis"
|
||||
#impls="other"
|
||||
impls="eigen"
|
||||
#impls="all"
|
||||
#impls="other"
|
||||
impls="blis"
|
||||
|
||||
if [ "${impls}" = "blis" ]; then
|
||||
|
||||
|
||||
@@ -1,418 +1,418 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#ifdef EIGEN
|
||||
#define BLIS_DISABLE_BLAS_DEFS
|
||||
#include "blis.h"
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/src/misc/blas.h>
|
||||
using namespace Eigen;
|
||||
#else
|
||||
#include "blis.h"
|
||||
#endif
|
||||
|
||||
#define COL_STORAGE
|
||||
//#define ROW_STORAGE
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, n, k;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_max, p_inc;
|
||||
int m_input, n_input, k_input;
|
||||
ind_t ind;
|
||||
num_t dt;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
dt = DT;
|
||||
|
||||
ind = IND;
|
||||
|
||||
#if 1
|
||||
p_begin = P_BEGIN;
|
||||
p_max = P_MAX;
|
||||
p_inc = P_INC;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#else
|
||||
p_begin = 40;
|
||||
p_max = 1000;
|
||||
p_inc = 40;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#endif
|
||||
|
||||
|
||||
// Supress compiler warnings about unused variable 'ind'.
|
||||
( void )ind;
|
||||
|
||||
#if 0
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
ind_t ind_mod = ind;
|
||||
|
||||
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
|
||||
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
|
||||
|
||||
// Initialize a context for the current induced method and datatype.
|
||||
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
|
||||
|
||||
// Set k to the kc blocksize for the current datatype.
|
||||
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
|
||||
|
||||
#elif 1
|
||||
|
||||
//k_input = 256;
|
||||
|
||||
#endif
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
transb = BLIS_NO_TRANSPOSE;
|
||||
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
//for ( p = p_begin; p <= p_max; p += p_inc )
|
||||
for ( p = p_max; p_begin <= p; p -= p_inc )
|
||||
{
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
bli_obj_create( dt, k, n, 0, 0, &b );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
#else
|
||||
bli_obj_create( dt, m, k, k, 1, &a );
|
||||
bli_obj_create( dt, k, n, n, 1, &b );
|
||||
bli_obj_create( dt, m, n, n, 1, &c );
|
||||
bli_obj_create( dt, m, n, n, 1, &c_save );
|
||||
#endif
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
bli_setsc( (1.0/1.0), 0.0, &beta );
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#if 0 //def BLIS
|
||||
bli_ind_disable_all_dt( dt );
|
||||
bli_ind_enable_dt( ind, dt );
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN
|
||||
double alpha_r, alpha_i;
|
||||
|
||||
bli_getsc( &alpha, &alpha_r, &alpha_i );
|
||||
|
||||
void* ap = bli_obj_buffer_at_off( &a );
|
||||
void* bp = bli_obj_buffer_at_off( &b );
|
||||
void* cp = bli_obj_buffer_at_off( &c );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
const int os_a = bli_obj_col_stride( &a );
|
||||
const int os_b = bli_obj_col_stride( &b );
|
||||
const int os_c = bli_obj_col_stride( &c );
|
||||
#else
|
||||
const int os_a = bli_obj_row_stride( &a );
|
||||
const int os_b = bli_obj_row_stride( &b );
|
||||
const int os_c = bli_obj_row_stride( &c );
|
||||
#endif
|
||||
|
||||
Stride<Dynamic,1> stride_a( os_a, 1 );
|
||||
Stride<Dynamic,1> stride_b( os_b, 1 );
|
||||
Stride<Dynamic,1> stride_c( os_c, 1 );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
|
||||
#endif
|
||||
#else
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
|
||||
#endif
|
||||
#endif
|
||||
#if defined(IS_FLOAT)
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DOUBLE)
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "b", &b, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#if defined(BLIS)
|
||||
|
||||
bli_gemm( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#elif defined(EIGEN)
|
||||
|
||||
C.noalias() += alpha_r * A * B;
|
||||
|
||||
#else // if defined(BLAS)
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = ( float* )bli_obj_buffer( &alpha );
|
||||
float* ap = ( float* )bli_obj_buffer( &a );
|
||||
float* bp = ( float* )bli_obj_buffer( &b );
|
||||
float* betap = ( float* )bli_obj_buffer( &beta );
|
||||
float* cp = ( float* )bli_obj_buffer( &c );
|
||||
|
||||
sgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = ( double* )bli_obj_buffer( &alpha );
|
||||
double* ap = ( double* )bli_obj_buffer( &a );
|
||||
double* bp = ( double* )bli_obj_buffer( &b );
|
||||
double* betap = ( double* )bli_obj_buffer( &beta );
|
||||
double* cp = ( double* )bli_obj_buffer( &c );
|
||||
|
||||
dgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
|
||||
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
|
||||
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
|
||||
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
|
||||
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
|
||||
|
||||
cgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
|
||||
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
|
||||
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
|
||||
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
|
||||
|
||||
zgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )k,
|
||||
( unsigned long )n, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#ifdef EIGEN
|
||||
#define BLIS_DISABLE_BLAS_DEFS
|
||||
#include "blis.h"
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/src/misc/blas.h>
|
||||
using namespace Eigen;
|
||||
#else
|
||||
#include "blis.h"
|
||||
#endif
|
||||
|
||||
#define COL_STORAGE
|
||||
//#define ROW_STORAGE
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, n, k;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_max, p_inc;
|
||||
int m_input, n_input, k_input;
|
||||
ind_t ind;
|
||||
num_t dt;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
dt = DT;
|
||||
|
||||
ind = IND;
|
||||
|
||||
#if 1
|
||||
p_begin = P_BEGIN;
|
||||
p_max = P_MAX;
|
||||
p_inc = P_INC;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#else
|
||||
p_begin = 40;
|
||||
p_max = 1000;
|
||||
p_inc = 40;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#endif
|
||||
|
||||
|
||||
// Supress compiler warnings about unused variable 'ind'.
|
||||
( void )ind;
|
||||
|
||||
#if 0
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
ind_t ind_mod = ind;
|
||||
|
||||
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
|
||||
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
|
||||
|
||||
// Initialize a context for the current induced method and datatype.
|
||||
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
|
||||
|
||||
// Set k to the kc blocksize for the current datatype.
|
||||
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
|
||||
|
||||
#elif 1
|
||||
|
||||
//k_input = 256;
|
||||
|
||||
#endif
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
transb = BLIS_NO_TRANSPOSE;
|
||||
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
//for ( p = p_begin; p <= p_max; p += p_inc )
|
||||
for ( p = p_max; p_begin <= p; p -= p_inc )
|
||||
{
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
bli_obj_create( dt, k, n, 0, 0, &b );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
#else
|
||||
bli_obj_create( dt, m, k, k, 1, &a );
|
||||
bli_obj_create( dt, k, n, n, 1, &b );
|
||||
bli_obj_create( dt, m, n, n, 1, &c );
|
||||
bli_obj_create( dt, m, n, n, 1, &c_save );
|
||||
#endif
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
bli_setsc( (1.0/1.0), 0.0, &beta );
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#if 0 //def BLIS
|
||||
bli_ind_disable_all_dt( dt );
|
||||
bli_ind_enable_dt( ind, dt );
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN
|
||||
double alpha_r, alpha_i;
|
||||
|
||||
bli_getsc( &alpha, &alpha_r, &alpha_i );
|
||||
|
||||
void* ap = bli_obj_buffer_at_off( &a );
|
||||
void* bp = bli_obj_buffer_at_off( &b );
|
||||
void* cp = bli_obj_buffer_at_off( &c );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
const int os_a = bli_obj_col_stride( &a );
|
||||
const int os_b = bli_obj_col_stride( &b );
|
||||
const int os_c = bli_obj_col_stride( &c );
|
||||
#else
|
||||
const int os_a = bli_obj_row_stride( &a );
|
||||
const int os_b = bli_obj_row_stride( &b );
|
||||
const int os_c = bli_obj_row_stride( &c );
|
||||
#endif
|
||||
|
||||
Stride<Dynamic,1> stride_a( os_a, 1 );
|
||||
Stride<Dynamic,1> stride_b( os_b, 1 );
|
||||
Stride<Dynamic,1> stride_c( os_c, 1 );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
|
||||
#endif
|
||||
#else
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
|
||||
#endif
|
||||
#endif
|
||||
#if defined(IS_FLOAT)
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DOUBLE)
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "b", &b, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#if defined(BLIS)
|
||||
|
||||
bli_gemm( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#elif defined(EIGEN)
|
||||
|
||||
C.noalias() += alpha_r * A * B;
|
||||
|
||||
#else // if defined(BLAS)
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = ( float* )bli_obj_buffer( &alpha );
|
||||
float* ap = ( float* )bli_obj_buffer( &a );
|
||||
float* bp = ( float* )bli_obj_buffer( &b );
|
||||
float* betap = ( float* )bli_obj_buffer( &beta );
|
||||
float* cp = ( float* )bli_obj_buffer( &c );
|
||||
|
||||
sgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = ( double* )bli_obj_buffer( &alpha );
|
||||
double* ap = ( double* )bli_obj_buffer( &a );
|
||||
double* bp = ( double* )bli_obj_buffer( &b );
|
||||
double* betap = ( double* )bli_obj_buffer( &beta );
|
||||
double* cp = ( double* )bli_obj_buffer( &c );
|
||||
|
||||
dgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
|
||||
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
|
||||
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
|
||||
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
|
||||
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
|
||||
|
||||
cgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
|
||||
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
|
||||
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
|
||||
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
|
||||
|
||||
zgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )k,
|
||||
( unsigned long )n, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ endif
|
||||
BLAS_LIB_PATH := $(HOME)/flame/lib
|
||||
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
|
||||
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
|
||||
MKL_LIB_PATH := ${MKLROOT}/lib/intel64
|
||||
#MKL_LIB_PATH := ${MKLROOT}/lib/intel64
|
||||
#ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
|
||||
|
||||
# OpenBLAS
|
||||
@@ -115,11 +115,11 @@ MKL_LIB := -L$(MKL_LIB_PATH) \
|
||||
|
||||
# ESSL
|
||||
# Note: ESSL is named differently for SMP and/or BG
|
||||
ESSL_TYPE := # This is the 32b library on POWER
|
||||
#ESSL_TYPE := # This is the 32b library on POWER
|
||||
#ESSL_TYPE := 6464 # This is the 64b library on POWER
|
||||
#ESSL_TYPE := bg # This is the 32b single-threaded library on Blue Gene
|
||||
#ESSL_TYPE := smpbg # This is the 32b multi-threaded library on Blue Gene
|
||||
ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a
|
||||
#ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a
|
||||
|
||||
# Accelerate
|
||||
MAC_LIB := -framework Accelerate
|
||||
@@ -165,22 +165,21 @@ CFLAGS += -I$(TEST_SRC_PATH)
|
||||
#all: blis openblas atlas mkl
|
||||
all: blis openblas mkl
|
||||
|
||||
blis: test_dotv_blis.x \
|
||||
test_axpyv_blis.x \
|
||||
test_gemv_blis.x \
|
||||
test_ger_blis.x \
|
||||
test_hemv_blis.x \
|
||||
test_her_blis.x \
|
||||
test_her2_blis.x \
|
||||
test_trmv_blis.x \
|
||||
test_trsv_blis.x \
|
||||
\
|
||||
test_gemm_blis.x \
|
||||
test_hemm_blis.x \
|
||||
test_herk_blis.x \
|
||||
test_her2k_blis.x \
|
||||
test_trmm_blis.x \
|
||||
test_trsm_blis.x
|
||||
blis: test_gemm_blis.x \
|
||||
# test_dotv_blis.x \
|
||||
# test_axpyv_blis.x \
|
||||
# test_gemv_blis.x \
|
||||
# test_ger_blis.x \
|
||||
# test_hemv_blis.x \
|
||||
# test_her_blis.x \
|
||||
# test_her2_blis.x \
|
||||
# test_trmv_blis.x \
|
||||
# test_trsv_blis.x \
|
||||
# test_hemm_blis.x \
|
||||
# test_herk_blis.x \
|
||||
# test_her2k_blis.x \
|
||||
# test_trmm_blis.x \
|
||||
# test_trsm_blis.x
|
||||
|
||||
openblas: \
|
||||
test_dotv_openblas.x \
|
||||
|
||||
0
test/output_gemm_blis.m
Normal file
0
test/output_gemm_blis.m
Normal file
@@ -5,12 +5,14 @@ out_root="output"
|
||||
#out_root="output_square"
|
||||
|
||||
# Operations to test.
|
||||
l2_ops="gemv ger hemv her her2 trmv trsv"
|
||||
l3_ops="gemm hemm herk her2k trmm trsm"
|
||||
test_ops="${l2_ops} ${l3_ops}"
|
||||
# l2_ops="gemv ger hemv her her2 trmv trsv"
|
||||
l3_ops="gemm"
|
||||
# "hemm herk her2k trmm trsm"
|
||||
test_ops=" ${l3_ops}"
|
||||
# "${l2_ops}"
|
||||
|
||||
# Implementations to test
|
||||
test_impls="openblas atlas mkl blis"
|
||||
# Implementations to test | "openblas atlas mkl"
|
||||
test_impls="blis"
|
||||
|
||||
for im in ${test_impls}; do
|
||||
|
||||
@@ -22,7 +24,7 @@ for im in ${test_impls}; do
|
||||
# Construct the name of the output file.
|
||||
out_file="${out_root}_${op}_${im}.m"
|
||||
|
||||
echo "Running ${exec_name} > ${out_file}"
|
||||
echo " Running ${exec_name} > ${out_file} "
|
||||
|
||||
# Run executable.
|
||||
./${exec_name} > ${out_file}
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
# accepted values.
|
||||
#
|
||||
|
||||
1 # Number of repeats per experiment (best result is reported)
|
||||
rc # Matrix storage scheme(s) to test:
|
||||
3 # Number of repeats per experiment (best result is reported)
|
||||
c # Matrix storage scheme(s) to test:
|
||||
# 'c' = col-major storage; 'g' = general stride storage;
|
||||
# 'r' = row-major storage
|
||||
cj # Vector storage scheme(s) to test:
|
||||
@@ -22,14 +22,14 @@ cj # Vector storage scheme(s) to test:
|
||||
# '0' = real values on [-1,1];
|
||||
# '1' = powers of 2 in narrow precision range
|
||||
32 # General stride spacing (for cases when testing general stride)
|
||||
sdcz # Datatype(s) to test:
|
||||
d # Datatype(s) to test:
|
||||
# 's' = single real; 'c' = single complex;
|
||||
# 'd' = double real; 'z' = double complex
|
||||
0 # Test gemm with mixed-domain operands?
|
||||
0 # Test gemm with mixed-precision operands?
|
||||
100 # Problem size: first to test
|
||||
500 # Problem size: maximum to test
|
||||
100 # Problem size: increment between experiments
|
||||
2000 # Problem size: first to test
|
||||
2000 # Problem size: maximum to test
|
||||
200 # Problem size: increment between experiments
|
||||
# Complex level-3 implementations to test:
|
||||
0 # 3mh ('1' = enable; '0' = disable)
|
||||
0 # 3m1 ('1' = enable; '0' = disable)
|
||||
@@ -45,5 +45,5 @@ sdcz # Datatype(s) to test:
|
||||
# '0' = disable error checking; '1' = full error checking
|
||||
i # Reaction to test failure:
|
||||
# 'i' = ignore; 's' = sleep() and continue; 'a' = abort
|
||||
0 # Output results in matlab/octave format? ('1' = yes; '0' = no)
|
||||
1 # Output results in matlab/octave format? ('1' = yes; '0' = no)
|
||||
0 # Output results to stdout AND files? ('1' = yes; '0' = no)
|
||||
|
||||
@@ -276,9 +276,9 @@
|
||||
|
||||
# --- Level-3 --------------------------------------------------------------
|
||||
|
||||
1 # gemm
|
||||
-1 -1 -1 # dimensions: m n k
|
||||
?? # parameters: transa transb
|
||||
2 # gemm
|
||||
-1 -1 -1 # dimensions: m n k
|
||||
nn # parameters: transa transb
|
||||
|
||||
1 # hemm
|
||||
-1 -1 # dimensions: m n
|
||||
|
||||
106
testsuite/jobscripts/cfig.out
Normal file
106
testsuite/jobscripts/cfig.out
Normal file
@@ -0,0 +1,106 @@
|
||||
configure: detected Linux kernel version 4.14.0-115.6.1.el7a.ppc64le.
|
||||
configure: python interpeter search list is: python python3 python2.
|
||||
configure: using 'python' python interpreter.
|
||||
configure: found python version 2.7.5 (maj: 2, min: 7, rev: 5).
|
||||
configure: python 2.7.5 appears to be supported.
|
||||
configure: C compiler search list is: gcc clang cc.
|
||||
configure: using 'gcc' C compiler.
|
||||
configure: C++ compiler search list is: g++ clang++ c++.
|
||||
configure: using 'g++' C++ compiler (for sandbox only).
|
||||
configure: found gcc version 8.2.0 (maj: 8, min: 2, rev: 0).
|
||||
configure: checking for blacklisted configurations due to gcc 8.2.0.
|
||||
configure: found assembler ('as') version 2.27 (maj: 2, min: 27, rev: ).
|
||||
configure: checking for blacklisted configurations due to as 2.27.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'bulldozer'; adding to blacklist.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'sandybridge'; adding to blacklist.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'haswell'; adding to blacklist.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'piledriver'; adding to blacklist.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'steamroller'; adding to blacklist.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'excavator'; adding to blacklist.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'skx'; adding to blacklist.
|
||||
configure: warning: assembler ('as' 2.27) does not support 'knl'; adding to blacklist.
|
||||
configure: configuration blacklist:
|
||||
configure: bulldozer sandybridge haswell piledriver steamroller excavator skx knl
|
||||
configure: reading configuration registry...done.
|
||||
configure: determining default version string.
|
||||
configure: found '.git' directory; assuming git clone.
|
||||
configure: executing: git describe --tags.
|
||||
configure: git returned an error: 'Unknown option: -C
|
||||
usage: git [--version] [--help] [-c name=value]
|
||||
[--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
|
||||
[-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
|
||||
[--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
|
||||
<command> [<args>]'.
|
||||
configure: using string from unmodified version file.
|
||||
configure: starting configuration of BLIS 0.6.0.
|
||||
configure: configuring with official version string.
|
||||
configure: found shared library .so version '2.0.0'.
|
||||
configure: .so major version: 2
|
||||
configure: .so minor.build version: 0.0
|
||||
configure: manual configuration requested; configuring with 'power9'.
|
||||
configure: checking configuration against contents of 'config_registry'.
|
||||
configure: configuration 'power9' is registered.
|
||||
configure: 'power9' is defined as having the following sub-configurations:
|
||||
configure: power9
|
||||
configure: which collectively require the following kernels:
|
||||
configure: power9
|
||||
configure: checking sub-configurations:
|
||||
configure: 'power9' is registered...and exists.
|
||||
configure: checking sub-configurations' requisite kernels:
|
||||
configure: 'power9' kernels...exist.
|
||||
configure: no install prefix option given; defaulting to '/usr/local'.
|
||||
configure: no install exec_prefix option given; defaulting to PREFIX.
|
||||
configure: no install libdir option given; defaulting to EXECPREFIX/lib.
|
||||
configure: no install includedir option given; defaulting to PREFIX/include.
|
||||
configure: no install sharedir option given; defaulting to PREFIX/share.
|
||||
configure: final installation directories:
|
||||
configure: prefix: /usr/local
|
||||
configure: exec_prefix: ${prefix}
|
||||
configure: libdir: ${exec_prefix}/lib
|
||||
configure: includedir: ${prefix}/include
|
||||
configure: sharedir: ${prefix}/share
|
||||
configure: NOTE: the variables above can be overridden when running make.
|
||||
configure: no preset CFLAGS detected.
|
||||
configure: no preset LDFLAGS detected.
|
||||
configure: debug symbols disabled.
|
||||
configure: disabling verbose make output. (enable with 'make V=1'.)
|
||||
configure: disabling ARG_MAX hack.
|
||||
configure: building BLIS as both static and shared libraries.
|
||||
configure: exporting only public symbols within shared library.
|
||||
configure: threading is disabled.
|
||||
configure: requesting slab threading in jr and ir loops.
|
||||
configure: internal memory pools for packing blocks are enabled.
|
||||
configure: internal memory pools for small blocks are enabled.
|
||||
configure: memory tracing output is disabled.
|
||||
configure: libmemkind not found; disabling.
|
||||
configure: compiler appears to not support #pragma omp simd.
|
||||
configure: the BLAS compatibility layer is enabled.
|
||||
configure: the CBLAS compatibility layer is disabled.
|
||||
configure: mixed datatype support is enabled.
|
||||
configure: mixed datatype optimizations requiring extra memory are enabled.
|
||||
configure: small matrix handling is enabled.
|
||||
configure: the BLIS API integer size is automatically determined.
|
||||
configure: the BLAS/CBLAS API integer size is 32-bit.
|
||||
configure: configuring for conventional gemm implementation.
|
||||
configure: creating ./config.mk from ./build/config.mk.in
|
||||
configure: creating ./bli_config.h from ./build/bli_config.h.in
|
||||
configure: creating ./obj/power9
|
||||
configure: creating ./obj/power9/config/power9
|
||||
configure: creating ./obj/power9/kernels/power9
|
||||
configure: creating ./obj/power9/ref_kernels/power9
|
||||
configure: creating ./obj/power9/frame
|
||||
configure: creating ./obj/power9/blastest
|
||||
configure: creating ./obj/power9/testsuite
|
||||
configure: creating ./lib/power9
|
||||
configure: creating ./include/power9
|
||||
configure: mirroring ./config/power9 to ./obj/power9/config/power9
|
||||
configure: mirroring ./kernels/power9 to ./obj/power9/kernels/power9
|
||||
configure: mirroring ./ref_kernels to ./obj/power9/ref_kernels
|
||||
configure: mirroring ./ref_kernels to ./obj/power9/ref_kernels/power9
|
||||
configure: mirroring ./frame to ./obj/power9/frame
|
||||
configure: creating makefile fragments in ./obj/power9/config/power9
|
||||
configure: creating makefile fragments in ./obj/power9/kernels/power9
|
||||
configure: creating makefile fragments in ./obj/power9/ref_kernels
|
||||
configure: creating makefile fragments in ./obj/power9/frame
|
||||
configure: configured to build within top-level directory of source distribution.
|
||||
CONFIGURE DONE
|
||||
5
testsuite/jobscripts/cfig.sh
Executable file
5
testsuite/jobscripts/cfig.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
cd ~/blis
|
||||
./configure power9
|
||||
echo "CONFIGURE DONE"
|
||||
26
testsuite/jobscripts/jb-cfig.sh
Normal file
26
testsuite/jobscripts/jb-cfig.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
|
||||
# execute in the general partition
|
||||
#SBATCH --partition=general
|
||||
|
||||
# execute with 40 processes/tasks
|
||||
#SBATCH --ntasks=1
|
||||
|
||||
# maximum time is 30 minutes
|
||||
#SBATCH --time=00:30:00
|
||||
|
||||
# job name is my_job
|
||||
#SBATCH --job-name=blis
|
||||
|
||||
# send email for status updates
|
||||
#SBATCH --mail-type=ALL,TIME_LIMIT
|
||||
#SBATCH --mail-user=ntukanov
|
||||
|
||||
# change default output file name
|
||||
#SBATCH --output=cfig.out
|
||||
|
||||
# load environment
|
||||
module load gcc/8.2
|
||||
|
||||
# application execution
|
||||
srun cfig.sh
|
||||
26
testsuite/jobscripts/jb-mk.sh
Normal file
26
testsuite/jobscripts/jb-mk.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
|
||||
# execute in the general partition
|
||||
#SBATCH --partition=general
|
||||
|
||||
# execute with 40 processes/tasks
|
||||
#SBATCH --ntasks=1
|
||||
|
||||
# maximum time is 30 minutes
|
||||
#SBATCH --time=00:30:00
|
||||
|
||||
# job name is my_job
|
||||
#SBATCH --job-name=blis
|
||||
|
||||
# send email for status updates
|
||||
#SBATCH --mail-type=ALL,TIME_LIMIT
|
||||
#SBATCH --mail-user=ntukanov
|
||||
|
||||
# change default output file name
|
||||
#SBATCH --output=mk.out
|
||||
|
||||
# load environment
|
||||
module load gcc/8.2
|
||||
|
||||
# application execution
|
||||
srun mk.sh
|
||||
26
testsuite/jobscripts/jb-runtest.sh
Normal file
26
testsuite/jobscripts/jb-runtest.sh
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/bin/bash
|
||||
|
||||
# execute in the general partition
|
||||
#SBATCH --partition=general
|
||||
|
||||
# execute with 40 processes/tasks
|
||||
#SBATCH --ntasks=1
|
||||
|
||||
# maximum time is 30 minutes
|
||||
#SBATCH --time=00:30:00
|
||||
|
||||
# job name is my_job
|
||||
#SBATCH --job-name=blis
|
||||
|
||||
# send email for status updates
|
||||
#SBATCH --mail-type=ALL,TIME_LIMIT
|
||||
#SBATCH --mail-user=ntukanov
|
||||
|
||||
# change default output file name
|
||||
#SBATCH --output=runtest.out
|
||||
|
||||
# load environment
|
||||
module load gcc/8.2
|
||||
|
||||
# application execution
|
||||
srun runtest.sh
|
||||
9
testsuite/jobscripts/mk.out
Normal file
9
testsuite/jobscripts/mk.out
Normal file
@@ -0,0 +1,9 @@
|
||||
Removing flattened header files from include/power9
|
||||
Removing object files from ./obj/power9
|
||||
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
|
||||
srun: got SIGCONT
|
||||
slurmstepd: error: *** JOB 1155 ON lookout00 CANCELLED AT 2019-06-10T17:29:07 ***
|
||||
srun: forcing job termination
|
||||
slurmstepd: error: *** STEP 1155.0 ON lookout00 CANCELLED AT 2019-06-10T17:29:07 ***
|
||||
make: *** [cleanlib] Terminated
|
||||
srun: error: lookout00: task 0: Terminated
|
||||
6
testsuite/jobscripts/mk.sh
Executable file
6
testsuite/jobscripts/mk.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
cd ~/blis
|
||||
make clean
|
||||
make
|
||||
echo "MAKE DONE"
|
||||
8
testsuite/jobscripts/runtest.sh
Executable file
8
testsuite/jobscripts/runtest.sh
Executable file
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
cd ~/blis/testsuite
|
||||
rm -rf test_libblis.out
|
||||
make clean
|
||||
make -j
|
||||
./test_libblis.x > test_libblis.out
|
||||
echo "TEST DONE"
|
||||
@@ -272,11 +272,23 @@ void libblis_test_gemm_experiment
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
#if 0
|
||||
bli_printm( "alpha", &alpha, "%5.2f", "" );
|
||||
bli_printm( "beta", &beta, "%5.2f", "" );
|
||||
bli_printm( "a = [", &a, "%7.6f", "];" );
|
||||
bli_printm( "b = [", &b, "%7.6f", "];" );
|
||||
bli_printm( "c = [", &c, "%7.6f", "];" );
|
||||
#endif
|
||||
|
||||
time = bli_clock();
|
||||
|
||||
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
#if 0
|
||||
bli_printm( "c_after = [", &c, "%7.6f", "];" );
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// Estimate the performance of the best experiment repeat.
|
||||
@@ -405,7 +417,6 @@ void libblis_test_gemm_md
|
||||
|
||||
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
}
|
||||
|
||||
// Estimate the performance of the best experiment repeat.
|
||||
@@ -442,20 +453,18 @@ void libblis_test_gemm_impl
|
||||
{
|
||||
case BLIS_TEST_SEQ_FRONT_END:
|
||||
#if 0
|
||||
//bli_printm( "alpha", alpha, "%5.2f", "" );
|
||||
//bli_printm( "beta", beta, "%5.2f", "" );
|
||||
bli_printm( "a", a, "%5.2f", "" );
|
||||
bli_printm( "b", b, "%5.2f", "" );
|
||||
bli_printm( "c", c, "%5.2f", "" );
|
||||
bli_printm( "alpha", alpha, "%5.2f", "" );
|
||||
bli_printm( "beta", beta, "%5.2f", "" );
|
||||
bli_printm( "a", a, "%6.3f", "" );
|
||||
bli_printm( "b", b, "%6.3f", "" );
|
||||
bli_printm( "c", c, "%6.3f", "" );
|
||||
#endif
|
||||
//if ( bli_obj_length( b ) == 16 &&
|
||||
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
|
||||
//bli_printm( "c before", c, "%6.3f", "" );
|
||||
bli_gemm( alpha, a, b, beta, c );
|
||||
#if 0
|
||||
if ( bli_obj_length( c ) == 12 &&
|
||||
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
|
||||
bli_printm( "c after", c, "%6.3f", "" );
|
||||
bli_printm( "c after", c, "%6.3f", "");
|
||||
#endif
|
||||
break;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user