POWER9 DGEMM (#355)

Implemented and registered power9 dgemm ukernel.

Details:
- Implemented 12x6 dgemm microkernel for power9. This microkernel 
  assumes that elements of B have been duplicated/broadcast during the
  packing step. The microkernel uses a column orientation for its 
  microtile vector registers and thus implements column storage and 
  general stride IO cases. (A row storage IO case via in-register
  transposition may be added at a future date.) It should be noted that 
  we recommend using this microkernel with gcc and *not* xlc, as issues 
  with the latter cropped up during development, including but not 
  limited to slightly incompatible vector register mnemonics in the GNU 
  extended inline assembly clobber list.
This commit is contained in:
Nicholai Tukanov
2019-11-01 17:57:03 -05:00
committed by Field G. Van Zee
parent 58102aeaa2
commit b426f9e04e
31 changed files with 3049 additions and 561 deletions

View File

@@ -34,44 +34,55 @@
#include "blis.h"
// Instantiate prototypes for packm kernels.
PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref )
// Instantiate prototypes for level-3 kernels.
//GEMM_UKR_PROT( double, d, gemmbb_power9_ref )
void bli_cntx_init_power9( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
// Set default kernel blocksizes and functions.
bli_cntx_init_power9_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
// bli_cntx_set_l3_nat_ukrs
// (
// 1,
// BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE,
// cntx
// );
/*
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 4, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 64, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 256, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 4096, 0, 0 );
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
1,
//BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
cntx
);
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
cntx
);
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
-1, 12, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
*/
(
BLIS_NAT, 5,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
}

View File

@@ -32,34 +32,10 @@
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096
#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096
//#define BLIS_SIMD_NUM_REGISTERS 32
//#define BLIS_SIMD_SIZE 64
//
//#ifdef BLIS_NO_HBWMALLOC
// #include <stdlib.h>
// #define BLIS_MALLOC_POOL malloc
// #define BLIS_FREE_POOL free
//#else
// #include <hbwmalloc.h>
// #define BLIS_MALLOC_POOL hbw_malloc
// #define BLIS_FREE_POOL hbw_free
//#endif
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 192
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 152
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MC_D 64
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4096
#endif
//#endif

View File

@@ -1,3 +1,4 @@
#
#
# BLIS
@@ -45,8 +46,8 @@ THIS_CONFIG := power9
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS :=
CMISCFLAGS := -mcpu=power9
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
@@ -57,28 +58,25 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -funroll-loops
COPTFLAGS := -O3
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS :=
CKVECFLAGS := -mcpu=power9 -mtune=power9 -DXLC=0
else
$(error gcc is required for this configuration.)
ifeq ($(CC_VENDOR),IBM)
CKVECFLAGS := -qarch=pwr9 -qtune=pwr9 -DXLC=1
else
$(info $(CC_VENDOR))
$(error gcc/xlc is required for this configuration.)
endif
endif
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -39,7 +39,7 @@ cortexa15: cortexa15/armv7a
cortexa9: cortexa9/armv7a
# IBM architectures.
power9: power9/generic
power9: power9
bgq: bgq
# Generic architectures.

View File

@@ -66,6 +66,7 @@ void bli_gemm_front
#endif
#endif
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
@@ -82,6 +83,7 @@ void bli_gemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
#ifdef BLIS_ENABLE_GEMM_MD
cntx_t cntx_local;
@@ -148,6 +150,7 @@ void bli_gemm_front
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
@@ -275,6 +278,7 @@ void bli_gemm_front
cntl
);
#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
// If we created a temporary matrix conformal to C for whatever reason,

View File

@@ -167,7 +167,7 @@ void bli_gemm_ker_var2
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_exec];
f = ftypes[dt_exec];
// Invoke the function.
f( schema_a,

View File

@@ -141,15 +141,15 @@ void bli_arch_set_id( void )
#endif
// IBM microarchitectures.
#ifdef BLIS_FAMILY_POWER9
id = BLIS_ARCH_POWER9;
#endif
#ifdef BLIS_FAMILY_POWER7
id = BLIS_ARCH_POWER7;
#endif
#ifdef BLIS_FAMILY_BGQ
id = BLIS_ARCH_BGQ;
#endif
#ifdef BLIS_FAMILY_POWER9
id = BLIS_ARCH_POWER9;
#endif
// Generic microarchitecture.
#ifdef BLIS_FAMILY_GENERIC
@@ -188,9 +188,9 @@ static char* config_name[ BLIS_NUM_ARCHS ] =
"cortexa15",
"cortexa9",
"power9",
"power7",
"bgq",
"power9",
"generic"
};

View File

@@ -1,13 +1,10 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
@@ -19,7 +16,6 @@
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -31,7 +27,6 @@
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CNTX_H
@@ -45,28 +40,22 @@ typedef struct cntx_s
{
blksz_t* blkszs;
bszid_t* bmults;
func_t* l3_vir_ukrs;
func_t* l3_nat_ukrs;
mbool_t* l3_nat_ukrs_prefs;
blksz_t* l3_sup_thresh;
void** l3_sup_handlers;
blksz_t* l3_sup_blkszs;
func_t* l3_sup_kers;
mbool_t* l3_sup_kers_prefs;
func_t* l1f_kers;
func_t* l1v_kers;
func_t* packm_kers;
func_t* unpackm_kers;
ind_t method;
pack_t schema_a;
pack_t schema_b;
pack_t schema_c;
} cntx_t;
*/

View File

@@ -268,6 +268,9 @@ CNTX_INIT_PROTS( generic )
// -- IBM BG/Q --
#ifdef BLIS_KERNELS_POWER9
#include "bli_kernels_power9.h"
#endif
#ifdef BLIS_KERNELS_POWER7
#include "bli_kernels_power7.h"
#endif

View File

@@ -56,7 +56,7 @@ GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 )
// gemm (asm d8x6)
// gemm (asm d8x6)
//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 )
//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 )
//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 )

View File

@@ -0,0 +1,201 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_pwr9_asm_macros_12x6.h"
/*
   bli_dgemm_power9_asm_12x6

   Double-precision gemm microkernel for POWER9, written as one GNU
   extended inline-assembly statement. The instruction sequences behind
   DZERO_OUT_VREG, DPRELOAD, DPREFETCH, A_B_PRODUCT_16, A_B_PRODUCT_1,
   DSCALE_ALPHA, DGEN_*, and DCOL_* come from bli_pwr9_asm_macros_12x6.h.

   Parameters:
     k0            number of k iterations; split below into k0 / 16 passes
                   of the main loop and k0 % 16 passes of the remainder loop.
     alpha, beta   pointers to the scalars; both are splatted into vector
                   registers (vs62 and vs63 respectively).
     a, b          pointers to the A and B operands (loaded into r7 / r8).
     c             pointer to the output tile (r16); r17-r21 hold the
                   addresses of columns 1-5, each cs_c * 8 bytes apart.
     rs_c0, cs_c0  row and column strides of C, in elements.
     data, cntx    not passed to the assembly as operands.

   Control flow after accumulation:
     - beta is tested by comparing its 64-bit bit pattern (r26) against
       integer zero, so only +0.0 takes the "beta == 0" path.
     - rs_c * 8 == 8 (unit row stride) selects the column-stored path;
       anything else takes the general-stride path.

   NOTE(review): the strides are scaled with slwi, which is a 32-bit word
   shift; confirm whether sldi was intended for 64-bit strides.
*/
void bli_dgemm_power9_asm_12x6
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 16;
	uint64_t k_left = k0 % 16;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	__asm__ volatile
	(
	" \n\t"
	"ld %%r7, %2 \n\t" // load ptr of A
	"ld %%r8, %3 \n\t" // load ptr of B
	"ld %%r16, %6 \n\t" // load ptr of C
	" \n\t"
	"ld %%r28, %4 \n\t" // load ptr for alpha
	"ld %%r29, %5 \n\t" // load ptr for beta
	" \n\t"
	"ld %%r11, %0 \n\t" // load k_iter
	"ld %%r12, %1 \n\t" // load k_left
	" \n\t"
	"ld %%r10, %8 \n\t" // load cs_c
	"slwi %%r10, %%r10, 3 \n\t" // mul by size of elem
	" \n\t"
	"ld %%r9, %7 \n\t" // load rs_c
	"slwi %%r9, %%r9, 3 \n\t" // mul by size of elem
	" \n\t"
	"ld %%r26, 0(%%r29) \n\t" // load val of beta
	" \n\t"
	"lxvdsx %%vs62, 0, %%r28 \n\t" // splat alpha
	"lxvdsx %%vs63, 0, %%r29 \n\t" // splat beta
	" \n\t"
	"add %%r17, %%r16, %%r10 \n\t" // addr of col 1 of C
	"add %%r18, %%r17, %%r10 \n\t" // col 2 of C
	"add %%r19, %%r18, %%r10 \n\t" // col 3 of C
	"add %%r20, %%r19, %%r10 \n\t" // col 4 of C
	"add %%r21, %%r20, %%r10 \n\t" // col 5 of C
	" \n\t"
	DZERO_OUT_VREG
	" \n\t"
	DPRELOAD
	" \n\t"
	"addi %%r8, %%r8, 96 \n\t" // move to next col/row of A/B
	"addi %%r7, %%r7, 96 \n\t"
	" \n\t"
	DPREFETCH
	" \n\t"
	"cmpwi %%r0, %%r11, 0 \n\t" // if k_iter == 0,
	"beq %%r0, DCONSIDERKLEFT \n\t" // then jmp to k_left
	"mtctr %%r11 \n\t" // else, do k_iter loop
	" \n\t"
	"DLOOPKITER: \n\t" // k_iter loop
	" \n\t"
	A_B_PRODUCT_16 // compute A*B
	" \n\t"
	"bdnz DLOOPKITER \n\t"
	" \n\t"
	"DCONSIDERKLEFT: \n\t"
	" \n\t"
	"cmpwi %%r0, %%r12, 0 \n\t" // if k_left == 0,
	"beq %%r0, DPOSTACCUM \n\t" // then jmp to post accum
	"mtctr %%r12 \n\t" // else, do k_left loop
	" \n\t"
	"DLOOPKLEFT: \n\t" // k_left loop
	" \n\t"
	A_B_PRODUCT_1
	" \n\t"
	"bdnz DLOOPKLEFT \n\t"
	" \n\t"
	"DPOSTACCUM: \n\t"
	" \n\t"
	DSCALE_ALPHA
	" \n\t"
	"cmpdi %%r0, %%r26, 0 \n\t" // if beta == 0,
	"beq %%r0, DBETAZERO \n\t" // then jmp to BZ
	" \n\t"
	"cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8
	"beq DCOLSTOREDBNZ \n\t" // then jmp to col store
	" \n\t"
	"DGENSTOREDBNZ: \n\t" // BNZ gen stored case
	" \n\t"
	DGEN_LOAD_OFS_C
	" \n\t"
	DGEN_SCALE_BETA
	" \n\t"
	"b DGENSTORED \n\t"
	" \n\t"
	"DCOLSTOREDBNZ: \n\t" // BNZ col stored case
	" \n\t"
	DCOL_SCALE_BETA
	" \n\t"
	"b DCOLSTORED \n\t"
	" \n\t"
	"DBETAZERO: \n\t" // BZ case
	" \n\t"
	"cmpwi %%r0, %%r9, 8 \n\t" // if rs_c == 8,
	"beq DCOLSTORED \n\t" // C is col stored
	" \n\t"
	"DGENSTORED: \n\t" // BZ gen stored case
	" \n\t"
	DGEN_LOAD_OFS_C
	" \n\t"
	DGEN_STORE
	" \n\t"
	"b DDONE \n\t"
	" \n\t"
	"DCOLSTORED: \n\t" // BZ col stored case
	" \n\t"
	DCOL_STORE
	" \n\t"
	"DDONE: \n\t"
	" \n\t"
	: // output operands (none)
	: // input operands
	  "m" (k_iter), // 0
	  "m" (k_left), // 1
	  "m" (a),      // 2
	  "m" (b),      // 3
	  "m" (alpha),  // 4
	  "m" (beta),   // 5
	  "m" (c),      // 6
	  "m" (rs_c),   // 7
	  "m" (cs_c)/*, // 8
	  "m" (b_next), // 9
	  "m" (a_next)*/ // 10
	: // register clobber list
	  /* unclobberable regs: r2, r3, r4, r5, r6, r13, r14, r15, r30, r31 */
	  "r0", "r7", "r8", "r9",
	  "r10", "r11", "r12", "r16", "r17", "r18", "r19",
	  "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29",
	  // The compare instructions write cr0 and mtctr/bdnz use the count
	  // register, so both are declared. "memory" is declared because the
	  // kernel updates C through a pointer the compiler cannot see into
	  // (only the pointer variable itself is an operand).
	  "cr0", "ctr", "memory"

	#if XLC
	  ,"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9"
	  , "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19"
	  , "f20" ,"f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29"
	  , "f30" ,"f31"
	  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"
	  , "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19"
	  , "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"
	  , "v30", "v31"
	#else
	  , "vs0", "vs1", "vs2", "vs3", "vs4", "vs5", "vs6", "vs7", "vs8", "vs9"
	  , "vs10", "vs11", "vs12", "vs13", "vs14", "vs15", "vs16", "vs17", "vs18", "vs19"
	  , "vs20", "vs21", "vs22", "vs23", "vs24", "vs25", "vs26", "vs27", "vs28", "vs29"
	  , "vs30", "vs31", "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39"
	  , "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49"
	  , "vs50", "vs51", "vs52", "vs53"
	  // vs62 and vs63 hold the splatted alpha and beta (see the lxvdsx
	  // instructions above). They alias v30/v31, which the ELFv2 ABI
	  // treats as callee-saved, so they must be declared for the compiler
	  // to preserve them. vs54-vs61 are listed as well so that the gcc
	  // list covers the same registers as the XLC list (v0-v31).
	  , "vs54", "vs55", "vs56", "vs57", "vs58", "vs59", "vs60", "vs61"
	  , "vs62", "vs63"
	#endif
	);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,47 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- level-3 --
// Prototypes for the POWER9 double-precision gemm microkernels, declared
// via GEMM_UKR_PROT. The numeric suffix of each kernel name is its
// microtile shape.
// NOTE(review): only the 12x6 kernel's definition is visible alongside this
// header; confirm the remaining kernels are implemented before use.
// gemm (asm d12x6)
GEMM_UKR_PROT( double, d, gemm_power9_asm_12x6 )
// gemm (asm d18x4)
GEMM_UKR_PROT( double, d, gemm_power9_asm_18x4 )
// gemm (asm d16x4)
GEMM_UKR_PROT( double, d, gemm_power9_asm_16x4 )
// gemm (asm d4x16)
GEMM_UKR_PROT( double, d, gemm_power9_asm_4x16 )

View File

@@ -130,9 +130,9 @@ VENDORP_LIB := $(MKLP_LIB)
#
# Single core (single-threaded)
PS_BEGIN := 48
PS_MAX := 2400
PS_INC := 48
PS_BEGIN := 100
PS_MAX := 1000
PS_INC := 100
# Single-socket (multithreaded)
P1_BEGIN := 96
@@ -242,8 +242,8 @@ blis-2s: blis-nat-2s
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
# Define the datatypes, operations, and implementations.
DTS := s d c z
OPS := gemm hemm herk trmm trsm
DTS := d # s d c z
OPS := gemm # hemm herk trmm trsm
BIMPLS := asm_blis openblas vendor
EIMPLS := eigen

464
test/3/Makefile_cpy1 Normal file
View File

@@ -0,0 +1,464 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
#
# Makefile
#
# Field G. Van Zee
#
# Makefile for standalone BLIS test drivers.
#
#
# --- Makefile PHONY target definitions ----------------------------------------
#
.PHONY: all \
clean cleanx
#
# --- Determine makefile fragment location -------------------------------------
#
# Comments:
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
# the second case because CONFIG_NAME is not yet set.
ifneq ($(strip $(BLIS_INSTALL_PATH)),)
LIB_PATH := $(BLIS_INSTALL_PATH)/lib
INC_PATH := $(BLIS_INSTALL_PATH)/include/blis
SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
else
DIST_PATH := ../..
LIB_PATH = ../../lib/$(CONFIG_NAME)
INC_PATH = ../../include/$(CONFIG_NAME)
SHARE_PATH := ../..
endif
#
# --- Include common makefile definitions --------------------------------------
#
# Include the common makefile fragment.
-include $(SHARE_PATH)/common.mk
#
# --- BLAS implementations -----------------------------------------------------
#
# BLAS library path(s). This is where the BLAS libraries reside.
HOME_LIB_PATH := $(HOME)/flame/lib
#VENDOR_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
#VENDOR_LIB_PATH := ${MKLROOT}/lib/intel64
#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
# OpenBLAS
OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a
# OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a
# ATLAS
#ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
# $(HOME_LIB_PATH)/libatlas.a
# Eigen
EIGEN_INC := $(HOME)/flame/eigen/include/eigen3
EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a
EIGENP_LIB := $(EIGEN_LIB)
# MKL
MKL_LIB := -L$(MKL_LIB_PATH) \
-lmkl_intel_lp64 \
-lmkl_core \
-lmkl_sequential \
-lpthread -lm -ldl
#MKLP_LIB := -L$(MKL_LIB_PATH) \
# -lmkl_intel_thread \
# -lmkl_core \
# -lmkl_intel_ilp64 \
# -L$(ICC_LIB_PATH) \
# -liomp5
# MKLP_LIB := -L$(MKL_LIB_PATH) \
# -lmkl_intel_lp64 \
# -lmkl_core \
# -lmkl_gnu_thread \
# -lpthread -lm -ldl -fopenmp
# #-L$(ICC_LIB_PATH) \
# #-lgomp
VENDOR_LIB := $(MKL_LIB)
VENDORP_LIB := $(MKLP_LIB)
#
# --- Problem size definitions -------------------------------------------------
#
# Single core (single-threaded)
PS_BEGIN := 100
PS_MAX := 1000
PS_INC := 100
# Single-socket (multithreaded)
P1_BEGIN := 120
P1_MAX := 6000
P1_INC := 120
# Dual-socket (multithreaded)
P2_BEGIN := 160
P2_MAX := 8000
P2_INC := 160
#
# --- General build definitions ------------------------------------------------
#
TEST_SRC_PATH := .
TEST_OBJ_PATH := .
# Gather all local object files.
TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
$(TEST_OBJ_PATH)/%.o, \
$(wildcard $(TEST_SRC_PATH)/*.c)))
# Override the value of CINCFLAGS so that the value of CFLAGS returned by
# get-user-cflags-for() is not cluttered up with include paths needed only
# while building BLIS.
CINCFLAGS := -I$(INC_PATH)
# Use the "framework" CFLAGS for the configuration family.
CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
# Add local header paths to CFLAGS.
CFLAGS += -I$(TEST_SRC_PATH)
# Locate the libblis library to which we will link.
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Define a set of CFLAGS for use with C++ and Eigen.
CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS))
CXXFLAGS += -I$(EIGEN_INC)
# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS))
CXXFLAGS_MT := -march=native $(CXXFLAGS)
# Which library?
BLI_DEF := -DBLIS
BLA_DEF := -DBLAS
EIG_DEF := -DEIGEN
# Complex implementation type
D3MHW := -DIND=BLIS_3MH
D3M1 := -DIND=BLIS_3M1
D4MHW := -DIND=BLIS_4MH
D4M1B := -DIND=BLIS_4M1B
D4M1A := -DIND=BLIS_4M1A
D1M := -DIND=BLIS_1M
DNAT := -DIND=BLIS_NAT
# Implementation string
#STR_3MHW := -DSTR=\"3mhw\"
#STR_3M1 := -DSTR=\"3m1\"
#STR_4MHW := -DSTR=\"4mhw\"
#STR_4M1B := -DSTR=\"4m1b\"
#STR_4M1A := -DSTR=\"4m1a\"
#STR_1M := -DSTR=\"1m\"
STR_NAT := -DSTR=\"asm_blis\"
STR_OBL := -DSTR=\"openblas\"
STR_EIG := -DSTR=\"eigen\"
STR_VEN := -DSTR=\"vendor\"
# Single or multithreaded string
STR_ST := -DTHR_STR=\"st\"
STR_1S := -DTHR_STR=\"1s\"
STR_2S := -DTHR_STR=\"2s\"
# Problem size specification
PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX)
PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
#
# --- Targets/rules ------------------------------------------------------------
#
all: all-st all-1s all-2s
blis: blis-st blis-1s blis-2s
openblas: openblas-st openblas-1s openblas-2s
eigen: eigen-st eigen-1s eigen-2s
vendor: vendor-st vendor-1s vendor-2s
mkl: vendor
armpl: vendor
all-st: blis-st openblas-st mkl-st
all-1s: blis-1s openblas-1s mkl-1s
all-2s: blis-2s openblas-2s mkl-2s
blis-st: blis-nat-st
blis-1s: blis-nat-1s
blis-2s: blis-nat-2s
#blis-ind: blis-ind-st blis-ind-mt
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
# Define the datatypes, operations, and implementations.
# BIMPLS lists the implementations built through the C/BLAS rule generators
# and EIMPLS the implementation built through the Eigen (C++) rule
# generators; the $(foreach ...) loops that $(eval) the make-*-rule macros
# iterate over exactly these two variables, so both must be defined here or
# no object-file rules are generated. IMPLS is kept with its original value
# for anything that still refers to it.
DTS := d #s d c z
OPS := gemm #hemm herk trmm trsm
BIMPLS := asm_blis openblas vendor
EIMPLS := eigen
IMPLS := $(BIMPLS)
# Define functions to construct object filenames from the datatypes and
# operations given an implementation. We define one function for single-
# threaded, single-socket, and dual-socket filenames.
get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
# Construct object and binary names for single-threaded, single-socket, and
# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
EIGEN_ST_OBJS := $(call get-st-objs,eigen)
EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
EIGEN_1S_OBJS := $(call get-1s-objs,eigen)
EIGEN_1S_BINS := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
EIGEN_2S_OBJS := $(call get-2s-objs,eigen)
EIGEN_2S_BINS := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
VENDOR_ST_OBJS := $(call get-st-objs,vendor)
VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
VENDOR_1S_OBJS := $(call get-1s-objs,vendor)
VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
VENDOR_2S_OBJS := $(call get-2s-objs,vendor)
VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
# Define some targets associated with the above object/binary files.
blis-nat-st: $(BLIS_NAT_ST_BINS)
blis-nat-1s: $(BLIS_NAT_1S_BINS)
blis-nat-2s: $(BLIS_NAT_2S_BINS)
openblas-st: $(OPENBLAS_ST_BINS)
openblas-1s: $(OPENBLAS_1S_BINS)
openblas-2s: $(OPENBLAS_2S_BINS)
eigen-st: $(EIGEN_ST_BINS)
eigen-1s: $(EIGEN_1S_BINS)
eigen-2s: $(EIGEN_2S_BINS)
vendor-st: $(VENDOR_ST_BINS)
vendor-1s: $(VENDOR_1S_BINS)
vendor-2s: $(VENDOR_2S_BINS)
mkl-st: vendor-st
mkl-1s: vendor-1s
mkl-2s: vendor-2s
armpl-st: vendor-st
armpl-1s: vendor-1s
armpl-2s: vendor-2s
# Mark the object files as intermediate so that make will remove them
# automatically after building the binaries on which they depend.
.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_1S_OBJS) $(EIGEN_2S_OBJS)
.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_1S_OBJS) $(VENDOR_2S_OBJS)
# --Object file rules --
#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
# $(CC) $(CFLAGS) -c $< -o $@
# A function to return the datatype cpp macro def from the datatype
# character.
get-dt-cpp = $(strip \
$(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\
$(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\
$(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
-DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
# A function to return other cpp macros that help the test driver
# identify the implementation.
#get-bl-cpp = $(strip \
# $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
# $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
# $(if $(findstring eigen,$(1)),$(STR_EIG) $(EIG_DEF),\
# $(STR_VEN) $(BLA_DEF)))))
get-bl-cpp = $(strip \
$(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
$(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
$(if $(and $(findstring eigen,$(1)),\
$(findstring gemm,$(2))),\
$(STR_EIG) $(EIG_DEF),\
$(if $(findstring eigen,$(1)),\
$(STR_EIG) $(BLA_DEF),\
$(STR_VEN) $(BLA_DEF))))))
# Rules for BLIS and BLAS libraries.
# Each macro expands to one object-file rule and is meant to be passed
# through $(eval $(call ...)) with:
#   $(1) = datatype character, $(2) = operation, $(3) = implementation.
# The source prerequisite is derived from $(2) rather than from the caller's
# loop variable, so the macros do not depend on how the caller names it.
# $$< and $$@ are escaped so that they survive $(eval) and are expanded only
# when the recipe runs.
define make-st-rule
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
endef
define make-1s-rule
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
endef
define make-2s-rule
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
endef
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
# Rules for Eigen.
# Same shape as the BLIS/BLAS rule macros, but compiled with $(CXX): the
# single-threaded rule uses CXXFLAGS_ST and the two multithreaded rules use
# CXXFLAGS_MT.
#   $(1) = datatype character, $(2) = operation, $(3) = implementation.
# The source prerequisite is derived from $(2) rather than from the caller's
# loop variable, so the macros do not depend on how the caller names it.
define make-eigst-rule
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
	$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
endef
define make-eig1s-rule
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
	$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
endef
define make-eig2s-rule
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
	$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
endef
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
# -- Executable file rules --
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
# on the link command line in case BLIS was configured with the BLAS
# compatibility layer. This prevents BLIS from inadvertently getting called
# for the BLAS routines we are trying to test with.
test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(PS_MAX)_eigen_st.x: test_%_$(PS_MAX)_eigen_st.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(EIGEN_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_eigen_1s.x: test_%_$(P1_MAX)_eigen_1s.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_eigen_2s.x: test_%_$(P2_MAX)_eigen_2s.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
# -- Clean rules --
clean: cleanx
cleanx:
- $(RM_F) *.o *.x

View File

@@ -65,16 +65,15 @@ elif [ ${sys} = "ul264" ]; then
fi
# Datatypes to test.
test_dts="d s z c"
test_dts="d " #s z c"
# Operations to test.
test_ops="gemm hemm herk trmm trsm"
# Only gemm is enabled. The space before '#' is required: a '#' glued to the
# closing quote does not start a comment, so the remaining words would be
# parsed as a command and its arguments.
test_ops="gemm " #hemm herk trmm trsm"
# Implementations to test.
#impls="blis"
#impls="other"
impls="eigen"
#impls="all"
#impls="other"
impls="blis"
if [ "${impls}" = "blis" ]; then

View File

@@ -1,418 +1,418 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#ifdef EIGEN
#define BLIS_DISABLE_BLAS_DEFS
#include "blis.h"
#include <Eigen/Core>
#include <Eigen/src/misc/blas.h>
using namespace Eigen;
#else
#include "blis.h"
#endif
#define COL_STORAGE
//#define ROW_STORAGE
//#define PRINT
// Performance driver for gemm. For each problem size p, visited from p_max
// down to p_begin in steps of p_inc, it creates random operands A, B and C,
// times the implementation selected at compile time (BLIS, Eigen, or a
// Fortran-77 BLAS sgemm_/dgemm_/cgemm_/zgemm_ call) over n_repeats runs, and
// prints one Matlab-style assignment containing m, k, n and the GFLOPS rate.
// Always returns 0; argc and argv are not used.
// NOTE(review): DT, IND, P_BEGIN, P_MAX, P_INC, THR_STR and STR are not
// defined in this file; presumably they are supplied on the compiler
// command line -- confirm against the Makefile.
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, n, k;
dim_t p;
dim_t p_begin, p_max, p_inc;
int m_input, n_input, k_input;
ind_t ind;
num_t dt;
char dt_ch;
int r, n_repeats;
trans_t transa;
trans_t transb;
f77_char f77_transa;
f77_char f77_transb;
double dtime;
double dtime_save;
double gflops;
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
// Number of timed runs per problem size.
n_repeats = 3;
dt = DT;
ind = IND;
// The active branch takes the problem-size range from the P_BEGIN, P_MAX
// and P_INC macros; the #else branch holds hard-coded values and is
// currently compiled out. A negative *_input means the corresponding
// dimension is derived from p inside the size loop below.
#if 1
p_begin = P_BEGIN;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
n_input = -1;
k_input = -1;
#else
p_begin = 40;
p_max = 1000;
p_inc = 40;
m_input = -1;
n_input = -1;
k_input = -1;
#endif
// Suppress compiler warnings about unused variable 'ind'.
( void )ind;
#if 0
cntx_t* cntx;
ind_t ind_mod = ind;
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
// Initialize a context for the current induced method and datatype.
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
// Set k to the kc blocksize for the current datatype.
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
//k_input = 256;
#endif
// Choose the char corresponding to the requested datatype.
if ( bli_is_float( dt ) ) dt_ch = 's';
else if ( bli_is_double( dt ) ) dt_ch = 'd';
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
else dt_ch = 'z';
// Both operands are used untransposed; the Fortran-77 character
// equivalents are only consumed by the BLAS branch further down.
transa = BLIS_NO_TRANSPOSE;
transb = BLIS_NO_TRANSPOSE;
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
// The empty-bodied loop advances p to the last value p_begin + i*p_inc
// that does not exceed p_max (assuming p_begin <= p_max and p_inc > 0).
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0,
( unsigned long )0,
( unsigned long )0, 0.0 );
// Problem sizes are visited from largest to smallest.
//for ( p = p_begin; p <= p_max; p += p_inc )
for ( p = p_max; p_begin <= p; p -= p_inc )
{
// Negative input: the dimension is p divided by |input| (so -1 yields p
// itself). Non-negative input: the value is used as a fixed dimension.
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
// 1x1 objects that hold the scalars alpha and beta.
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
// A is m x k, B is k x n, C and c_save are m x n.
// NOTE(review): the COL_STORAGE branch passes 0, 0 for the two stride
// arguments, presumably requesting the library's default column-major
// strides -- confirm against the bli_obj_create documentation. The #else
// branch passes explicit (leading dimension, 1) strides for row storage.
#ifdef COL_STORAGE
bli_obj_create( dt, m, k, 0, 0, &a );
bli_obj_create( dt, k, n, 0, 0, &b );
bli_obj_create( dt, m, n, 0, 0, &c );
bli_obj_create( dt, m, n, 0, 0, &c_save );
#else
bli_obj_create( dt, m, k, k, 1, &a );
bli_obj_create( dt, k, n, n, 1, &b );
bli_obj_create( dt, m, n, n, 1, &c );
bli_obj_create( dt, m, n, n, 1, &c_save );
#endif
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
bli_obj_set_conjtrans( transa, &a );
bli_obj_set_conjtrans( transb, &b );
// alpha = 2.0 and beta = 1.0, both with zero imaginary part.
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
// Keep a copy of the initial C so every timed run starts from the same data.
bli_copym( &c, &c_save );
#if 0 //def BLIS
bli_ind_disable_all_dt( dt );
bli_ind_enable_dt( ind, dt );
#endif
// Eigen path: wrap the buffers of a, b and c in Eigen Map objects so that
// Eigen operates directly on the same memory. The outer stride is the
// column stride for column storage and the row stride otherwise. Only
// alpha_r is used in the Eigen product below; alpha_i is fetched but unused.
#ifdef EIGEN
double alpha_r, alpha_i;
bli_getsc( &alpha, &alpha_r, &alpha_i );
void* ap = bli_obj_buffer_at_off( &a );
void* bp = bli_obj_buffer_at_off( &b );
void* cp = bli_obj_buffer_at_off( &c );
#ifdef COL_STORAGE
const int os_a = bli_obj_col_stride( &a );
const int os_b = bli_obj_col_stride( &b );
const int os_c = bli_obj_col_stride( &c );
#else
const int os_a = bli_obj_row_stride( &a );
const int os_b = bli_obj_row_stride( &b );
const int os_c = bli_obj_row_stride( &c );
#endif
Stride<Dynamic,1> stride_a( os_a, 1 );
Stride<Dynamic,1> stride_b( os_b, 1 );
Stride<Dynamic,1> stride_c( os_c, 1 );
// Pick the Eigen matrix type matching the storage order and the datatype
// macro (IS_FLOAT, IS_DOUBLE, IS_SCOMPLEX or IS_DCOMPLEX).
#ifdef COL_STORAGE
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
#endif
#else
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
#endif
#endif
#if defined(IS_FLOAT)
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
#elif defined (IS_DOUBLE)
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
#elif defined (IS_SCOMPLEX)
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
#elif defined (IS_DCOMPLEX)
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
#endif
#endif
// Seed the saved time with the largest double; presumably
// bli_clock_min_diff() then keeps the smallest elapsed time seen across
// the repeats -- confirm against its definition.
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
// Restore the initial C, then record the start time of this run.
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
// Exactly one of the three implementations below is compiled in.
#if defined(BLIS)
bli_gemm( &alpha,
&a,
&b,
&beta,
&c );
#elif defined(EIGEN)
C.noalias() += alpha_r * A * B;
#else // if defined(BLAS)
// Fortran-77 BLAS interface: one branch per datatype, passing the
// dimensions and leading dimensions by address.
// NOTE(review): the leading dimensions are always taken from the column
// strides, even if ROW_STORAGE were selected -- confirm this path is only
// meant to be built with column storage.
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = ( float* )bli_obj_buffer( &alpha );
float* ap = ( float* )bli_obj_buffer( &a );
float* bp = ( float* )bli_obj_buffer( &b );
float* betap = ( float* )bli_obj_buffer( &beta );
float* cp = ( float* )bli_obj_buffer( &c );
sgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = ( double* )bli_obj_buffer( &alpha );
double* ap = ( double* )bli_obj_buffer( &a );
double* bp = ( double* )bli_obj_buffer( &b );
double* betap = ( double* )bli_obj_buffer( &beta );
double* cp = ( double* )bli_obj_buffer( &c );
dgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
cgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
zgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
// With PRINT defined, the program prints the result and exits after the
// first run of the first problem size.
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
// 2*m*k*n floating-point operations for gemm, converted to GFLOPS using the
// saved time; the rate is scaled by 4 for complex datatypes.
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
// Emit one Matlab-style row: [ m k n gflops ] at index (p - p_begin)/p_inc + 1.
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )m,
( unsigned long )k,
( unsigned long )n, gflops );
// Release every object created for this problem size.
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#ifdef EIGEN
#define BLIS_DISABLE_BLAS_DEFS
#include "blis.h"
#include <Eigen/Core>
#include <Eigen/src/misc/blas.h>
using namespace Eigen;
#else
#include "blis.h"
#endif
#define COL_STORAGE
//#define ROW_STORAGE
//#define PRINT
// Performance driver for gemm. For each problem size p, visited from p_max
// down to p_begin in steps of p_inc, it creates random operands A, B and C,
// times the implementation selected at compile time (BLIS, Eigen, or a
// Fortran-77 BLAS sgemm_/dgemm_/cgemm_/zgemm_ call) over n_repeats runs, and
// prints one Matlab-style assignment containing m, k, n and the GFLOPS rate.
// Always returns 0; argc and argv are not used.
// NOTE(review): DT, IND, P_BEGIN, P_MAX, P_INC, THR_STR and STR are not
// defined in this file; presumably they are supplied on the compiler
// command line -- confirm against the Makefile.
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, n, k;
dim_t p;
dim_t p_begin, p_max, p_inc;
int m_input, n_input, k_input;
ind_t ind;
num_t dt;
char dt_ch;
int r, n_repeats;
trans_t transa;
trans_t transb;
f77_char f77_transa;
f77_char f77_transb;
double dtime;
double dtime_save;
double gflops;
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
// Number of timed runs per problem size.
n_repeats = 3;
dt = DT;
ind = IND;
// The active branch takes the problem-size range from the P_BEGIN, P_MAX
// and P_INC macros; the #else branch holds hard-coded values and is
// currently compiled out. A negative *_input means the corresponding
// dimension is derived from p inside the size loop below.
#if 1
p_begin = P_BEGIN;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
n_input = -1;
k_input = -1;
#else
p_begin = 40;
p_max = 1000;
p_inc = 40;
m_input = -1;
n_input = -1;
k_input = -1;
#endif
// Suppress compiler warnings about unused variable 'ind'.
( void )ind;
#if 0
cntx_t* cntx;
ind_t ind_mod = ind;
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
// Initialize a context for the current induced method and datatype.
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
// Set k to the kc blocksize for the current datatype.
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
//k_input = 256;
#endif
// Choose the char corresponding to the requested datatype.
if ( bli_is_float( dt ) ) dt_ch = 's';
else if ( bli_is_double( dt ) ) dt_ch = 'd';
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
else dt_ch = 'z';
// Both operands are used untransposed; the Fortran-77 character
// equivalents are only consumed by the BLAS branch further down.
transa = BLIS_NO_TRANSPOSE;
transb = BLIS_NO_TRANSPOSE;
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
// The empty-bodied loop advances p to the last value p_begin + i*p_inc
// that does not exceed p_max (assuming p_begin <= p_max and p_inc > 0).
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0,
( unsigned long )0,
( unsigned long )0, 0.0 );
// Problem sizes are visited from largest to smallest.
//for ( p = p_begin; p <= p_max; p += p_inc )
for ( p = p_max; p_begin <= p; p -= p_inc )
{
// Negative input: the dimension is p divided by |input| (so -1 yields p
// itself). Non-negative input: the value is used as a fixed dimension.
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
// 1x1 objects that hold the scalars alpha and beta.
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
// A is m x k, B is k x n, C and c_save are m x n.
// NOTE(review): the COL_STORAGE branch passes 0, 0 for the two stride
// arguments, presumably requesting the library's default column-major
// strides -- confirm against the bli_obj_create documentation. The #else
// branch passes explicit (leading dimension, 1) strides for row storage.
#ifdef COL_STORAGE
bli_obj_create( dt, m, k, 0, 0, &a );
bli_obj_create( dt, k, n, 0, 0, &b );
bli_obj_create( dt, m, n, 0, 0, &c );
bli_obj_create( dt, m, n, 0, 0, &c_save );
#else
bli_obj_create( dt, m, k, k, 1, &a );
bli_obj_create( dt, k, n, n, 1, &b );
bli_obj_create( dt, m, n, n, 1, &c );
bli_obj_create( dt, m, n, n, 1, &c_save );
#endif
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
bli_obj_set_conjtrans( transa, &a );
bli_obj_set_conjtrans( transb, &b );
// alpha = 2.0 and beta = 1.0, both with zero imaginary part.
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
// Keep a copy of the initial C so every timed run starts from the same data.
bli_copym( &c, &c_save );
#if 0 //def BLIS
bli_ind_disable_all_dt( dt );
bli_ind_enable_dt( ind, dt );
#endif
// Eigen path: wrap the buffers of a, b and c in Eigen Map objects so that
// Eigen operates directly on the same memory. The outer stride is the
// column stride for column storage and the row stride otherwise. Only
// alpha_r is used in the Eigen product below; alpha_i is fetched but unused.
#ifdef EIGEN
double alpha_r, alpha_i;
bli_getsc( &alpha, &alpha_r, &alpha_i );
void* ap = bli_obj_buffer_at_off( &a );
void* bp = bli_obj_buffer_at_off( &b );
void* cp = bli_obj_buffer_at_off( &c );
#ifdef COL_STORAGE
const int os_a = bli_obj_col_stride( &a );
const int os_b = bli_obj_col_stride( &b );
const int os_c = bli_obj_col_stride( &c );
#else
const int os_a = bli_obj_row_stride( &a );
const int os_b = bli_obj_row_stride( &b );
const int os_c = bli_obj_row_stride( &c );
#endif
Stride<Dynamic,1> stride_a( os_a, 1 );
Stride<Dynamic,1> stride_b( os_b, 1 );
Stride<Dynamic,1> stride_c( os_c, 1 );
// Pick the Eigen matrix type matching the storage order and the datatype
// macro (IS_FLOAT, IS_DOUBLE, IS_SCOMPLEX or IS_DCOMPLEX).
#ifdef COL_STORAGE
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
#endif
#else
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
#endif
#endif
#if defined(IS_FLOAT)
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
#elif defined (IS_DOUBLE)
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
#elif defined (IS_SCOMPLEX)
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
#elif defined (IS_DCOMPLEX)
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
#endif
#endif
// Seed the saved time with the largest double; presumably
// bli_clock_min_diff() then keeps the smallest elapsed time seen across
// the repeats -- confirm against its definition.
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
// Restore the initial C, then record the start time of this run.
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
// Exactly one of the three implementations below is compiled in.
#if defined(BLIS)
bli_gemm( &alpha,
&a,
&b,
&beta,
&c );
#elif defined(EIGEN)
C.noalias() += alpha_r * A * B;
#else // if defined(BLAS)
// Fortran-77 BLAS interface: one branch per datatype, passing the
// dimensions and leading dimensions by address.
// NOTE(review): the leading dimensions are always taken from the column
// strides, even if ROW_STORAGE were selected -- confirm this path is only
// meant to be built with column storage.
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = ( float* )bli_obj_buffer( &alpha );
float* ap = ( float* )bli_obj_buffer( &a );
float* bp = ( float* )bli_obj_buffer( &b );
float* betap = ( float* )bli_obj_buffer( &beta );
float* cp = ( float* )bli_obj_buffer( &c );
sgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = ( double* )bli_obj_buffer( &alpha );
double* ap = ( double* )bli_obj_buffer( &a );
double* bp = ( double* )bli_obj_buffer( &b );
double* betap = ( double* )bli_obj_buffer( &beta );
double* cp = ( double* )bli_obj_buffer( &c );
dgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
cgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
zgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
// With PRINT defined, the program prints the result and exits after the
// first run of the first problem size.
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
// 2*m*k*n floating-point operations for gemm, converted to GFLOPS using the
// saved time; the rate is scaled by 4 for complex datatypes.
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
// Emit one Matlab-style row: [ m k n gflops ] at index (p - p_begin)/p_inc + 1.
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )m,
( unsigned long )k,
( unsigned long )n, gflops );
// Release every object created for this problem size.
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}

View File

@@ -96,7 +96,7 @@ endif
BLAS_LIB_PATH := $(HOME)/flame/lib
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
MKL_LIB_PATH := ${MKLROOT}/lib/intel64
#MKL_LIB_PATH := ${MKLROOT}/lib/intel64
#ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
# OpenBLAS
@@ -115,11 +115,11 @@ MKL_LIB := -L$(MKL_LIB_PATH) \
# ESSL
# Note: ESSL is named differently for SMP and/or BG
ESSL_TYPE := # This is the 32b library on POWER
#ESSL_TYPE := # This is the 32b library on POWER
#ESSL_TYPE := 6464 # This is the 64b library on POWER
#ESSL_TYPE := bg # This is the 32b single-threaded library on Blue Gene
#ESSL_TYPE := smpbg # This is the 32b multi-threaded library on Blue Gene
ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a
#ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a
# Accelerate
MAC_LIB := -framework Accelerate
@@ -165,22 +165,21 @@ CFLAGS += -I$(TEST_SRC_PATH)
#all: blis openblas atlas mkl
all: blis openblas mkl
blis: test_dotv_blis.x \
test_axpyv_blis.x \
test_gemv_blis.x \
test_ger_blis.x \
test_hemv_blis.x \
test_her_blis.x \
test_her2_blis.x \
test_trmv_blis.x \
test_trsv_blis.x \
\
test_gemm_blis.x \
test_hemm_blis.x \
test_herk_blis.x \
test_her2k_blis.x \
test_trmm_blis.x \
test_trsm_blis.x
blis: test_gemm_blis.x \
# test_dotv_blis.x \
# test_axpyv_blis.x \
# test_gemv_blis.x \
# test_ger_blis.x \
# test_hemv_blis.x \
# test_her_blis.x \
# test_her2_blis.x \
# test_trmv_blis.x \
# test_trsv_blis.x \
# test_hemm_blis.x \
# test_herk_blis.x \
# test_her2k_blis.x \
# test_trmm_blis.x \
# test_trsm_blis.x
openblas: \
test_dotv_openblas.x \

0
test/output_gemm_blis.m Normal file
View File

View File

@@ -5,12 +5,14 @@ out_root="output"
#out_root="output_square"
# Operations to test.
l2_ops="gemv ger hemv her her2 trmv trsv"
l3_ops="gemm hemm herk her2k trmm trsm"
test_ops="${l2_ops} ${l3_ops}"
# l2_ops="gemv ger hemv her her2 trmv trsv"
l3_ops="gemm"
# "hemm herk her2k trmm trsm"
test_ops=" ${l3_ops}"
# "${l2_ops}"
# Implementations to test
test_impls="openblas atlas mkl blis"
# Implementations to test | "openblas atlas mkl"
test_impls="blis"
for im in ${test_impls}; do
@@ -22,7 +24,7 @@ for im in ${test_impls}; do
# Construct the name of the output file.
out_file="${out_root}_${op}_${im}.m"
echo "Running ${exec_name} > ${out_file}"
echo " Running ${exec_name} > ${out_file} "
# Run executable.
./${exec_name} > ${out_file}

View File

@@ -8,8 +8,8 @@
# accepted values.
#
1 # Number of repeats per experiment (best result is reported)
rc # Matrix storage scheme(s) to test:
3 # Number of repeats per experiment (best result is reported)
c # Matrix storage scheme(s) to test:
# 'c' = col-major storage; 'g' = general stride storage;
# 'r' = row-major storage
cj # Vector storage scheme(s) to test:
@@ -22,14 +22,14 @@ cj # Vector storage scheme(s) to test:
# '0' = real values on [-1,1];
# '1' = powers of 2 in narrow precision range
32 # General stride spacing (for cases when testing general stride)
sdcz # Datatype(s) to test:
d # Datatype(s) to test:
# 's' = single real; 'c' = single complex;
# 'd' = double real; 'z' = double complex
0 # Test gemm with mixed-domain operands?
0 # Test gemm with mixed-precision operands?
100 # Problem size: first to test
500 # Problem size: maximum to test
100 # Problem size: increment between experiments
2000 # Problem size: first to test
2000 # Problem size: maximum to test
200 # Problem size: increment between experiments
# Complex level-3 implementations to test:
0 # 3mh ('1' = enable; '0' = disable)
0 # 3m1 ('1' = enable; '0' = disable)
@@ -45,5 +45,5 @@ sdcz # Datatype(s) to test:
# '0' = disable error checking; '1' = full error checking
i # Reaction to test failure:
# 'i' = ignore; 's' = sleep() and continue; 'a' = abort
0 # Output results in matlab/octave format? ('1' = yes; '0' = no)
1 # Output results in matlab/octave format? ('1' = yes; '0' = no)
0 # Output results to stdout AND files? ('1' = yes; '0' = no)

View File

@@ -276,9 +276,9 @@
# --- Level-3 --------------------------------------------------------------
1 # gemm
-1 -1 -1 # dimensions: m n k
?? # parameters: transa transb
2 # gemm
-1 -1 -1 # dimensions: m n k
nn # parameters: transa transb
1 # hemm
-1 -1 # dimensions: m n

View File

@@ -0,0 +1,106 @@
configure: detected Linux kernel version 4.14.0-115.6.1.el7a.ppc64le.
configure: python interpeter search list is: python python3 python2.
configure: using 'python' python interpreter.
configure: found python version 2.7.5 (maj: 2, min: 7, rev: 5).
configure: python 2.7.5 appears to be supported.
configure: C compiler search list is: gcc clang cc.
configure: using 'gcc' C compiler.
configure: C++ compiler search list is: g++ clang++ c++.
configure: using 'g++' C++ compiler (for sandbox only).
configure: found gcc version 8.2.0 (maj: 8, min: 2, rev: 0).
configure: checking for blacklisted configurations due to gcc 8.2.0.
configure: found assembler ('as') version 2.27 (maj: 2, min: 27, rev: ).
configure: checking for blacklisted configurations due to as 2.27.
configure: warning: assembler ('as' 2.27) does not support 'bulldozer'; adding to blacklist.
configure: warning: assembler ('as' 2.27) does not support 'sandybridge'; adding to blacklist.
configure: warning: assembler ('as' 2.27) does not support 'haswell'; adding to blacklist.
configure: warning: assembler ('as' 2.27) does not support 'piledriver'; adding to blacklist.
configure: warning: assembler ('as' 2.27) does not support 'steamroller'; adding to blacklist.
configure: warning: assembler ('as' 2.27) does not support 'excavator'; adding to blacklist.
configure: warning: assembler ('as' 2.27) does not support 'skx'; adding to blacklist.
configure: warning: assembler ('as' 2.27) does not support 'knl'; adding to blacklist.
configure: configuration blacklist:
configure: bulldozer sandybridge haswell piledriver steamroller excavator skx knl
configure: reading configuration registry...done.
configure: determining default version string.
configure: found '.git' directory; assuming git clone.
configure: executing: git describe --tags.
configure: git returned an error: 'Unknown option: -C
usage: git [--version] [--help] [-c name=value]
[--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
[-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
[--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
<command> [<args>]'.
configure: using string from unmodified version file.
configure: starting configuration of BLIS 0.6.0.
configure: configuring with official version string.
configure: found shared library .so version '2.0.0'.
configure: .so major version: 2
configure: .so minor.build version: 0.0
configure: manual configuration requested; configuring with 'power9'.
configure: checking configuration against contents of 'config_registry'.
configure: configuration 'power9' is registered.
configure: 'power9' is defined as having the following sub-configurations:
configure: power9
configure: which collectively require the following kernels:
configure: power9
configure: checking sub-configurations:
configure: 'power9' is registered...and exists.
configure: checking sub-configurations' requisite kernels:
configure: 'power9' kernels...exist.
configure: no install prefix option given; defaulting to '/usr/local'.
configure: no install exec_prefix option given; defaulting to PREFIX.
configure: no install libdir option given; defaulting to EXECPREFIX/lib.
configure: no install includedir option given; defaulting to PREFIX/include.
configure: no install sharedir option given; defaulting to PREFIX/share.
configure: final installation directories:
configure: prefix: /usr/local
configure: exec_prefix: ${prefix}
configure: libdir: ${exec_prefix}/lib
configure: includedir: ${prefix}/include
configure: sharedir: ${prefix}/share
configure: NOTE: the variables above can be overridden when running make.
configure: no preset CFLAGS detected.
configure: no preset LDFLAGS detected.
configure: debug symbols disabled.
configure: disabling verbose make output. (enable with 'make V=1'.)
configure: disabling ARG_MAX hack.
configure: building BLIS as both static and shared libraries.
configure: exporting only public symbols within shared library.
configure: threading is disabled.
configure: requesting slab threading in jr and ir loops.
configure: internal memory pools for packing blocks are enabled.
configure: internal memory pools for small blocks are enabled.
configure: memory tracing output is disabled.
configure: libmemkind not found; disabling.
configure: compiler appears to not support #pragma omp simd.
configure: the BLAS compatibility layer is enabled.
configure: the CBLAS compatibility layer is disabled.
configure: mixed datatype support is enabled.
configure: mixed datatype optimizations requiring extra memory are enabled.
configure: small matrix handling is enabled.
configure: the BLIS API integer size is automatically determined.
configure: the BLAS/CBLAS API integer size is 32-bit.
configure: configuring for conventional gemm implementation.
configure: creating ./config.mk from ./build/config.mk.in
configure: creating ./bli_config.h from ./build/bli_config.h.in
configure: creating ./obj/power9
configure: creating ./obj/power9/config/power9
configure: creating ./obj/power9/kernels/power9
configure: creating ./obj/power9/ref_kernels/power9
configure: creating ./obj/power9/frame
configure: creating ./obj/power9/blastest
configure: creating ./obj/power9/testsuite
configure: creating ./lib/power9
configure: creating ./include/power9
configure: mirroring ./config/power9 to ./obj/power9/config/power9
configure: mirroring ./kernels/power9 to ./obj/power9/kernels/power9
configure: mirroring ./ref_kernels to ./obj/power9/ref_kernels
configure: mirroring ./ref_kernels to ./obj/power9/ref_kernels/power9
configure: mirroring ./frame to ./obj/power9/frame
configure: creating makefile fragments in ./obj/power9/config/power9
configure: creating makefile fragments in ./obj/power9/kernels/power9
configure: creating makefile fragments in ./obj/power9/ref_kernels
configure: creating makefile fragments in ./obj/power9/frame
configure: configured to build within top-level directory of source distribution.
CONFIGURE DONE

5
testsuite/jobscripts/cfig.sh Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/bash
# Configure BLIS for the power9 target from the checkout in ~/blis.
# Abort if the checkout is missing so configure is never run from an
# unrelated working directory.
cd ~/blis || exit 1
./configure power9
# Completion marker picked up in the job's output file.
echo "CONFIGURE DONE"

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# Slurm batch script: runs cfig.sh as a single-task job step.
# execute in the general partition
#SBATCH --partition=general
# execute with a single task
#SBATCH --ntasks=1
# maximum time is 30 minutes
#SBATCH --time=00:30:00
# job name is blis
#SBATCH --job-name=blis
# send email for all job status updates, including when the time limit is reached
#SBATCH --mail-type=ALL,TIME_LIMIT
#SBATCH --mail-user=ntukanov
# write job output to cfig.out instead of the default file
#SBATCH --output=cfig.out
# load the gcc/8.2 environment module
module load gcc/8.2
# application execution: launch cfig.sh through srun
srun cfig.sh

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# Slurm batch script: runs mk.sh as a single-task job step.
# execute in the general partition
#SBATCH --partition=general
# execute with a single task
#SBATCH --ntasks=1
# maximum time is 30 minutes
#SBATCH --time=00:30:00
# job name is blis
#SBATCH --job-name=blis
# send email for all job status updates, including when the time limit is reached
#SBATCH --mail-type=ALL,TIME_LIMIT
#SBATCH --mail-user=ntukanov
# write job output to mk.out instead of the default file
#SBATCH --output=mk.out
# load the gcc/8.2 environment module
module load gcc/8.2
# application execution: launch mk.sh through srun
srun mk.sh

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# Slurm batch script: runs runtest.sh as a single-task job step.
# execute in the general partition
#SBATCH --partition=general
# execute with a single task
#SBATCH --ntasks=1
# maximum time is 30 minutes
#SBATCH --time=00:30:00
# job name is blis
#SBATCH --job-name=blis
# send email for all job status updates, including when the time limit is reached
#SBATCH --mail-type=ALL,TIME_LIMIT
#SBATCH --mail-user=ntukanov
# write job output to runtest.out instead of the default file
#SBATCH --output=runtest.out
# load the gcc/8.2 environment module
module load gcc/8.2
# application execution: launch runtest.sh through srun
srun runtest.sh

View File

@@ -0,0 +1,9 @@
Removing flattened header files from include/power9
Removing object files from ./obj/power9
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: got SIGCONT
slurmstepd: error: *** JOB 1155 ON lookout00 CANCELLED AT 2019-06-10T17:29:07 ***
srun: forcing job termination
slurmstepd: error: *** STEP 1155.0 ON lookout00 CANCELLED AT 2019-06-10T17:29:07 ***
make: *** [cleanlib] Terminated
srun: error: lookout00: task 0: Terminated

6
testsuite/jobscripts/mk.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Rebuild BLIS from scratch in the ~/blis checkout.
# Abort if the checkout is missing so "make clean" is never run from an
# unrelated working directory.
cd ~/blis || exit 1
make clean
make
# Completion marker picked up in the job's output file.
echo "MAKE DONE"

View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Rebuild the BLIS testsuite and run it, capturing results in
# test_libblis.out.
# Abort if the testsuite directory is missing so the rm/make clean below
# never run from an unrelated working directory.
cd ~/blis/testsuite || exit 1
# Discard results from any previous run.
rm -rf test_libblis.out
make clean
make -j
./test_libblis.x > test_libblis.out
# Completion marker picked up in the job's output file.
echo "TEST DONE"

View File

@@ -272,11 +272,23 @@ void libblis_test_gemm_experiment
{
bli_copym( &c_save, &c );
#if 0
bli_printm( "alpha", &alpha, "%5.2f", "" );
bli_printm( "beta", &beta, "%5.2f", "" );
bli_printm( "a = [", &a, "%7.6f", "];" );
bli_printm( "b = [", &b, "%7.6f", "];" );
bli_printm( "c = [", &c, "%7.6f", "];" );
#endif
time = bli_clock();
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
time_min = bli_clock_min_diff( time_min, time );
#if 0
bli_printm( "c_after = [", &c, "%7.6f", "];" );
#endif
}
// Estimate the performance of the best experiment repeat.
@@ -405,7 +417,6 @@ void libblis_test_gemm_md
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
time_min = bli_clock_min_diff( time_min, time );
}
// Estimate the performance of the best experiment repeat.
@@ -442,20 +453,18 @@ void libblis_test_gemm_impl
{
case BLIS_TEST_SEQ_FRONT_END:
#if 0
//bli_printm( "alpha", alpha, "%5.2f", "" );
//bli_printm( "beta", beta, "%5.2f", "" );
bli_printm( "a", a, "%5.2f", "" );
bli_printm( "b", b, "%5.2f", "" );
bli_printm( "c", c, "%5.2f", "" );
bli_printm( "alpha", alpha, "%5.2f", "" );
bli_printm( "beta", beta, "%5.2f", "" );
bli_printm( "a", a, "%6.3f", "" );
bli_printm( "b", b, "%6.3f", "" );
bli_printm( "c", c, "%6.3f", "" );
#endif
//if ( bli_obj_length( b ) == 16 &&
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
//bli_printm( "c before", c, "%6.3f", "" );
bli_gemm( alpha, a, b, beta, c );
#if 0
if ( bli_obj_length( c ) == 12 &&
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
bli_printm( "c after", c, "%6.3f", "" );
bli_printm( "c after", c, "%6.3f", "");
#endif
break;