Reverted minor temp/wspace changes from b426f9e.

Details:
- Added missing license header to bli_pwr9_asm_macros_12x6.h.
- Reverted temporary changes to various files in 'test' and 'testsuite'
  directories.
- Moved testsuite/jobscripts into testsuite/old.
- Minor whitespace/comment changes across various files.
This commit is contained in:
Field G. Van Zee
2019-11-04 13:57:12 -06:00
parent 4870260f6b
commit c84391314d
29 changed files with 566 additions and 987 deletions

View File

@@ -49,40 +49,40 @@ void bli_cntx_init_power9( cntx_t* cntx )
bli_cntx_init_power9_ref( cntx );
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
1,
//BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
cntx
);
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
1,
//BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
cntx
);
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
cntx
);
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
1,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
cntx
);
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
-1, 12, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
-1, 12, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 );
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
(
BLIS_NAT, 5,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
}

View File

@@ -39,7 +39,7 @@ cortexa15: cortexa15/armv7a
cortexa9: cortexa9/armv7a
# IBM architectures.
power9: power9
power9: power9
bgq: bgq
# Generic architectures.

2
configure vendored
View File

@@ -1430,7 +1430,7 @@ check_compiler()
# Thus, this "blacklistcc_add" statement has been moved above.
#blacklistcc_add "zen"
blacklistcc_add "skx"
# GCC-5 may support POWER9 but it is unverified.
# gcc 5.x may support POWER9 but it is unverified.
blacklistcc_add "power9"
fi
fi

View File

@@ -66,7 +66,6 @@ void bli_gemm_front
#endif
#endif
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
@@ -83,7 +82,6 @@ void bli_gemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
#ifdef BLIS_ENABLE_GEMM_MD
cntx_t cntx_local;
@@ -150,7 +148,6 @@ void bli_gemm_front
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
@@ -278,7 +275,6 @@ void bli_gemm_front
cntl
);
#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
// If we created a temporary matrix conformal to C for whatever reason,

View File

@@ -167,7 +167,7 @@ void bli_gemm_ker_var2
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_exec];
f = ftypes[dt_exec];
// Invoke the function.
f( schema_a,

View File

@@ -142,7 +142,7 @@ void bli_arch_set_id( void )
// IBM microarchitectures.
#ifdef BLIS_FAMILY_POWER9
id = BLIS_ARCH_POWER9;
id = BLIS_ARCH_POWER9;
#endif
#ifdef BLIS_FAMILY_POWER7
id = BLIS_ARCH_POWER7;

View File

@@ -1,10 +1,13 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
@@ -16,6 +19,7 @@
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,6 +31,7 @@
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CNTX_H
@@ -40,22 +45,28 @@ typedef struct cntx_s
{
blksz_t* blkszs;
bszid_t* bmults;
func_t* l3_vir_ukrs;
func_t* l3_nat_ukrs;
mbool_t* l3_nat_ukrs_prefs;
blksz_t* l3_sup_thresh;
void** l3_sup_handlers;
blksz_t* l3_sup_blkszs;
func_t* l3_sup_kers;
mbool_t* l3_sup_kers_prefs;
func_t* l1f_kers;
func_t* l1v_kers;
func_t* packm_kers;
func_t* unpackm_kers;
ind_t method;
pack_t schema_a;
pack_t schema_b;
pack_t schema_c;
} cntx_t;
*/

View File

@@ -99,7 +99,7 @@ CNTX_INIT_PROTS( cortexa15 )
CNTX_INIT_PROTS( cortexa9 )
#endif
// -- IBM BG/Q --
// -- IBM Power --
#ifdef BLIS_CONFIG_POWER9
CNTX_INIT_PROTS( power9 )
@@ -107,6 +107,9 @@ CNTX_INIT_PROTS( power9 )
#ifdef BLIS_CONFIG_POWER7
CNTX_INIT_PROTS( power7 )
#endif
// -- IBM BG/Q --
#ifdef BLIS_CONFIG_BGQ
CNTX_INIT_PROTS( bgq )
#endif
@@ -193,7 +196,7 @@ CNTX_INIT_PROTS( generic )
#include "bli_family_cortexa9.h"
#endif
// -- IBM BG/Q --
// -- IBM Power --
#ifdef BLIS_FAMILY_POWER9
#include "bli_family_power9.h"
@@ -201,6 +204,9 @@ CNTX_INIT_PROTS( generic )
#ifdef BLIS_FAMILY_POWER7
#include "bli_family_power7.h"
#endif
// -- IBM BG/Q --
#ifdef BLIS_FAMILY_BGQ
#include "bli_family_bgq.h"
#endif
@@ -266,7 +272,7 @@ CNTX_INIT_PROTS( generic )
#include "bli_kernels_armv7a.h"
#endif
// -- IBM BG/Q --
// -- IBM Power --
#ifdef BLIS_KERNELS_POWER9
#include "bli_kernels_power9.h"
@@ -274,6 +280,9 @@ CNTX_INIT_PROTS( generic )
#ifdef BLIS_KERNELS_POWER7
#include "bli_kernels_power7.h"
#endif
// -- IBM BG/Q --
#ifdef BLIS_KERNELS_BGQ
#include "bli_kernels_bgq.h"
#endif

View File

@@ -56,7 +56,7 @@ GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 )
// gemm (asm d8x6)
// gemm (asm d8x6)
//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 )
//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 )
//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 )

View File

@@ -1,3 +1,36 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// MACROS for power9_asm_d12x6

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -44,4 +44,4 @@ GEMM_UKR_PROT( double, d, gemm_power9_asm_18x4 )
GEMM_UKR_PROT( double, d, gemm_power9_asm_16x4 )
// gemm (asm d4x16)
GEMM_UKR_PROT( double, d, gemm_power9_asm_4x16 )
GEMM_UKR_PROT( double, d, gemm_power9_asm_4x16 )

View File

@@ -130,9 +130,9 @@ VENDORP_LIB := $(MKLP_LIB)
#
# Single core (single-threaded)
PS_BEGIN := 100
PS_MAX := 1000
PS_INC := 100
PS_BEGIN := 48
PS_MAX := 2400
PS_INC := 48
# Single-socket (multithreaded)
P1_BEGIN := 96
@@ -242,8 +242,8 @@ blis-2s: blis-nat-2s
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
# Define the datatypes, operations, and implementations.
DTS := d # s d c z
OPS := gemm # hemm herk trmm trsm
DTS := s d c z
OPS := gemm hemm herk trmm trsm
BIMPLS := asm_blis openblas vendor
EIMPLS := eigen

View File

@@ -1,464 +0,0 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
#
# Makefile
#
# Field G. Van Zee
#
# Makefile for standalone BLIS test drivers.
#
#
# --- Makefile PHONY target definitions ----------------------------------------
#
.PHONY: all \
clean cleanx
#
# --- Determine makefile fragment location -------------------------------------
#
# Comments:
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
# the second case because CONFIG_NAME is not yet set.
ifneq ($(strip $(BLIS_INSTALL_PATH)),)
LIB_PATH := $(BLIS_INSTALL_PATH)/lib
INC_PATH := $(BLIS_INSTALL_PATH)/include/blis
SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
else
DIST_PATH := ../..
LIB_PATH = ../../lib/$(CONFIG_NAME)
INC_PATH = ../../include/$(CONFIG_NAME)
SHARE_PATH := ../..
endif
#
# --- Include common makefile definitions --------------------------------------
#
# Include the common makefile fragment.
-include $(SHARE_PATH)/common.mk
#
# --- BLAS implementations -----------------------------------------------------
#
# BLAS library path(s). This is where the BLAS libraries reside.
HOME_LIB_PATH := $(HOME)/flame/lib
#VENDOR_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
#VENDOR_LIB_PATH := ${MKLROOT}/lib/intel64
#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
# OpenBLAS
OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a
# OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a
# ATLAS
#ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
# $(HOME_LIB_PATH)/libatlas.a
# Eigen
EIGEN_INC := $(HOME)/flame/eigen/include/eigen3
EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a
EIGENP_LIB := $(EIGEN_LIB)
# MKL
MKL_LIB := -L$(MKL_LIB_PATH) \
-lmkl_intel_lp64 \
-lmkl_core \
-lmkl_sequential \
-lpthread -lm -ldl
#MKLP_LIB := -L$(MKL_LIB_PATH) \
# -lmkl_intel_thread \
# -lmkl_core \
# -lmkl_intel_ilp64 \
# -L$(ICC_LIB_PATH) \
# -liomp5
# MKLP_LIB := -L$(MKL_LIB_PATH) \
# -lmkl_intel_lp64 \
# -lmkl_core \
# -lmkl_gnu_thread \
# -lpthread -lm -ldl -fopenmp
# #-L$(ICC_LIB_PATH) \
# #-lgomp
VENDOR_LIB := $(MKL_LIB)
VENDORP_LIB := $(MKLP_LIB)
#
# --- Problem size definitions -------------------------------------------------
#
# Single core (single-threaded)
PS_BEGIN := 100
PS_MAX := 1000
PS_INC := 100
# Single-socket (multithreaded)
P1_BEGIN := 120
P1_MAX := 6000
P1_INC := 120
# Dual-socket (multithreaded)
P2_BEGIN := 160
P2_MAX := 8000
P2_INC := 160
#
# --- General build definitions ------------------------------------------------
#
TEST_SRC_PATH := .
TEST_OBJ_PATH := .
# Gather all local object files.
TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
$(TEST_OBJ_PATH)/%.o, \
$(wildcard $(TEST_SRC_PATH)/*.c)))
# Override the value of CINCFLAGS so that the value of CFLAGS returned by
# get-user-cflags-for() is not cluttered up with include paths needed only
# while building BLIS.
CINCFLAGS := -I$(INC_PATH)
# Use the "framework" CFLAGS for the configuration family.
CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
# Add local header paths to CFLAGS.
CFLAGS += -I$(TEST_SRC_PATH)
# Locate the libblis library to which we will link.
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Define a set of CFLAGS for use with C++ and Eigen.
CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS))
CXXFLAGS += -I$(EIGEN_INC)
# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS))
CXXFLAGS_MT := -march=native $(CXXFLAGS)
# Which library?
BLI_DEF := -DBLIS
BLA_DEF := -DBLAS
EIG_DEF := -DEIGEN
# Complex implementation type
D3MHW := -DIND=BLIS_3MH
D3M1 := -DIND=BLIS_3M1
D4MHW := -DIND=BLIS_4MH
D4M1B := -DIND=BLIS_4M1B
D4M1A := -DIND=BLIS_4M1A
D1M := -DIND=BLIS_1M
DNAT := -DIND=BLIS_NAT
# Implementation string
#STR_3MHW := -DSTR=\"3mhw\"
#STR_3M1 := -DSTR=\"3m1\"
#STR_4MHW := -DSTR=\"4mhw\"
#STR_4M1B := -DSTR=\"4m1b\"
#STR_4M1A := -DSTR=\"4m1a\"
#STR_1M := -DSTR=\"1m\"
STR_NAT := -DSTR=\"asm_blis\"
STR_OBL := -DSTR=\"openblas\"
STR_EIG := -DSTR=\"eigen\"
STR_VEN := -DSTR=\"vendor\"
# Single or multithreaded string
STR_ST := -DTHR_STR=\"st\"
STR_1S := -DTHR_STR=\"1s\"
STR_2S := -DTHR_STR=\"2s\"
# Problem size specification
PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX)
PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
#
# --- Targets/rules ------------------------------------------------------------
#
all: all-st all-1s all-2s
blis: blis-st blis-1s blis-2s
openblas: openblas-st openblas-1s openblas-2s
eigen: eigen-st eigen-1s eigen-2s
vendor: vendor-st vendor-1s vendor-2s
mkl: vendor
armpl: vendor
all-st: blis-st openblas-st mkl-st
all-1s: blis-1s openblas-1s mkl-1s
all-2s: blis-2s openblas-2s mkl-2s
blis-st: blis-nat-st
blis-1s: blis-nat-1s
blis-2s: blis-nat-2s
#blis-ind: blis-ind-st blis-ind-mt
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
# Define the datatypes, operations, and implementations.
# NOTE: The rule-generating loops below iterate over BIMPLS (implementations
# whose drivers are compiled with $(CC)) and EIMPLS (Eigen, whose drivers are
# compiled with $(CXX)). Both must be defined; otherwise those loops expand
# to nothing and no explicit object rules are generated. IMPLS is retained
# with its previous value for backward compatibility.
DTS := d #s d c z
OPS := gemm #hemm herk trmm trsm
BIMPLS := asm_blis openblas vendor
EIMPLS := eigen
IMPLS := $(BIMPLS)
# Define functions to construct object filenames from the datatypes and
# operations given an implementation. We define one function for single-
# threaded, single-socket, and dual-socket filenames.
get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
# Construct object and binary names for single-threaded, single-socket, and
# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
EIGEN_ST_OBJS := $(call get-st-objs,eigen)
EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
EIGEN_1S_OBJS := $(call get-1s-objs,eigen)
EIGEN_1S_BINS := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
EIGEN_2S_OBJS := $(call get-2s-objs,eigen)
EIGEN_2S_BINS := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
VENDOR_ST_OBJS := $(call get-st-objs,vendor)
VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
VENDOR_1S_OBJS := $(call get-1s-objs,vendor)
VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
VENDOR_2S_OBJS := $(call get-2s-objs,vendor)
VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
# Define some targets associated with the above object/binary files.
blis-nat-st: $(BLIS_NAT_ST_BINS)
blis-nat-1s: $(BLIS_NAT_1S_BINS)
blis-nat-2s: $(BLIS_NAT_2S_BINS)
openblas-st: $(OPENBLAS_ST_BINS)
openblas-1s: $(OPENBLAS_1S_BINS)
openblas-2s: $(OPENBLAS_2S_BINS)
eigen-st: $(EIGEN_ST_BINS)
eigen-1s: $(EIGEN_1S_BINS)
eigen-2s: $(EIGEN_2S_BINS)
vendor-st: $(VENDOR_ST_BINS)
vendor-1s: $(VENDOR_1S_BINS)
vendor-2s: $(VENDOR_2S_BINS)
mkl-st: vendor-st
mkl-1s: vendor-1s
mkl-2s: vendor-2s
armpl-st: vendor-st
armpl-1s: vendor-1s
armpl-2s: vendor-2s
# Mark the object files as intermediate so that make will remove them
# automatically after building the binaries on which they depend.
.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_1S_OBJS) $(EIGEN_2S_OBJS)
.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_1S_OBJS) $(VENDOR_2S_OBJS)
# -- Object file rules --
#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
# $(CC) $(CFLAGS) -c $< -o $@
# A function to return the datatype cpp macro defs from the datatype
# character. $(1) is the datatype character. The first match wins:
#   - contains "s": -DDT=BLIS_FLOAT    -DIS_FLOAT
#   - contains "d": -DDT=BLIS_DOUBLE   -DIS_DOUBLE
#   - contains "c": -DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX
#   - otherwise:    -DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX
# The result is passed through strip to remove the whitespace introduced
# by the line continuations.
get-dt-cpp = $(strip \
$(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\
$(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\
$(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
-DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
# A function to return other cpp macros that help the test driver
# identify the implementation. $(1) is the implementation name and $(2) is
# the operation name. The first match wins:
#   - $(1) contains "blis":                        $(STR_NAT) $(BLI_DEF)
#   - $(1) contains "openblas":                    $(STR_OBL) $(BLA_DEF)
#   - $(1) contains "eigen", $(2) contains "gemm": $(STR_EIG) $(EIG_DEF)
#   - $(1) contains "eigen" (any other operation): $(STR_EIG) $(BLA_DEF)
#   - otherwise (vendor library):                  $(STR_VEN) $(BLA_DEF)
# The commented-out definition below is the earlier form, which selected
# $(EIG_DEF) for Eigen regardless of the operation.
#get-bl-cpp = $(strip \
# $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
# $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
# $(if $(findstring eigen,$(1)),$(STR_EIG) $(EIG_DEF),\
# $(STR_VEN) $(BLA_DEF)))))
get-bl-cpp = $(strip \
$(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
$(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
$(if $(and $(findstring eigen,$(1)),\
$(findstring gemm,$(2))),\
$(STR_EIG) $(EIG_DEF),\
$(if $(findstring eigen,$(1)),\
$(STR_EIG) $(BLA_DEF),\
$(STR_VEN) $(BLA_DEF))))))
# Rules for BLIS and BLAS libraries.
# Each macro takes $(1) = datatype character, $(2) = operation name, and
# $(3) = implementation name. When its expansion is passed to $(eval), it
# defines the rule that compiles the operation's test driver source into an
# object specific to that datatype, implementation, and threading mode
# (st = single-threaded, 1s = single-socket, 2s = dual-socket).
# The source prerequisite is named via $(2) rather than a loop variable of
# the caller, so the macros do not depend on how they are invoked.
define make-st-rule
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
endef
define make-1s-rule
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
endef
define make-2s-rule
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
endef
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
# Rules for Eigen.
# Same parameters as the BLIS/BLAS rule macros: $(1) = datatype character,
# $(2) = operation name, $(3) = implementation name. These variants compile
# the test driver with $(CXX); the single-threaded rule uses $(CXXFLAGS_ST)
# and the single-/dual-socket rules use $(CXXFLAGS_MT).
# The source prerequisite is named via $(2) rather than a loop variable of
# the caller, so the macros do not depend on how they are invoked.
define make-eigst-rule
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
	$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
endef
define make-eig1s-rule
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
	$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
endef
define make-eig2s-rule
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
	$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
endef
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
# -- Executable file rules --
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
# on the link command line in case BLIS was configured with the BLAS
# compatibility layer. This prevents BLIS from inadvertently getting called
# for the BLAS routines we are trying to test with.
test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(PS_MAX)_eigen_st.x: test_%_$(PS_MAX)_eigen_st.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(EIGEN_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_eigen_1s.x: test_%_$(P1_MAX)_eigen_1s.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_eigen_2s.x: test_%_$(P2_MAX)_eigen_2s.o $(LIBBLIS_LINK)
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK)
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
# -- Clean rules --
clean: cleanx
cleanx:
- $(RM_F) *.o *.x

View File

@@ -65,15 +65,16 @@ elif [ ${sys} = "ul264" ]; then
fi
# Datatypes to test.
test_dts="d " #s z c"
test_dts="d s z c"
# Operations to test.
test_ops="gemm "#hemm herk trmm trsm"
test_ops="gemm hemm herk trmm trsm"
# Implementations to test.
#impls="all"
#impls="other"
impls="blis"
#impls="other"
#impls="eigen"
#impls="all"
if [ "${impls}" = "blis" ]; then

View File

@@ -1,418 +1,418 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#ifdef EIGEN
#define BLIS_DISABLE_BLAS_DEFS
#include "blis.h"
#include <Eigen/Core>
#include <Eigen/src/misc/blas.h>
using namespace Eigen;
#else
#include "blis.h"
#endif
#define COL_STORAGE
//#define ROW_STORAGE
//#define PRINT
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, n, k;
dim_t p;
dim_t p_begin, p_max, p_inc;
int m_input, n_input, k_input;
ind_t ind;
num_t dt;
char dt_ch;
int r, n_repeats;
trans_t transa;
trans_t transb;
f77_char f77_transa;
f77_char f77_transb;
double dtime;
double dtime_save;
double gflops;
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
n_repeats = 3;
dt = DT;
ind = IND;
#if 1
p_begin = P_BEGIN;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
n_input = -1;
k_input = -1;
#else
p_begin = 40;
p_max = 1000;
p_inc = 40;
m_input = -1;
n_input = -1;
k_input = -1;
#endif
// Suppress compiler warnings about unused variable 'ind'.
( void )ind;
#if 0
cntx_t* cntx;
ind_t ind_mod = ind;
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
// Initialize a context for the current induced method and datatype.
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
// Set k to the kc blocksize for the current datatype.
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
//k_input = 256;
#endif
// Choose the char corresponding to the requested datatype.
if ( bli_is_float( dt ) ) dt_ch = 's';
else if ( bli_is_double( dt ) ) dt_ch = 'd';
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
else dt_ch = 'z';
transa = BLIS_NO_TRANSPOSE;
transb = BLIS_NO_TRANSPOSE;
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0,
( unsigned long )0,
( unsigned long )0, 0.0 );
//for ( p = p_begin; p <= p_max; p += p_inc )
for ( p = p_max; p_begin <= p; p -= p_inc )
{
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
#ifdef COL_STORAGE
bli_obj_create( dt, m, k, 0, 0, &a );
bli_obj_create( dt, k, n, 0, 0, &b );
bli_obj_create( dt, m, n, 0, 0, &c );
bli_obj_create( dt, m, n, 0, 0, &c_save );
#else
bli_obj_create( dt, m, k, k, 1, &a );
bli_obj_create( dt, k, n, n, 1, &b );
bli_obj_create( dt, m, n, n, 1, &c );
bli_obj_create( dt, m, n, n, 1, &c_save );
#endif
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
bli_obj_set_conjtrans( transa, &a );
bli_obj_set_conjtrans( transb, &b );
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
bli_copym( &c, &c_save );
#if 0 //def BLIS
bli_ind_disable_all_dt( dt );
bli_ind_enable_dt( ind, dt );
#endif
#ifdef EIGEN
double alpha_r, alpha_i;
bli_getsc( &alpha, &alpha_r, &alpha_i );
void* ap = bli_obj_buffer_at_off( &a );
void* bp = bli_obj_buffer_at_off( &b );
void* cp = bli_obj_buffer_at_off( &c );
#ifdef COL_STORAGE
const int os_a = bli_obj_col_stride( &a );
const int os_b = bli_obj_col_stride( &b );
const int os_c = bli_obj_col_stride( &c );
#else
const int os_a = bli_obj_row_stride( &a );
const int os_b = bli_obj_row_stride( &b );
const int os_c = bli_obj_row_stride( &c );
#endif
Stride<Dynamic,1> stride_a( os_a, 1 );
Stride<Dynamic,1> stride_b( os_b, 1 );
Stride<Dynamic,1> stride_c( os_c, 1 );
#ifdef COL_STORAGE
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
#endif
#else
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
#endif
#endif
#if defined(IS_FLOAT)
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
#elif defined (IS_DOUBLE)
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
#elif defined (IS_SCOMPLEX)
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
#elif defined (IS_DCOMPLEX)
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
#endif
#endif
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
#if defined(BLIS)
bli_gemm( &alpha,
&a,
&b,
&beta,
&c );
#elif defined(EIGEN)
C.noalias() += alpha_r * A * B;
#else // if defined(BLAS)
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = ( float* )bli_obj_buffer( &alpha );
float* ap = ( float* )bli_obj_buffer( &a );
float* bp = ( float* )bli_obj_buffer( &b );
float* betap = ( float* )bli_obj_buffer( &beta );
float* cp = ( float* )bli_obj_buffer( &c );
sgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = ( double* )bli_obj_buffer( &alpha );
double* ap = ( double* )bli_obj_buffer( &a );
double* bp = ( double* )bli_obj_buffer( &b );
double* betap = ( double* )bli_obj_buffer( &beta );
double* cp = ( double* )bli_obj_buffer( &c );
dgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
cgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
zgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )m,
( unsigned long )k,
( unsigned long )n, gflops );
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#ifdef EIGEN
#define BLIS_DISABLE_BLAS_DEFS
#include "blis.h"
#include <Eigen/Core>
#include <Eigen/src/misc/blas.h>
using namespace Eigen;
#else
#include "blis.h"
#endif
#define COL_STORAGE
//#define ROW_STORAGE
//#define PRINT
// Benchmark driver for gemm.
//
// For each problem size p, stepping from p_max down to p_begin by p_inc, this
// routine creates the operands a, b, c (plus a backup copy c_save), runs one
// gemm implementation n_repeats times, keeps the best time, and prints one
// matlab/octave-style assignment line holding m, k, n and the GFLOPS rate.
//
// The implementation that is timed is chosen by the preprocessor:
//   - BLIS defined:  bli_gemm()
//   - EIGEN defined: an Eigen expression on Map views of the same buffers
//   - otherwise:     the Fortran-77 BLAS routine sgemm_/dgemm_/cgemm_/zgemm_
//                    matching the datatype dt
//
// DT, IND, P_BEGIN, P_MAX, P_INC, THR_STR and STR are not defined in this
// function; presumably they are supplied on the compiler command line --
// TODO confirm against the Makefile.
//
// argc and argv are not referenced. Returns 0 (or calls exit(1) after the
// first repeat when PRINT is defined).
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, n, k;
dim_t p;
dim_t p_begin, p_max, p_inc;
int m_input, n_input, k_input;
ind_t ind;
num_t dt;
char dt_ch;
int r, n_repeats;
trans_t transa;
trans_t transb;
f77_char f77_transa;
f77_char f77_transb;
double dtime;
double dtime_save;
double gflops;
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
// Number of timed repeats per problem size; the best one is reported.
n_repeats = 3;
dt = DT;
ind = IND;
// Problem-size sweep. The active branch takes its bounds from macros; the
// inactive branch holds hard-coded bounds. In both, the *_input values are
// -1, which makes m, n and k all follow p (see the top of the sweep loop).
#if 1
p_begin = P_BEGIN;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
n_input = -1;
k_input = -1;
#else
p_begin = 40;
p_max = 1000;
p_inc = 40;
m_input = -1;
n_input = -1;
k_input = -1;
#endif
// Suppress compiler warnings about unused variable 'ind'.
( void )ind;
// Disabled code: would override k_input with the KC blocksize queried from
// an induced-method context. Neither branch below currently changes k_input.
#if 0
cntx_t* cntx;
ind_t ind_mod = ind;
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
// Initialize a context for the current induced method and datatype.
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
// Set k to the kc blocksize for the current datatype.
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
//k_input = 256;
#endif
// Choose the char corresponding to the requested datatype.
if ( bli_is_float( dt ) ) dt_ch = 's';
else if ( bli_is_double( dt ) ) dt_ch = 'd';
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
else dt_ch = 'z';
// Both operands are used untransposed; the f77_* chars are the equivalent
// arguments passed to the BLAS ?gemm_ calls below.
transa = BLIS_NO_TRANSPOSE;
transb = BLIS_NO_TRANSPOSE;
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
// The loop body is intentionally empty: it only advances p to the last value
// on the p_begin + i*p_inc grid for which p + p_inc would exceed p_max.
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0,
( unsigned long )0,
( unsigned long )0, 0.0 );
// Sweep the problem sizes from largest to smallest (the ascending form is
// kept commented out).
// NOTE(review): the descending sweep starts at p_max itself, so if
// (p_max - p_begin) is not a multiple of p_inc the sizes visited are not the
// ones on the ascending grid used for the preallocation line above.
//for ( p = p_begin; p <= p_max; p += p_inc )
for ( p = p_max; p_begin <= p; p -= p_inc )
{
// A negative *_input ties the dimension to p, divided by |*_input|; a
// non-negative *_input is used directly as the dimension.
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
// alpha and beta are 1x1 objects of datatype dt.
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
// a is m x k, b is k x n, c and c_save are m x n.
// Under COL_STORAGE both strides are passed as 0 -- presumably this asks
// BLIS for its default column-major layout; TODO confirm. The #else branch
// passes explicit row-major strides (row stride = number of columns,
// column stride = 1).
#ifdef COL_STORAGE
bli_obj_create( dt, m, k, 0, 0, &a );
bli_obj_create( dt, k, n, 0, 0, &b );
bli_obj_create( dt, m, n, 0, 0, &c );
bli_obj_create( dt, m, n, 0, 0, &c_save );
#else
bli_obj_create( dt, m, k, k, 1, &a );
bli_obj_create( dt, k, n, n, 1, &b );
bli_obj_create( dt, m, n, n, 1, &c );
bli_obj_create( dt, m, n, n, 1, &c_save );
#endif
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
bli_obj_set_conjtrans( transa, &a );
bli_obj_set_conjtrans( transb, &b );
// alpha = 2.0 and beta = 1.0; the second argument (0.0) is presumably the
// imaginary part, matching the real/imag pair read back by bli_getsc()
// in the EIGEN section.
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
// Keep a backup of c; it is copied back into c before every timed repeat.
bli_copym( &c, &c_save );
#if 0 //def BLIS
bli_ind_disable_all_dt( dt );
bli_ind_enable_dt( ind, dt );
#endif
// EIGEN only: build Eigen Map objects A, B, C over the buffers owned by the
// BLIS objects a, b, c so the Eigen expression in the timed loop operates on
// the same data.
#ifdef EIGEN
// Only alpha_r is used by the Eigen expression below; alpha_i is read but
// never applied.
double alpha_r, alpha_i;
bli_getsc( &alpha, &alpha_r, &alpha_i );
void* ap = bli_obj_buffer_at_off( &a );
void* bp = bli_obj_buffer_at_off( &b );
void* cp = bli_obj_buffer_at_off( &c );
// Outer stride handed to Eigen: the column stride for column storage, the
// row stride otherwise. The values are narrowed to int here.
#ifdef COL_STORAGE
const int os_a = bli_obj_col_stride( &a );
const int os_b = bli_obj_col_stride( &b );
const int os_c = bli_obj_col_stride( &c );
#else
const int os_a = bli_obj_row_stride( &a );
const int os_b = bli_obj_row_stride( &b );
const int os_c = bli_obj_row_stride( &c );
#endif
// Dynamic outer stride, fixed inner stride of 1.
Stride<Dynamic,1> stride_a( os_a, 1 );
Stride<Dynamic,1> stride_b( os_b, 1 );
Stride<Dynamic,1> stride_c( os_c, 1 );
// Pick the Eigen matrix type for the storage order and for whichever one of
// IS_FLOAT / IS_DOUBLE / IS_SCOMPLEX / IS_DCOMPLEX is defined.
#ifdef COL_STORAGE
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
#endif
#else
#if defined(IS_FLOAT)
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
#elif defined (IS_DOUBLE)
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
#elif defined (IS_SCOMPLEX)
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
#elif defined (IS_DCOMPLEX)
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
#endif
#endif
// Map A (m x k), B (k x n) and C (m x n) onto the buffers obtained above.
#if defined(IS_FLOAT)
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
#elif defined (IS_DOUBLE)
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
#elif defined (IS_SCOMPLEX)
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
#elif defined (IS_DCOMPLEX)
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
#endif
#endif
// Start from the largest double so that the first measured time becomes the
// running minimum (presumably what bli_clock_min_diff() maintains -- confirm).
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
// Restore c from the backup so every repeat starts from the same data.
bli_copym( &c_save, &c );
// Timed region begins here. Note that the optional PRINT output below and
// the per-call argument setup of the BLAS branch fall inside it.
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
#if defined(BLIS)
bli_gemm( &alpha,
&a,
&b,
&beta,
&c );
#elif defined(EIGEN)
// Accumulates alpha_r * A * B into C; beta is not referenced here (it was
// set to 1.0 above).
C.noalias() += alpha_r * A * B;
#else // if defined(BLAS)
// One branch per datatype, identical except for the element type and the
// ?gemm_ routine called.
// NOTE(review): lda/ldb/ldc are always taken from the column stride, even
// when COL_STORAGE is not defined -- confirm that the BLAS build is only
// used with column storage.
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = ( float* )bli_obj_buffer( &alpha );
float* ap = ( float* )bli_obj_buffer( &a );
float* bp = ( float* )bli_obj_buffer( &b );
float* betap = ( float* )bli_obj_buffer( &beta );
float* cp = ( float* )bli_obj_buffer( &c );
sgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = ( double* )bli_obj_buffer( &alpha );
double* ap = ( double* )bli_obj_buffer( &a );
double* bp = ( double* )bli_obj_buffer( &b );
double* betap = ( double* )bli_obj_buffer( &beta );
double* cp = ( double* )bli_obj_buffer( &c );
dgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
cgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
zgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
// Debug mode: print the result and terminate the whole program after the
// first repeat of the first (largest) problem size.
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
// Fold this repeat's elapsed time into the running best time.
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
// 2*m*k*n floating-point operations divided by the best time, in units of
// 1e9 flops per second; the count is scaled by 4 for complex datatypes.
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
// Emit one result row; (p - p_begin)/p_inc + 1 is the 1-based row index
// (integer division).
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )m,
( unsigned long )k,
( unsigned long )n, gflops );
// Release every object created for this problem size.
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}

View File

@@ -96,7 +96,7 @@ endif
BLAS_LIB_PATH := $(HOME)/flame/lib
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
#MKL_LIB_PATH := ${MKLROOT}/lib/intel64
MKL_LIB_PATH := ${MKLROOT}/lib/intel64
#ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
# OpenBLAS
@@ -165,21 +165,23 @@ CFLAGS += -I$(TEST_SRC_PATH)
#all: blis openblas atlas mkl
all: blis openblas mkl
blis: test_gemm_blis.x \
# test_dotv_blis.x \
# test_axpyv_blis.x \
# test_gemv_blis.x \
# test_ger_blis.x \
# test_hemv_blis.x \
# test_her_blis.x \
# test_her2_blis.x \
# test_trmv_blis.x \
# test_trsv_blis.x \
# test_hemm_blis.x \
# test_herk_blis.x \
# test_her2k_blis.x \
# test_trmm_blis.x \
# test_trsm_blis.x
blis: \
test_dotv_blis.x \
test_axpyv_blis.x \
test_gemv_blis.x \
test_ger_blis.x \
test_hemv_blis.x \
test_her_blis.x \
test_her2_blis.x \
test_trmv_blis.x \
test_trsv_blis.x \
\
test_gemm_blis.x \
test_hemm_blis.x \
test_herk_blis.x \
test_her2k_blis.x \
test_trmm_blis.x \
test_trsm_blis.x
openblas: \
test_dotv_openblas.x \

View File

@@ -5,13 +5,12 @@ out_root="output"
#out_root="output_square"
# Operations to test.
# l2_ops="gemv ger hemv her her2 trmv trsv"
l3_ops="gemm"
# "hemm herk her2k trmm trsm"
test_ops=" ${l3_ops}"
# "${l2_ops}"
l2_ops="gemv ger hemv her her2 trmv trsv"
l3_ops="gemm hemm herk her2k trmm trsm"
test_ops="${l2_ops} ${l3_ops}"
# Implementations to test | "openblas atlas mkl"
# Implementations to test.
#test_impls="openblas mkl blis"
test_impls="blis"
for im in ${test_impls}; do
@@ -24,7 +23,7 @@ for im in ${test_impls}; do
# Construct the name of the output file.
out_file="${out_root}_${op}_${im}.m"
echo " Running ${exec_name} > ${out_file} "
echo "Running ${exec_name} > ${out_file}"
# Run executable.
./${exec_name} > ${out_file}

View File

@@ -8,8 +8,8 @@
# accepted values.
#
3 # Number of repeats per experiment (best result is reported)
c # Matrix storage scheme(s) to test:
1 # Number of repeats per experiment (best result is reported)
rc # Matrix storage scheme(s) to test:
# 'c' = col-major storage; 'g' = general stride storage;
# 'r' = row-major storage
cj # Vector storage scheme(s) to test:
@@ -22,14 +22,14 @@ cj # Vector storage scheme(s) to test:
# '0' = real values on [-1,1];
# '1' = powers of 2 in narrow precision range
32 # General stride spacing (for cases when testing general stride)
d # Datatype(s) to test:
sdcz # Datatype(s) to test:
# 's' = single real; 'c' = single complex;
# 'd' = double real; 'z' = double complex
0 # Test gemm with mixed-domain operands?
0 # Test gemm with mixed-precision operands?
2000 # Problem size: first to test
2000 # Problem size: maximum to test
200 # Problem size: increment between experiments
100 # Problem size: first to test
500 # Problem size: maximum to test
100 # Problem size: increment between experiments
# Complex level-3 implementations to test:
0 # 3mh ('1' = enable; '0' = disable)
0 # 3m1 ('1' = enable; '0' = disable)
@@ -45,5 +45,5 @@ d # Datatype(s) to test:
# '0' = disable error checking; '1' = full error checking
i # Reaction to test failure:
# 'i' = ignore; 's' = sleep() and continue; 'a' = abort
1 # Output results in matlab/octave format? ('1' = yes; '0' = no)
0 # Output results in matlab/octave format? ('1' = yes; '0' = no)
0 # Output results to stdout AND files? ('1' = yes; '0' = no)

View File

@@ -276,9 +276,9 @@
# --- Level-3 --------------------------------------------------------------
2 # gemm
-1 -1 -1 # dimensions: m n k
nn # parameters: transa transb
1 # gemm
-1 -1 -1 # dimensions: m n k
?? # parameters: transa transb
1 # hemm
-1 -1 # dimensions: m n

View File

@@ -259,6 +259,7 @@ void libblis_test_gemm_experiment
libblis_test_mobj_randomize( params, TRUE, &b );
libblis_test_mobj_randomize( params, TRUE, &c );
bli_copym( &c, &c_save );
//bli_setm( &BLIS_ONE, &a );
//bli_setsc( 1.0, 0.0, &alpha );
//bli_setsc( 0.0, 0.0, &beta );
@@ -272,23 +273,11 @@ void libblis_test_gemm_experiment
{
bli_copym( &c_save, &c );
#if 0
bli_printm( "alpha", &alpha, "%5.2f", "" );
bli_printm( "beta", &beta, "%5.2f", "" );
bli_printm( "a = [", &a, "%7.6f", "];" );
bli_printm( "b = [", &b, "%7.6f", "];" );
bli_printm( "c = [", &c, "%7.6f", "];" );
#endif
time = bli_clock();
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
time_min = bli_clock_min_diff( time_min, time );
#if 0
bli_printm( "c_after = [", &c, "%7.6f", "];" );
#endif
}
// Estimate the performance of the best experiment repeat.
@@ -417,6 +406,7 @@ void libblis_test_gemm_md
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
time_min = bli_clock_min_diff( time_min, time );
}
// Estimate the performance of the best experiment repeat.
@@ -453,18 +443,20 @@ void libblis_test_gemm_impl
{
case BLIS_TEST_SEQ_FRONT_END:
#if 0
bli_printm( "alpha", alpha, "%5.2f", "" );
bli_printm( "beta", beta, "%5.2f", "" );
bli_printm( "a", a, "%6.3f", "" );
bli_printm( "b", b, "%6.3f", "" );
bli_printm( "c", c, "%6.3f", "" );
//bli_printm( "alpha", alpha, "%5.2f", "" );
//bli_printm( "beta", beta, "%5.2f", "" );
bli_printm( "a", a, "%5.2f", "" );
bli_printm( "b", b, "%5.2f", "" );
bli_printm( "c", c, "%5.2f", "" );
#endif
//if ( bli_obj_length( b ) == 16 &&
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
//bli_printm( "c before", c, "%6.3f", "" );
bli_gemm( alpha, a, b, beta, c );
#if 0
bli_printm( "c after", c, "%6.3f", "");
if ( bli_obj_length( c ) == 12 &&
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
bli_printm( "c after", c, "%6.3f", "" );
#endif
break;