mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Reverted minor temporary and whitespace changes from b426f9e.
Details:
- Added missing license header to bli_pwr9_asm_macros_12x6.h.
- Reverted temporary changes to various files in 'test' and 'testsuite' directories.
- Moved testsuite/jobscripts into testsuite/old.
- Minor whitespace/comment changes across various files.
This commit is contained in:
@@ -49,40 +49,40 @@ void bli_cntx_init_power9( cntx_t* cntx )
|
||||
bli_cntx_init_power9_ref( cntx );
|
||||
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
1,
|
||||
//BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
|
||||
cntx
|
||||
);
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
1,
|
||||
//BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_power9_ref, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized packm kernels.
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
1,
|
||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
|
||||
cntx
|
||||
);
|
||||
// Update the context with optimized packm kernels.
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
1,
|
||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
|
||||
cntx
|
||||
);
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
|
||||
-1, 12, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1,
|
||||
-1, 12, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 );
|
||||
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
cntx
|
||||
);
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ cortexa15: cortexa15/armv7a
|
||||
cortexa9: cortexa9/armv7a
|
||||
|
||||
# IBM architectures.
|
||||
power9: power9
|
||||
power9: power9
|
||||
bgq: bgq
|
||||
|
||||
# Generic architectures.
|
||||
|
||||
2
configure
vendored
2
configure
vendored
@@ -1430,7 +1430,7 @@ check_compiler()
|
||||
# Thus, this "blacklistcc_add" statement has been moved above.
|
||||
#blacklistcc_add "zen"
|
||||
blacklistcc_add "skx"
|
||||
# GCC-5 may support POWER9 but it is unverified.
|
||||
# gcc 5.x may support POWER9 but it is unverified.
|
||||
blacklistcc_add "power9"
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -66,7 +66,6 @@ void bli_gemm_front
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
@@ -83,7 +82,6 @@ void bli_gemm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
cntx_t cntx_local;
|
||||
|
||||
@@ -150,7 +148,6 @@ void bli_gemm_front
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
@@ -278,7 +275,6 @@ void bli_gemm_front
|
||||
cntl
|
||||
);
|
||||
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
|
||||
// If we created a temporary matrix conformal to C for whatever reason,
|
||||
|
||||
@@ -167,7 +167,7 @@ void bli_gemm_ker_var2
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( schema_a,
|
||||
|
||||
@@ -142,7 +142,7 @@ void bli_arch_set_id( void )
|
||||
|
||||
// IBM microarchitectures.
|
||||
#ifdef BLIS_FAMILY_POWER9
|
||||
id = BLIS_ARCH_POWER9;
|
||||
id = BLIS_ARCH_POWER9;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_POWER7
|
||||
id = BLIS_ARCH_POWER7;
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
@@ -16,6 +19,7 @@
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
@@ -27,6 +31,7 @@
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_CNTX_H
|
||||
@@ -40,22 +45,28 @@ typedef struct cntx_s
|
||||
{
|
||||
blksz_t* blkszs;
|
||||
bszid_t* bmults;
|
||||
|
||||
func_t* l3_vir_ukrs;
|
||||
func_t* l3_nat_ukrs;
|
||||
mbool_t* l3_nat_ukrs_prefs;
|
||||
|
||||
blksz_t* l3_sup_thresh;
|
||||
void** l3_sup_handlers;
|
||||
blksz_t* l3_sup_blkszs;
|
||||
func_t* l3_sup_kers;
|
||||
mbool_t* l3_sup_kers_prefs;
|
||||
|
||||
func_t* l1f_kers;
|
||||
func_t* l1v_kers;
|
||||
|
||||
func_t* packm_kers;
|
||||
func_t* unpackm_kers;
|
||||
|
||||
ind_t method;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
} cntx_t;
|
||||
*/
|
||||
|
||||
|
||||
@@ -99,7 +99,7 @@ CNTX_INIT_PROTS( cortexa15 )
|
||||
CNTX_INIT_PROTS( cortexa9 )
|
||||
#endif
|
||||
|
||||
// -- IBM BG/Q --
|
||||
// -- IBM Power --
|
||||
|
||||
#ifdef BLIS_CONFIG_POWER9
|
||||
CNTX_INIT_PROTS( power9 )
|
||||
@@ -107,6 +107,9 @@ CNTX_INIT_PROTS( power9 )
|
||||
#ifdef BLIS_CONFIG_POWER7
|
||||
CNTX_INIT_PROTS( power7 )
|
||||
#endif
|
||||
|
||||
// -- IBM BG/Q --
|
||||
|
||||
#ifdef BLIS_CONFIG_BGQ
|
||||
CNTX_INIT_PROTS( bgq )
|
||||
#endif
|
||||
@@ -193,7 +196,7 @@ CNTX_INIT_PROTS( generic )
|
||||
#include "bli_family_cortexa9.h"
|
||||
#endif
|
||||
|
||||
// -- IBM BG/Q --
|
||||
// -- IBM Power --
|
||||
|
||||
#ifdef BLIS_FAMILY_POWER9
|
||||
#include "bli_family_power9.h"
|
||||
@@ -201,6 +204,9 @@ CNTX_INIT_PROTS( generic )
|
||||
#ifdef BLIS_FAMILY_POWER7
|
||||
#include "bli_family_power7.h"
|
||||
#endif
|
||||
|
||||
// -- IBM BG/Q --
|
||||
|
||||
#ifdef BLIS_FAMILY_BGQ
|
||||
#include "bli_family_bgq.h"
|
||||
#endif
|
||||
@@ -266,7 +272,7 @@ CNTX_INIT_PROTS( generic )
|
||||
#include "bli_kernels_armv7a.h"
|
||||
#endif
|
||||
|
||||
// -- IBM BG/Q --
|
||||
// -- IBM Power --
|
||||
|
||||
#ifdef BLIS_KERNELS_POWER9
|
||||
#include "bli_kernels_power9.h"
|
||||
@@ -274,6 +280,9 @@ CNTX_INIT_PROTS( generic )
|
||||
#ifdef BLIS_KERNELS_POWER7
|
||||
#include "bli_kernels_power7.h"
|
||||
#endif
|
||||
|
||||
// -- IBM BG/Q --
|
||||
|
||||
#ifdef BLIS_KERNELS_BGQ
|
||||
#include "bli_kernels_bgq.h"
|
||||
#endif
|
||||
|
||||
@@ -56,7 +56,7 @@ GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 )
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 )
|
||||
|
||||
|
||||
// gemm (asm d8x6)
|
||||
// gemm (asm d8x6)
|
||||
//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 )
|
||||
//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 )
|
||||
//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 )
|
||||
|
||||
@@ -1,3 +1,36 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// MACROS for power9_asm_d12x6
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -44,4 +44,4 @@ GEMM_UKR_PROT( double, d, gemm_power9_asm_18x4 )
|
||||
GEMM_UKR_PROT( double, d, gemm_power9_asm_16x4 )
|
||||
|
||||
// gemm (asm d4x16)
|
||||
GEMM_UKR_PROT( double, d, gemm_power9_asm_4x16 )
|
||||
GEMM_UKR_PROT( double, d, gemm_power9_asm_4x16 )
|
||||
|
||||
@@ -130,9 +130,9 @@ VENDORP_LIB := $(MKLP_LIB)
|
||||
#
|
||||
|
||||
# Single core (single-threaded)
|
||||
PS_BEGIN := 100
|
||||
PS_MAX := 1000
|
||||
PS_INC := 100
|
||||
PS_BEGIN := 48
|
||||
PS_MAX := 2400
|
||||
PS_INC := 48
|
||||
|
||||
# Single-socket (multithreaded)
|
||||
P1_BEGIN := 96
|
||||
@@ -242,8 +242,8 @@ blis-2s: blis-nat-2s
|
||||
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
|
||||
|
||||
# Define the datatypes, operations, and implementations.
|
||||
DTS := d # s d c z
|
||||
OPS := gemm # hemm herk trmm trsm
|
||||
DTS := s d c z
|
||||
OPS := gemm hemm herk trmm trsm
|
||||
BIMPLS := asm_blis openblas vendor
|
||||
EIMPLS := eigen
|
||||
|
||||
|
||||
@@ -1,464 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# Makefile
|
||||
#
|
||||
# Field G. Van Zee
|
||||
#
|
||||
# Makefile for standalone BLIS test drivers.
|
||||
#
|
||||
|
||||
#
|
||||
# --- Makefile PHONY target definitions ----------------------------------------
|
||||
#
|
||||
|
||||
.PHONY: all \
|
||||
clean cleanx
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Determine makefile fragment location -------------------------------------
|
||||
#
|
||||
|
||||
# Comments:
|
||||
# - DIST_PATH is assumed to not exist if BLIS_INSTALL_PATH is given.
|
||||
# - We must use recursively expanded assignment for LIB_PATH and INC_PATH in
|
||||
# the second case because CONFIG_NAME is not yet set.
|
||||
ifneq ($(strip $(BLIS_INSTALL_PATH)),)
|
||||
LIB_PATH := $(BLIS_INSTALL_PATH)/lib
|
||||
INC_PATH := $(BLIS_INSTALL_PATH)/include/blis
|
||||
SHARE_PATH := $(BLIS_INSTALL_PATH)/share/blis
|
||||
else
|
||||
DIST_PATH := ../..
|
||||
LIB_PATH = ../../lib/$(CONFIG_NAME)
|
||||
INC_PATH = ../../include/$(CONFIG_NAME)
|
||||
SHARE_PATH := ../..
|
||||
endif
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Include common makefile definitions --------------------------------------
|
||||
#
|
||||
|
||||
# Include the common makefile fragment.
|
||||
-include $(SHARE_PATH)/common.mk
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- BLAS implementations -----------------------------------------------------
|
||||
#
|
||||
|
||||
# BLAS library path(s). This is where the BLAS libraries reside.
|
||||
HOME_LIB_PATH := $(HOME)/flame/lib
|
||||
#VENDOR_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
|
||||
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
|
||||
#VENDOR_LIB_PATH := ${MKLROOT}/lib/intel64
|
||||
#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
|
||||
|
||||
# OpenBLAS
|
||||
OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a
|
||||
# OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a
|
||||
|
||||
# ATLAS
|
||||
#ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
|
||||
# $(HOME_LIB_PATH)/libatlas.a
|
||||
|
||||
# Eigen
|
||||
EIGEN_INC := $(HOME)/flame/eigen/include/eigen3
|
||||
EIGEN_LIB := $(HOME_LIB_PATH)/libeigen_blas_static.a
|
||||
EIGENP_LIB := $(EIGEN_LIB)
|
||||
|
||||
# MKL
|
||||
MKL_LIB := -L$(MKL_LIB_PATH) \
|
||||
-lmkl_intel_lp64 \
|
||||
-lmkl_core \
|
||||
-lmkl_sequential \
|
||||
-lpthread -lm -ldl
|
||||
#MKLP_LIB := -L$(MKL_LIB_PATH) \
|
||||
# -lmkl_intel_thread \
|
||||
# -lmkl_core \
|
||||
# -lmkl_intel_ilp64 \
|
||||
# -L$(ICC_LIB_PATH) \
|
||||
# -liomp5
|
||||
# MKLP_LIB := -L$(MKL_LIB_PATH) \
|
||||
# -lmkl_intel_lp64 \
|
||||
# -lmkl_core \
|
||||
# -lmkl_gnu_thread \
|
||||
# -lpthread -lm -ldl -fopenmp
|
||||
# #-L$(ICC_LIB_PATH) \
|
||||
# #-lgomp
|
||||
|
||||
VENDOR_LIB := $(MKL_LIB)
|
||||
VENDORP_LIB := $(MKLP_LIB)
|
||||
|
||||
|
||||
#
|
||||
# --- Problem size definitions -------------------------------------------------
|
||||
#
|
||||
|
||||
# Single core (single-threaded)
|
||||
PS_BEGIN := 100
|
||||
PS_MAX := 1000
|
||||
PS_INC := 100
|
||||
|
||||
# Single-socket (multithreaded)
|
||||
P1_BEGIN := 120
|
||||
P1_MAX := 6000
|
||||
P1_INC := 120
|
||||
|
||||
# Dual-socket (multithreaded)
|
||||
P2_BEGIN := 160
|
||||
P2_MAX := 8000
|
||||
P2_INC := 160
|
||||
|
||||
|
||||
#
|
||||
# --- General build definitions ------------------------------------------------
|
||||
#
|
||||
|
||||
TEST_SRC_PATH := .
|
||||
TEST_OBJ_PATH := .
|
||||
|
||||
# Gather all local object files.
|
||||
TEST_OBJS := $(sort $(patsubst $(TEST_SRC_PATH)/%.c, \
|
||||
$(TEST_OBJ_PATH)/%.o, \
|
||||
$(wildcard $(TEST_SRC_PATH)/*.c)))
|
||||
|
||||
# Override the value of CINCFLAGS so that the value of CFLAGS returned by
|
||||
# get-user-cflags-for() is not cluttered up with include paths needed only
|
||||
# while building BLIS.
|
||||
CINCFLAGS := -I$(INC_PATH)
|
||||
|
||||
# Use the "framework" CFLAGS for the configuration family.
|
||||
CFLAGS := $(call get-user-cflags-for,$(CONFIG_NAME))
|
||||
|
||||
# Add local header paths to CFLAGS.
|
||||
CFLAGS += -I$(TEST_SRC_PATH)
|
||||
|
||||
# Locate the libblis library to which we will link.
|
||||
#LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
|
||||
|
||||
# Define a set of CFLAGS for use with C++ and Eigen.
|
||||
CXXFLAGS := $(subst -std=c99,-std=c++11,$(CFLAGS))
|
||||
CXXFLAGS += -I$(EIGEN_INC)
|
||||
|
||||
# Create a copy of CXXFLAGS without -fopenmp in order to disable multithreading.
|
||||
CXXFLAGS_ST := -march=native $(subst -fopenmp,,$(CXXFLAGS))
|
||||
CXXFLAGS_MT := -march=native $(CXXFLAGS)
|
||||
|
||||
|
||||
# Which library?
|
||||
BLI_DEF := -DBLIS
|
||||
BLA_DEF := -DBLAS
|
||||
EIG_DEF := -DEIGEN
|
||||
|
||||
# Complex implementation type
|
||||
D3MHW := -DIND=BLIS_3MH
|
||||
D3M1 := -DIND=BLIS_3M1
|
||||
D4MHW := -DIND=BLIS_4MH
|
||||
D4M1B := -DIND=BLIS_4M1B
|
||||
D4M1A := -DIND=BLIS_4M1A
|
||||
D1M := -DIND=BLIS_1M
|
||||
DNAT := -DIND=BLIS_NAT
|
||||
|
||||
# Implementation string
|
||||
#STR_3MHW := -DSTR=\"3mhw\"
|
||||
#STR_3M1 := -DSTR=\"3m1\"
|
||||
#STR_4MHW := -DSTR=\"4mhw\"
|
||||
#STR_4M1B := -DSTR=\"4m1b\"
|
||||
#STR_4M1A := -DSTR=\"4m1a\"
|
||||
#STR_1M := -DSTR=\"1m\"
|
||||
STR_NAT := -DSTR=\"asm_blis\"
|
||||
STR_OBL := -DSTR=\"openblas\"
|
||||
STR_EIG := -DSTR=\"eigen\"
|
||||
STR_VEN := -DSTR=\"vendor\"
|
||||
|
||||
# Single or multithreaded string
|
||||
STR_ST := -DTHR_STR=\"st\"
|
||||
STR_1S := -DTHR_STR=\"1s\"
|
||||
STR_2S := -DTHR_STR=\"2s\"
|
||||
|
||||
# Problem size specification
|
||||
PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX)
|
||||
PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
|
||||
PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Targets/rules ------------------------------------------------------------
|
||||
#
|
||||
|
||||
all: all-st all-1s all-2s
|
||||
blis: blis-st blis-1s blis-2s
|
||||
openblas: openblas-st openblas-1s openblas-2s
|
||||
eigen: eigen-st eigen-1s eigen-2s
|
||||
vendor: vendor-st vendor-1s vendor-2s
|
||||
mkl: vendor
|
||||
armpl: vendor
|
||||
|
||||
all-st: blis-st openblas-st mkl-st
|
||||
all-1s: blis-1s openblas-1s mkl-1s
|
||||
all-2s: blis-2s openblas-2s mkl-2s
|
||||
|
||||
blis-st: blis-nat-st
|
||||
blis-1s: blis-nat-1s
|
||||
blis-2s: blis-nat-2s
|
||||
|
||||
#blis-ind: blis-ind-st blis-ind-mt
|
||||
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
|
||||
|
||||
# Define the datatypes, operations, and implementations.
|
||||
DTS := d #s d c z
|
||||
OPS := gemm #hemm herk trmm trsm
|
||||
IMPLS := asm_blis openblas vendor
|
||||
|
||||
# Define functions to construct object filenames from the datatypes and
|
||||
# operations given an implementation. We define one function for single-
|
||||
# threaded, single-socket, and dual-socket filenames.
|
||||
get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
|
||||
get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
|
||||
get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
|
||||
|
||||
# Construct object and binary names for single-threaded, single-socket, and
|
||||
# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
|
||||
BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
|
||||
BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
|
||||
BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
|
||||
BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
|
||||
BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
|
||||
BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
|
||||
|
||||
OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
|
||||
OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
|
||||
OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
|
||||
OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
|
||||
OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
|
||||
OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
|
||||
|
||||
EIGEN_ST_OBJS := $(call get-st-objs,eigen)
|
||||
EIGEN_ST_BINS := $(patsubst %.o,%.x,$(EIGEN_ST_OBJS))
|
||||
EIGEN_1S_OBJS := $(call get-1s-objs,eigen)
|
||||
EIGEN_1S_BINS := $(patsubst %.o,%.x,$(EIGEN_1S_OBJS))
|
||||
EIGEN_2S_OBJS := $(call get-2s-objs,eigen)
|
||||
EIGEN_2S_BINS := $(patsubst %.o,%.x,$(EIGEN_2S_OBJS))
|
||||
|
||||
VENDOR_ST_OBJS := $(call get-st-objs,vendor)
|
||||
VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
|
||||
VENDOR_1S_OBJS := $(call get-1s-objs,vendor)
|
||||
VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
|
||||
VENDOR_2S_OBJS := $(call get-2s-objs,vendor)
|
||||
VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
|
||||
|
||||
# Define some targets associated with the above object/binary files.
|
||||
blis-nat-st: $(BLIS_NAT_ST_BINS)
|
||||
blis-nat-1s: $(BLIS_NAT_1S_BINS)
|
||||
blis-nat-2s: $(BLIS_NAT_2S_BINS)
|
||||
|
||||
openblas-st: $(OPENBLAS_ST_BINS)
|
||||
openblas-1s: $(OPENBLAS_1S_BINS)
|
||||
openblas-2s: $(OPENBLAS_2S_BINS)
|
||||
|
||||
eigen-st: $(EIGEN_ST_BINS)
|
||||
eigen-1s: $(EIGEN_1S_BINS)
|
||||
eigen-2s: $(EIGEN_2S_BINS)
|
||||
|
||||
vendor-st: $(VENDOR_ST_BINS)
|
||||
vendor-1s: $(VENDOR_1S_BINS)
|
||||
vendor-2s: $(VENDOR_2S_BINS)
|
||||
|
||||
mkl-st: vendor-st
|
||||
mkl-1s: vendor-1s
|
||||
mkl-2s: vendor-2s
|
||||
|
||||
armpl-st: vendor-st
|
||||
armpl-1s: vendor-1s
|
||||
armpl-2s: vendor-2s
|
||||
|
||||
# Mark the object files as intermediate so that make will remove them
|
||||
# automatically after building the binaries on which they depend.
|
||||
.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(BLIS_NAT_1S_OBJS) $(BLIS_NAT_2S_OBJS)
|
||||
.INTERMEDIATE: $(OPENBLAS_ST_OBJS) $(OPENBLAS_1S_OBJS) $(OPENBLAS_2S_OBJS)
|
||||
.INTERMEDIATE: $(EIGEN_ST_OBJS) $(EIGEN_1S_OBJS) $(EIGEN_2S_OBJS)
|
||||
.INTERMEDIATE: $(VENDOR_ST_OBJS) $(VENDOR_1S_OBJS) $(VENDOR_2S_OBJS)
|
||||
|
||||
|
||||
# -- Object file rules --
|
||||
|
||||
#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
|
||||
# $(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
# A function to return the datatype cpp macro def from the datatype
|
||||
# character.
|
||||
get-dt-cpp = $(strip \
|
||||
$(if $(findstring s,$(1)),-DDT=BLIS_FLOAT -DIS_FLOAT,\
|
||||
$(if $(findstring d,$(1)),-DDT=BLIS_DOUBLE -DIS_DOUBLE,\
|
||||
$(if $(findstring c,$(1)),-DDT=BLIS_SCOMPLEX -DIS_SCOMPLEX,\
|
||||
-DDT=BLIS_DCOMPLEX -DIS_DCOMPLEX))))
|
||||
|
||||
# A function to return other cpp macros that help the test driver
|
||||
# identify the implementation.
|
||||
#get-bl-cpp = $(strip \
|
||||
# $(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
|
||||
# $(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
|
||||
# $(if $(findstring eigen,$(1)),$(STR_EIG) $(EIG_DEF),\
|
||||
# $(STR_VEN) $(BLA_DEF)))))
|
||||
|
||||
get-bl-cpp = $(strip \
|
||||
$(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
|
||||
$(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
|
||||
$(if $(and $(findstring eigen,$(1)),\
|
||||
$(findstring gemm,$(2))),\
|
||||
$(STR_EIG) $(EIG_DEF),\
|
||||
$(if $(findstring eigen,$(1)),\
|
||||
$(STR_EIG) $(BLA_DEF),\
|
||||
$(STR_VEN) $(BLA_DEF))))))
|
||||
|
||||
|
||||
# Rules for BLIS and BLAS libraries.
|
||||
define make-st-rule
|
||||
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-1s-rule
|
||||
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-2s-rule
|
||||
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(BIMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(BIMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(BIMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
# Rules for Eigen.
|
||||
define make-eigst-rule
|
||||
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(op).c Makefile
|
||||
$(CXX) $(CXXFLAGS_ST) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_ST) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-eig1s-rule
|
||||
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(op).c Makefile
|
||||
$(CXX) $(CXXFLAGS_MT) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_1S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
define make-eig2s-rule
|
||||
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(op).c Makefile
|
||||
$(CXX) $(CXXFLAGS_MT) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3),$(2)) $(DNAT) $(STR_2S) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(EIMPLS),$(eval $(call make-eigst-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(EIMPLS),$(eval $(call make-eig1s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach op,$(OPS), \
|
||||
$(foreach im,$(EIMPLS),$(eval $(call make-eig2s-rule,$(dt),$(op),$(im))))))
|
||||
|
||||
|
||||
# -- Executable file rules --
|
||||
|
||||
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
|
||||
# on the link command line in case BLIS was configured with the BLAS
|
||||
# compatibility layer. This prevents BLIS from inadvertently getting called
|
||||
# for the BLAS routines we are trying to test with.
|
||||
|
||||
test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
test_%_$(PS_MAX)_eigen_st.x: test_%_$(PS_MAX)_eigen_st.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(EIGEN_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_eigen_1s.x: test_%_$(P1_MAX)_eigen_1s.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_eigen_2s.x: test_%_$(P2_MAX)_eigen_2s.o $(LIBBLIS_LINK)
|
||||
$(CXX) $(strip $< $(EIGENP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK)
|
||||
$(CC) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
|
||||
|
||||
|
||||
# -- Clean rules --
|
||||
|
||||
clean: cleanx
|
||||
|
||||
cleanx:
|
||||
- $(RM_F) *.o *.x
|
||||
|
||||
@@ -65,15 +65,16 @@ elif [ ${sys} = "ul264" ]; then
|
||||
fi
|
||||
|
||||
# Datatypes to test.
|
||||
test_dts="d " #s z c"
|
||||
test_dts="d s z c"
|
||||
|
||||
# Operations to test.
|
||||
test_ops="gemm "#hemm herk trmm trsm"
|
||||
test_ops="gemm hemm herk trmm trsm"
|
||||
|
||||
# Implementations to test.
|
||||
#impls="all"
|
||||
#impls="other"
|
||||
impls="blis"
|
||||
#impls="other"
|
||||
#impls="eigen"
|
||||
#impls="all"
|
||||
|
||||
if [ "${impls}" = "blis" ]; then
|
||||
|
||||
|
||||
@@ -1,418 +1,418 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#ifdef EIGEN
|
||||
#define BLIS_DISABLE_BLAS_DEFS
|
||||
#include "blis.h"
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/src/misc/blas.h>
|
||||
using namespace Eigen;
|
||||
#else
|
||||
#include "blis.h"
|
||||
#endif
|
||||
|
||||
#define COL_STORAGE
|
||||
//#define ROW_STORAGE
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, n, k;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_max, p_inc;
|
||||
int m_input, n_input, k_input;
|
||||
ind_t ind;
|
||||
num_t dt;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
dt = DT;
|
||||
|
||||
ind = IND;
|
||||
|
||||
#if 1
|
||||
p_begin = P_BEGIN;
|
||||
p_max = P_MAX;
|
||||
p_inc = P_INC;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#else
|
||||
p_begin = 40;
|
||||
p_max = 1000;
|
||||
p_inc = 40;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#endif
|
||||
|
||||
|
||||
// Suppress compiler warnings about unused variable 'ind'.
|
||||
( void )ind;
|
||||
|
||||
#if 0
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
ind_t ind_mod = ind;
|
||||
|
||||
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
|
||||
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
|
||||
|
||||
// Initialize a context for the current induced method and datatype.
|
||||
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
|
||||
|
||||
// Set k to the kc blocksize for the current datatype.
|
||||
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
|
||||
|
||||
#elif 1
|
||||
|
||||
//k_input = 256;
|
||||
|
||||
#endif
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
transb = BLIS_NO_TRANSPOSE;
|
||||
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
//for ( p = p_begin; p <= p_max; p += p_inc )
|
||||
for ( p = p_max; p_begin <= p; p -= p_inc )
|
||||
{
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
bli_obj_create( dt, k, n, 0, 0, &b );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
#else
|
||||
bli_obj_create( dt, m, k, k, 1, &a );
|
||||
bli_obj_create( dt, k, n, n, 1, &b );
|
||||
bli_obj_create( dt, m, n, n, 1, &c );
|
||||
bli_obj_create( dt, m, n, n, 1, &c_save );
|
||||
#endif
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
bli_setsc( (1.0/1.0), 0.0, &beta );
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#if 0 //def BLIS
|
||||
bli_ind_disable_all_dt( dt );
|
||||
bli_ind_enable_dt( ind, dt );
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN
|
||||
double alpha_r, alpha_i;
|
||||
|
||||
bli_getsc( &alpha, &alpha_r, &alpha_i );
|
||||
|
||||
void* ap = bli_obj_buffer_at_off( &a );
|
||||
void* bp = bli_obj_buffer_at_off( &b );
|
||||
void* cp = bli_obj_buffer_at_off( &c );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
const int os_a = bli_obj_col_stride( &a );
|
||||
const int os_b = bli_obj_col_stride( &b );
|
||||
const int os_c = bli_obj_col_stride( &c );
|
||||
#else
|
||||
const int os_a = bli_obj_row_stride( &a );
|
||||
const int os_b = bli_obj_row_stride( &b );
|
||||
const int os_c = bli_obj_row_stride( &c );
|
||||
#endif
|
||||
|
||||
Stride<Dynamic,1> stride_a( os_a, 1 );
|
||||
Stride<Dynamic,1> stride_b( os_b, 1 );
|
||||
Stride<Dynamic,1> stride_c( os_c, 1 );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
|
||||
#endif
|
||||
#else
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
|
||||
#endif
|
||||
#endif
|
||||
#if defined(IS_FLOAT)
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DOUBLE)
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "b", &b, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#if defined(BLIS)
|
||||
|
||||
bli_gemm( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#elif defined(EIGEN)
|
||||
|
||||
C.noalias() += alpha_r * A * B;
|
||||
|
||||
#else // if defined(BLAS)
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = ( float* )bli_obj_buffer( &alpha );
|
||||
float* ap = ( float* )bli_obj_buffer( &a );
|
||||
float* bp = ( float* )bli_obj_buffer( &b );
|
||||
float* betap = ( float* )bli_obj_buffer( &beta );
|
||||
float* cp = ( float* )bli_obj_buffer( &c );
|
||||
|
||||
sgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = ( double* )bli_obj_buffer( &alpha );
|
||||
double* ap = ( double* )bli_obj_buffer( &a );
|
||||
double* bp = ( double* )bli_obj_buffer( &b );
|
||||
double* betap = ( double* )bli_obj_buffer( &beta );
|
||||
double* cp = ( double* )bli_obj_buffer( &c );
|
||||
|
||||
dgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
|
||||
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
|
||||
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
|
||||
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
|
||||
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
|
||||
|
||||
cgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
|
||||
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
|
||||
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
|
||||
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
|
||||
|
||||
zgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )k,
|
||||
( unsigned long )n, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#ifdef EIGEN
|
||||
#define BLIS_DISABLE_BLAS_DEFS
|
||||
#include "blis.h"
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/src/misc/blas.h>
|
||||
using namespace Eigen;
|
||||
#else
|
||||
#include "blis.h"
|
||||
#endif
|
||||
|
||||
#define COL_STORAGE
|
||||
//#define ROW_STORAGE
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, n, k;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_max, p_inc;
|
||||
int m_input, n_input, k_input;
|
||||
ind_t ind;
|
||||
num_t dt;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
dt = DT;
|
||||
|
||||
ind = IND;
|
||||
|
||||
#if 1
|
||||
p_begin = P_BEGIN;
|
||||
p_max = P_MAX;
|
||||
p_inc = P_INC;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#else
|
||||
p_begin = 40;
|
||||
p_max = 1000;
|
||||
p_inc = 40;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#endif
|
||||
|
||||
|
||||
// Suppress compiler warnings about unused variable 'ind'.
|
||||
( void )ind;
|
||||
|
||||
#if 0
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
ind_t ind_mod = ind;
|
||||
|
||||
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
|
||||
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
|
||||
|
||||
// Initialize a context for the current induced method and datatype.
|
||||
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
|
||||
|
||||
// Set k to the kc blocksize for the current datatype.
|
||||
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
|
||||
|
||||
#elif 1
|
||||
|
||||
//k_input = 256;
|
||||
|
||||
#endif
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
transb = BLIS_NO_TRANSPOSE;
|
||||
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
//for ( p = p_begin; p <= p_max; p += p_inc )
|
||||
for ( p = p_max; p_begin <= p; p -= p_inc )
|
||||
{
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
bli_obj_create( dt, k, n, 0, 0, &b );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
#else
|
||||
bli_obj_create( dt, m, k, k, 1, &a );
|
||||
bli_obj_create( dt, k, n, n, 1, &b );
|
||||
bli_obj_create( dt, m, n, n, 1, &c );
|
||||
bli_obj_create( dt, m, n, n, 1, &c_save );
|
||||
#endif
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
bli_setsc( (1.0/1.0), 0.0, &beta );
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#if 0 //def BLIS
|
||||
bli_ind_disable_all_dt( dt );
|
||||
bli_ind_enable_dt( ind, dt );
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN
|
||||
double alpha_r, alpha_i;
|
||||
|
||||
bli_getsc( &alpha, &alpha_r, &alpha_i );
|
||||
|
||||
void* ap = bli_obj_buffer_at_off( &a );
|
||||
void* bp = bli_obj_buffer_at_off( &b );
|
||||
void* cp = bli_obj_buffer_at_off( &c );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
const int os_a = bli_obj_col_stride( &a );
|
||||
const int os_b = bli_obj_col_stride( &b );
|
||||
const int os_c = bli_obj_col_stride( &c );
|
||||
#else
|
||||
const int os_a = bli_obj_row_stride( &a );
|
||||
const int os_b = bli_obj_row_stride( &b );
|
||||
const int os_c = bli_obj_row_stride( &c );
|
||||
#endif
|
||||
|
||||
Stride<Dynamic,1> stride_a( os_a, 1 );
|
||||
Stride<Dynamic,1> stride_b( os_b, 1 );
|
||||
Stride<Dynamic,1> stride_c( os_c, 1 );
|
||||
|
||||
#ifdef COL_STORAGE
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcd_;
|
||||
#endif
|
||||
#else
|
||||
#if defined(IS_FLOAT)
|
||||
typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXf_;
|
||||
#elif defined (IS_DOUBLE)
|
||||
typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXd_;
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcf_;
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcd_;
|
||||
#endif
|
||||
#endif
|
||||
#if defined(IS_FLOAT)
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > A( ( float* )ap, m, k, stride_a );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > B( ( float* )bp, k, n, stride_b );
|
||||
Map<MatrixXf_, 0, Stride<Dynamic,1> > C( ( float* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DOUBLE)
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > A( ( double* )ap, m, k, stride_a );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > B( ( double* )bp, k, n, stride_b );
|
||||
Map<MatrixXd_, 0, Stride<Dynamic,1> > C( ( double* )cp, m, n, stride_c );
|
||||
#elif defined (IS_SCOMPLEX)
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > A( ( std::complex<float>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > B( ( std::complex<float>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcf_, 0, Stride<Dynamic,1> > C( ( std::complex<float>* )cp, m, n, stride_c );
|
||||
#elif defined (IS_DCOMPLEX)
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > A( ( std::complex<double>* )ap, m, k, stride_a );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > B( ( std::complex<double>* )bp, k, n, stride_b );
|
||||
Map<MatrixXcd_, 0, Stride<Dynamic,1> > C( ( std::complex<double>* )cp, m, n, stride_c );
|
||||
#endif
|
||||
#endif
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "b", &b, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#if defined(BLIS)
|
||||
|
||||
bli_gemm( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#elif defined(EIGEN)
|
||||
|
||||
C.noalias() += alpha_r * A * B;
|
||||
|
||||
#else // if defined(BLAS)
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = ( float* )bli_obj_buffer( &alpha );
|
||||
float* ap = ( float* )bli_obj_buffer( &a );
|
||||
float* bp = ( float* )bli_obj_buffer( &b );
|
||||
float* betap = ( float* )bli_obj_buffer( &beta );
|
||||
float* cp = ( float* )bli_obj_buffer( &c );
|
||||
|
||||
sgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = ( double* )bli_obj_buffer( &alpha );
|
||||
double* ap = ( double* )bli_obj_buffer( &a );
|
||||
double* bp = ( double* )bli_obj_buffer( &b );
|
||||
double* betap = ( double* )bli_obj_buffer( &beta );
|
||||
double* cp = ( double* )bli_obj_buffer( &c );
|
||||
|
||||
dgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = ( scomplex* )bli_obj_buffer( &alpha );
|
||||
scomplex* ap = ( scomplex* )bli_obj_buffer( &a );
|
||||
scomplex* bp = ( scomplex* )bli_obj_buffer( &b );
|
||||
scomplex* betap = ( scomplex* )bli_obj_buffer( &beta );
|
||||
scomplex* cp = ( scomplex* )bli_obj_buffer( &c );
|
||||
|
||||
cgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = ( dcomplex* )bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = ( dcomplex* )bli_obj_buffer( &a );
|
||||
dcomplex* bp = ( dcomplex* )bli_obj_buffer( &b );
|
||||
dcomplex* betap = ( dcomplex* )bli_obj_buffer( &beta );
|
||||
dcomplex* cp = ( dcomplex* )bli_obj_buffer( &c );
|
||||
|
||||
zgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )k,
|
||||
( unsigned long )n, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ endif
|
||||
BLAS_LIB_PATH := $(HOME)/flame/lib
|
||||
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
|
||||
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
|
||||
#MKL_LIB_PATH := ${MKLROOT}/lib/intel64
|
||||
MKL_LIB_PATH := ${MKLROOT}/lib/intel64
|
||||
#ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
|
||||
|
||||
# OpenBLAS
|
||||
@@ -165,21 +165,23 @@ CFLAGS += -I$(TEST_SRC_PATH)
|
||||
#all: blis openblas atlas mkl
|
||||
all: blis openblas mkl
|
||||
|
||||
blis: test_gemm_blis.x \
|
||||
# test_dotv_blis.x \
|
||||
# test_axpyv_blis.x \
|
||||
# test_gemv_blis.x \
|
||||
# test_ger_blis.x \
|
||||
# test_hemv_blis.x \
|
||||
# test_her_blis.x \
|
||||
# test_her2_blis.x \
|
||||
# test_trmv_blis.x \
|
||||
# test_trsv_blis.x \
|
||||
# test_hemm_blis.x \
|
||||
# test_herk_blis.x \
|
||||
# test_her2k_blis.x \
|
||||
# test_trmm_blis.x \
|
||||
# test_trsm_blis.x
|
||||
blis: \
|
||||
test_dotv_blis.x \
|
||||
test_axpyv_blis.x \
|
||||
test_gemv_blis.x \
|
||||
test_ger_blis.x \
|
||||
test_hemv_blis.x \
|
||||
test_her_blis.x \
|
||||
test_her2_blis.x \
|
||||
test_trmv_blis.x \
|
||||
test_trsv_blis.x \
|
||||
\
|
||||
test_gemm_blis.x \
|
||||
test_hemm_blis.x \
|
||||
test_herk_blis.x \
|
||||
test_her2k_blis.x \
|
||||
test_trmm_blis.x \
|
||||
test_trsm_blis.x
|
||||
|
||||
openblas: \
|
||||
test_dotv_openblas.x \
|
||||
|
||||
@@ -5,13 +5,12 @@ out_root="output"
|
||||
#out_root="output_square"
|
||||
|
||||
# Operations to test.
|
||||
# l2_ops="gemv ger hemv her her2 trmv trsv"
|
||||
l3_ops="gemm"
|
||||
# "hemm herk her2k trmm trsm"
|
||||
test_ops=" ${l3_ops}"
|
||||
# "${l2_ops}"
|
||||
l2_ops="gemv ger hemv her her2 trmv trsv"
|
||||
l3_ops="gemm hemm herk her2k trmm trsm"
|
||||
test_ops="${l2_ops} ${l3_ops}"
|
||||
|
||||
# Implementations to test | "openblas atlas mkl"
|
||||
# Implementations to test.
|
||||
#test_impls="openblas mkl blis"
|
||||
test_impls="blis"
|
||||
|
||||
for im in ${test_impls}; do
|
||||
@@ -24,7 +23,7 @@ for im in ${test_impls}; do
|
||||
# Construct the name of the output file.
|
||||
out_file="${out_root}_${op}_${im}.m"
|
||||
|
||||
echo " Running ${exec_name} > ${out_file} "
|
||||
echo "Running ${exec_name} > ${out_file}"
|
||||
|
||||
# Run executable.
|
||||
./${exec_name} > ${out_file}
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
# accepted values.
|
||||
#
|
||||
|
||||
3 # Number of repeats per experiment (best result is reported)
|
||||
c # Matrix storage scheme(s) to test:
|
||||
1 # Number of repeats per experiment (best result is reported)
|
||||
rc # Matrix storage scheme(s) to test:
|
||||
# 'c' = col-major storage; 'g' = general stride storage;
|
||||
# 'r' = row-major storage
|
||||
cj # Vector storage scheme(s) to test:
|
||||
@@ -22,14 +22,14 @@ cj # Vector storage scheme(s) to test:
|
||||
# '0' = real values on [-1,1];
|
||||
# '1' = powers of 2 in narrow precision range
|
||||
32 # General stride spacing (for cases when testing general stride)
|
||||
d # Datatype(s) to test:
|
||||
sdcz # Datatype(s) to test:
|
||||
# 's' = single real; 'c' = single complex;
|
||||
# 'd' = double real; 'z' = double complex
|
||||
0 # Test gemm with mixed-domain operands?
|
||||
0 # Test gemm with mixed-precision operands?
|
||||
2000 # Problem size: first to test
|
||||
2000 # Problem size: maximum to test
|
||||
200 # Problem size: increment between experiments
|
||||
100 # Problem size: first to test
|
||||
500 # Problem size: maximum to test
|
||||
100 # Problem size: increment between experiments
|
||||
# Complex level-3 implementations to test:
|
||||
0 # 3mh ('1' = enable; '0' = disable)
|
||||
0 # 3m1 ('1' = enable; '0' = disable)
|
||||
@@ -45,5 +45,5 @@ d # Datatype(s) to test:
|
||||
# '0' = disable error checking; '1' = full error checking
|
||||
i # Reaction to test failure:
|
||||
# 'i' = ignore; 's' = sleep() and continue; 'a' = abort
|
||||
1 # Output results in matlab/octave format? ('1' = yes; '0' = no)
|
||||
0 # Output results in matlab/octave format? ('1' = yes; '0' = no)
|
||||
0 # Output results to stdout AND files? ('1' = yes; '0' = no)
|
||||
|
||||
@@ -276,9 +276,9 @@
|
||||
|
||||
# --- Level-3 --------------------------------------------------------------
|
||||
|
||||
2 # gemm
|
||||
-1 -1 -1 # dimensions: m n k
|
||||
nn # parameters: transa transb
|
||||
1 # gemm
|
||||
-1 -1 -1 # dimensions: m n k
|
||||
?? # parameters: transa transb
|
||||
|
||||
1 # hemm
|
||||
-1 -1 # dimensions: m n
|
||||
|
||||
@@ -259,6 +259,7 @@ void libblis_test_gemm_experiment
|
||||
libblis_test_mobj_randomize( params, TRUE, &b );
|
||||
libblis_test_mobj_randomize( params, TRUE, &c );
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
//bli_setm( &BLIS_ONE, &a );
|
||||
//bli_setsc( 1.0, 0.0, &alpha );
|
||||
//bli_setsc( 0.0, 0.0, &beta );
|
||||
@@ -272,23 +273,11 @@ void libblis_test_gemm_experiment
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
#if 0
|
||||
bli_printm( "alpha", &alpha, "%5.2f", "" );
|
||||
bli_printm( "beta", &beta, "%5.2f", "" );
|
||||
bli_printm( "a = [", &a, "%7.6f", "];" );
|
||||
bli_printm( "b = [", &b, "%7.6f", "];" );
|
||||
bli_printm( "c = [", &c, "%7.6f", "];" );
|
||||
#endif
|
||||
|
||||
time = bli_clock();
|
||||
|
||||
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
#if 0
|
||||
bli_printm( "c_after = [", &c, "%7.6f", "];" );
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// Estimate the performance of the best experiment repeat.
|
||||
@@ -417,6 +406,7 @@ void libblis_test_gemm_md
|
||||
|
||||
libblis_test_gemm_impl( iface, &alpha, &a, &b, &beta, &c );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
}
|
||||
|
||||
// Estimate the performance of the best experiment repeat.
|
||||
@@ -453,18 +443,20 @@ void libblis_test_gemm_impl
|
||||
{
|
||||
case BLIS_TEST_SEQ_FRONT_END:
|
||||
#if 0
|
||||
bli_printm( "alpha", alpha, "%5.2f", "" );
|
||||
bli_printm( "beta", beta, "%5.2f", "" );
|
||||
bli_printm( "a", a, "%6.3f", "" );
|
||||
bli_printm( "b", b, "%6.3f", "" );
|
||||
bli_printm( "c", c, "%6.3f", "" );
|
||||
//bli_printm( "alpha", alpha, "%5.2f", "" );
|
||||
//bli_printm( "beta", beta, "%5.2f", "" );
|
||||
bli_printm( "a", a, "%5.2f", "" );
|
||||
bli_printm( "b", b, "%5.2f", "" );
|
||||
bli_printm( "c", c, "%5.2f", "" );
|
||||
#endif
|
||||
//if ( bli_obj_length( b ) == 16 &&
|
||||
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
|
||||
//bli_printm( "c before", c, "%6.3f", "" );
|
||||
bli_gemm( alpha, a, b, beta, c );
|
||||
#if 0
|
||||
bli_printm( "c after", c, "%6.3f", "");
|
||||
if ( bli_obj_length( c ) == 12 &&
|
||||
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
|
||||
bli_printm( "c after", c, "%6.3f", "" );
|
||||
#endif
|
||||
break;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user