Squash-merge 'pr' into 'squash'. (#457)

Merged contributions from AMD's AOCL BLIS (#448).
  
Details:
- Added support for level-3 operation gemmt, which performs a gemm on
  only the lower or upper triangle of a square matrix C. For now, only
  the conventional/large code path will be supported (in vanilla BLIS).
  This was accomplished by leveraging the existing variant logic for
  herk. However, some of the infrastructure to support a gemmtsup is
  included in this commit, including
  - A bli_gemmtsup() front-end, similar to bli_gemmsup().
  - A bli_gemmtsup_ref() reference handler function.
  - A bli_gemmtsup_int() variant chooser function (with variant calls
    commented out).
- Added support for inducing complex domain gemmt via the 1m method.
- Added gemmt APIs to the BLAS and CBLAS compatibility layers.
- Added gemmt test module to testsuite.
- Added standalone gemmt test driver to 'test' directory.
- Documented gemmt APIs in BLISObjectAPI.md and BLISTypedAPI.md.
- Added a C++ template header (blis.hh) containing a BLAS-inspired
  wrapper to a set of polymorphic CBLAS-like function wrappers defined
  in another header (cblas.hh). These two headers are installed if
  running the 'install' target with INSTALL_HH set to 'yes'. (Also
  added a set of unit tests that exercise blis.hh, although they are
  disabled for now because they aren't compatible with out-of-tree
  builds.) These files now live in the 'vendor' top-level directory.
- Various updates to 'zen' and 'zen2' subconfigurations, particularly
  within the context initialization functions.
- Added s and d copyv, setv, and swapv kernels to kernels/zen/1, and
  various minor updates to dotv and scalv kernels. Also added various
  sup kernels contributed by AMD to kernels/zen/3. However, these
  kernels are (for now) not yet used, in part because they caused
  AppVeyor clang failures, and also because I have not found time to
  review and vet them.
- Output the python found during configure into the definition of PYTHON
  in build/config.mk (via build/config.mk.in).
- Added early-return checks (A, B, or C with zero dimension; alpha = 0)
  to bli_gemm_front.c.
- Implemented explicit beta = 0 handling for the sgemm ukernel in
  bli_gemm_armv7a_int_d4x4.c, which was previously missing. This latent
  bug surfaced because the gemmt module verifies its computation using
  gemm with its beta parameter set to zero, which, on a cortexa15 system
  caused the gemm kernel code to unconditionally multiply the
  uninitialized C data by beta. The C matrix likely contained
  non-numeric values such as NaN, which then would have resulted in a
  false failure.
- Fixed a bug whereby the implementation for bli_herk_determine_kc(),
  in bli_l3_blocksize.c, was inadvertently being defined in terms of
  helper functions meant for trmm. This bug was probably harmless since
  the trmm code should have also done the right thing for herk.
- Used cpp macros to neutralize the various AOCL_DTL_TRACE_ macros in
  kernels/zen/3/bli_gemm_small.c since those macros are not used in
  vanilla BLIS.
- Added cpp guard to definition of bli_mem_clear() in bli_mem.h to
  accommodate C++'s stricter type checking.
- Added cpp guard to test/*.c drivers that facilitates compilation on
  Windows systems.
- Various whitespace changes.
This commit is contained in:
Field G. Van Zee
2020-11-14 09:39:48 -06:00
committed by GitHub
parent 234b8b0cf4
commit 88ad841434
163 changed files with 106563 additions and 9683 deletions

7
.gitignore vendored
View File

@@ -43,7 +43,12 @@ include/*/*.h
# -- misc. --
# BLIS testsuite output file
output.testsuite
output.testsuite.*
# BLAS test output files
out.*
# GTAGS database
GPATH
GRTAGS
GTAGS

View File

@@ -249,6 +249,12 @@ ifeq ($(MK_ENABLE_CBLAS),yes)
HEADERS_TO_INSTALL += $(CBLAS_H_FLAT)
endif
# If requested, include AMD's C++ template header files in the list of headers
# to install.
ifeq ($(INSTALL_HH),yes)
HEADERS_TO_INSTALL += $(wildcard $(VEND_CPP_PATH)/*.hh)
endif
#
@@ -892,6 +898,19 @@ else
@- $(TESTSUITE_CHECK_PATH) $(TESTSUITE_OUT_FILE)
endif
# --- AMD's C++ template header test rules ---
# NOTE: The targets below won't work as intended for an out-of-tree build,
# and so they're disabled for now.
#testcpp: testvendcpp
# Recursively run the test for AMD's C++ template header.
#testvendcpp:
# $(MAKE) -C $(VEND_TESTCPP_PATH)
# --- Install header rules ---
install-headers: check-env $(MK_INCL_DIR_INST)
@@ -1167,11 +1186,13 @@ ifeq ($(IS_CONFIGURED),yes)
ifeq ($(ENABLE_VERBOSE),yes)
- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
# - $(MAKE) -C $(VEND_TESTCPP_DIR) clean
else
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
# @$(MAKE) -C $(VEND_TESTCPP_DIR) clean
endif # ENABLE_VERBOSE
endif # IS_CONFIGURED

View File

@@ -103,6 +103,9 @@ RANLIB := @RANLIB@
# Archiver.
AR := @AR@
# Python Interpreter
PYTHON := @PYTHON@
# Preset (required) CFLAGS and LDFLAGS. These variables capture the value
# of the CFLAGS and LDFLAGS environment variables at configure-time (and/or
# the value of CFLAGS/LDFLAGS if either was specified on the command line).

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,6 +5,7 @@
# libraries.
#
# Copyright (C) 2019, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are

View File

@@ -299,6 +299,10 @@ INCLUDE_DIR := include
BLASTEST_DIR := blastest
TESTSUITE_DIR := testsuite
VEND_DIR := vendor
VEND_CPP_DIR := $(VEND_DIR)/cpp
VEND_TESTCPP_DIR := $(VEND_DIR)/testcpp
# The filename suffix for reference kernels.
REFNM := ref
@@ -358,6 +362,10 @@ REFKERN_PATH := $(DIST_PATH)/$(REFKERN_DIR)
KERNELS_PATH := $(DIST_PATH)/$(KERNELS_DIR)
SANDBOX_PATH := $(DIST_PATH)/$(SANDBOX_DIR)
# Construct paths to some optional C++ template headers contributed by AMD.
VEND_CPP_PATH := $(DIST_PATH)/$(VEND_CPP_DIR)
VEND_TESTCPP_PATH := $(DIST_PATH)/$(VEND_TESTCPP_DIR)
# Construct paths to the makefile fragments for the four primary directories
# of source code: the config directory, general framework code, reference
# kernel code, and optimized kernel code.

View File

@@ -55,11 +55,19 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 336, 176, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 528, 368, 0, 0 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 );
#if 1
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 336, 176, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 528, 368, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 4, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 4, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 176, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 368, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4096, -1, -1 );
#endif
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.

View File

@@ -67,6 +67,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
@@ -90,11 +91,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
@@ -106,9 +107,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,

View File

@@ -60,10 +60,8 @@ ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
else
ifeq ($(CC_VENDOR),clang)
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
# When compiling with AOCC, add these flags to the default flags set above.
ifeq ($(strip $(shell clang -v |& head -1 | grep -c 'AOCC.LLVM.2.0.0')),1)
ifeq ($(strip $(shell clang -v |& head -1 | grep -c 'AOCC.LLVM')),1)
CKVECFLAGS += -mllvm -disable-licm-vrp
endif
else

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -52,27 +52,43 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_cntx_set_l3_nat_ukrs
(
8,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
#if 0
// Update the context with optimized level-1m (packm) kernels.
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
cntx
);
#endif
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
@@ -83,11 +99,11 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
@@ -96,12 +112,21 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
#if 0
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
@@ -110,6 +135,16 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
#if 0
// setv
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
#endif
cntx
);
@@ -125,29 +160,22 @@ void bli_cntx_init_zen( cntx_t* cntx )
a) If BLIS is run in a multi-instance mode with
CPU freq 2.6/2.2 Ghz
DDR4 clock frequency 2400Mhz
mc = 240, kc = 512, and nc = 2040
mc = 240, kc = 512, and nc = 2040
has better performance on EPYC server, over the default block sizes.
b) If BLIS is run in Single Instance mode
mc = 510, kc = 1024 and nc = 4080
mc = 510, kc = 1024 and nc = 4080
*/
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
// Zen optimized level 3 cache block sizes
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
#endif
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
@@ -171,10 +199,10 @@ void bli_cntx_init_zen( cntx_t* cntx )
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 256, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 256, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 220, 220, -1, -1 );
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
@@ -186,15 +214,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
cntx
);
#if 0
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
BLIS_GEMM, bli_gemmsup_ref,
//BLIS_GEMMT, bli_gemmtsup_ref,
cntx
);
#endif
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
@@ -218,6 +245,33 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
#if 0
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
#endif
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
#endif
cntx
);
@@ -227,9 +281,17 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
#if 0
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
#endif
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.

View File

@@ -65,6 +65,17 @@
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
#if 0
// Allow the sup implementation to combine some small edge case iterations in
// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
// block-panel algorithm (NR) with the last full iteration that precedes it.
// NOTE: These cpp macros need to be explicitly set to an integer since they
// are used at compile-time to create unconditional branches or dead code
// regions.
#define BLIS_ENABLE_SUP_MR_EXT 1
#define BLIS_ENABLE_SUP_NR_EXT 0
#endif
//#endif

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -64,13 +64,24 @@ void bli_cntx_init_zen2( cntx_t* cntx )
cntx
);
#if 0
// Update the context with optimized level-1m (packm) kernels.
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
cntx
);
#endif
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
@@ -80,28 +91,39 @@ void bli_cntx_init_zen2( cntx_t* cntx )
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
10,
#if 1
16,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
// axpyv
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
// scalv
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
//swap
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
//copy
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
//set
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
cntx
);
@@ -119,7 +141,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache
@@ -195,6 +217,33 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
#if 0
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
#endif
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
#endif
cntx
);

View File

@@ -60,11 +60,27 @@
#define BLIS_ENABLE_SMALL_MATRIX_ROME
#define BLIS_SMALL_MATRIX_THRES_ROME 400
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 60
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50
// When running HPL with pure MPI without DGEMM threading (Single-threaded
// BLIS), defining this macro as 1 yields better performance.

8
configure vendored
View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
# Copyright (C) 2020, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -1363,6 +1363,9 @@ get_compiler_version()
if [ "${cc_vendor}" = "icc" -o \
"${cc_vendor}" = "gcc" ]; then
cc_version=$(${cc} -dumpversion)
# If compiler is AOCC, first grep for clang and then the version number.
elif [ "${cc_vendor}" = "clang" ]; then
cc_version=$(echo "${vendor_string}" | egrep -o 'clang version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')
elif [ "${cc_vendor}" = "oneAPI" ]; then
# Treat Intel oneAPI's clang as clang, not icc.
cc_vendor="clang"
@@ -3107,6 +3110,7 @@ main()
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
python_esc=$(echo "${found_python}" | sed 's/\//\\\//g')
#sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g')
# For RANLIB, if the variable is not set, we use a default value of
@@ -3211,6 +3215,7 @@ main()
| sed -e "s/@CXX@/${cxx_esc}/g" \
| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
| sed -e "s/@AR@/${ar_esc}/g" \
| sed -e "s/@PYTHON@/${python_esc}/g" \
| sed -e "s/@libpthread@/${libpthread_esc}/g" \
| sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \
| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
@@ -3311,7 +3316,6 @@ main()
echo "${script_name}: creating ${obj_frame_dirpath}"
mkdir -p ${obj_frame_dirpath}
if [ -n "${sandbox_flag}" ]; then
obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"

View File

@@ -1681,6 +1681,27 @@ Observed object properties: `trans?(A)`, `trans?(B)`.
---
#### gemmt
```c
void bli_gemmt
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c
);
```
Perform
```
C := beta * C + alpha * trans?(A) * trans?(B)
```
where `C` is an _m x m_ matrix, `trans?(A)` is an _m x k_ matrix, and `trans?(B)` is a _k x m_ matrix. This operation is similar to `bli_gemm()` except that it only updates the lower or upper triangle of `C` as specified by `uplo(C)`.
Observed object properties: `trans?(A)`, `trans?(B)`, `uplo(C)`.
---
#### hemm
```c
void bli_hemm

View File

@@ -1213,6 +1213,30 @@ where C is an _m x n_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)`
---
#### gemmt
```c
void bli_?gemmt
(
uplo_t uploc,
trans_t transa,
trans_t transb,
dim_t m,
dim_t k,
ctype* alpha,
ctype* a, inc_t rsa, inc_t csa,
ctype* b, inc_t rsb, inc_t csb,
ctype* beta,
ctype* c, inc_t rsc, inc_t csc
);
```
Perform
```
C := beta * C + alpha * transa(A) * transb(B)
```
where C is an _m x m_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)` is a _k x m_ matrix. This operation is similar to `bli_?gemm()` except that it only updates the lower or upper triangle of `C` as specified by `uploc`.
---
#### hemm
```c
void bli_?hemm

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -97,4 +97,4 @@
#include "bli_trmm.h"
#include "bli_trmm3.h"
#include "bli_trsm.h"
#include "bli_gemmt.h"

View File

@@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \
}
GENFRONT( gemm_determine_kc, gemm )
GENFRONT( herk_determine_kc, trmm )
GENFRONT( herk_determine_kc, herk )
GENFRONT( trmm_determine_kc, trmm )
GENFRONT( trsm_determine_kc, trsm )

View File

@@ -63,6 +63,28 @@ void bli_gemm_check
//bli_check_error_code( e_val );
}
// Check the operands of a gemmt operation. The basic gemmt checks are run
// first; afterwards the squareness of c is tested and the resulting status
// is handed to bli_check_error_code().
void bli_gemmt_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	// Start with the basic properties of the operation.
	bli_gemmt_basic_check( alpha, a, b, beta, c, cntx );

	// Then test whether c is square and report the outcome.
	err_t square_status = bli_check_square_object( c );

	bli_check_error_code( square_status );
}
void bli_hemm_check
(
side_t side,
@@ -324,6 +346,28 @@ void bli_gemm_basic_check
#endif
}
// Basic operand checks for gemmt: the standard level-3 checks via
// bli_l3_basic_check(), followed by a check of the dimensions of a, b, and c
// whose status is handed to bli_check_error_code().
void bli_gemmt_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	// Standard checks shared with the other level-3 operations.
	bli_l3_basic_check( alpha, a, b, beta, c, cntx );

	// Check the object dimensions and report the outcome.
	err_t dims_status = bli_check_level3_dims( a, b, c );

	bli_check_error_code( dims_status );
}
void bli_hemm_basic_check
(
side_t side,

View File

@@ -51,6 +51,7 @@ void PASTEMAC(opname,_check) \
);
GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )
@@ -103,6 +104,16 @@ void bli_gemm_basic_check
cntx_t* cntx
);
// Basic operand checks for gemmt. Takes the alpha, a, b, beta, and c objects
// of the operation together with a context; the definition runs the standard
// level-3 checks and then checks the dimensions of a, b, and c.
void bli_gemmt_basic_check
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);
void bli_hemm_basic_check
(
side_t side,

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -71,7 +71,10 @@ void PASTEMAC(opname,EX_SUF) \
the function returns with BLIS_FAILURE, which causes execution to
proceed towards the conventional implementation. */ \
err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
if ( result == BLIS_SUCCESS ) return; \
if ( result == BLIS_SUCCESS ) \
{ \
return; \
} \
} \
\
/* Only proceed with an induced method if each of the operands have a
@@ -101,6 +104,75 @@ void PASTEMAC(opname,EX_SUF) \
GENFRONT( gemm )
#undef GENFRONT
// GENFRONT( opname ) defines the object-API function whose name is formed by
// PASTEMAC(opname,EX_SUF), taking alpha, a, b, beta, and c plus whatever
// BLIS_OAPI_EX_PARAMS expands to. It is instantiated below for gemmt only.
//
// As currently written, the generated function:
//   1. calls bli_init_once() and expands BLIS_OAPI_EX_DECLS;
//   2. skips all small/unpacked (sup) handling -- that code is commented out
//      inside the macro because gemmtsup is not yet fully implemented;
//   3. calls PASTEMAC(opname,nat) in both branches of the "all operands
//      complex" test, since the induced-method call is commented out (see
//      the FIXME in the body).
//
// NOTE: The commented-out regions span several continuation lines; keep the
// comment delimiters and trailing backslashes intact when editing so that the
// macro definition is not terminated early.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
/* If the rntm is non-NULL, it may indicate that we should forgo sup
handling altogether. */ \
/*
bool enable_sup = TRUE; \
if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
*/ \
\
/* NOTE: The sup handling for gemmt is disabled here because gemmtsup
is not yet fully implemented. */ \
/*
if ( enable_sup ) \
{ \
*/ \
/* Execute the small/unpacked oapi handler. If it finds that the problem
does not fall within the thresholds that define "small", or for some
other reason decides not to use the small/unpacked implementation,
the function returns with BLIS_FAILURE, which causes execution to
proceed towards the conventional implementation. */ \
/*
err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
if ( result == BLIS_SUCCESS ) \
{ \
return; \
} \
} \
*/ \
\
/* Only proceed with an induced method if each of the operands have a
complex storage datatype. NOTE: Allowing precisions to vary while
using 1m, which is what we do here, is unique to gemm; other level-3
operations use 1m only if all storage datatypes are equal (and they
ignore the computation precision). If any operands are real, skip the
induced method chooser function and proceed directly with native
execution. */ \
if ( bli_obj_is_complex( c ) && \
bli_obj_is_complex( a ) && \
bli_obj_is_complex( b ) ) \
{ \
/* FIXME: BLIS does not yet support induced methods for gemmt. Thus,
we call the native implementation code path for now. */ \
/*PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm );*/ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
} \
else \
{ \
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
} \
}
GENFRONT( gemmt )
#undef GENFRONT
#define GENFRONT( opname ) \
\

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -51,6 +52,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
);
GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -57,6 +58,7 @@ typedef void (*PASTECH(opname,_oft)) \
);
GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

View File

@@ -132,3 +132,72 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
}
err_t bli_gemmtsup
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    // Front-end for the small/unpacked ("sup") gemmt code path. A return
    // value of BLIS_FAILURE means this path declined the problem; otherwise
    // the status produced by the registered sup handler is handed back
    // unchanged.

    // Decline right away if sup handling was disabled at configure-time.
    #ifdef BLIS_DISABLE_SUP_HANDLING
    return BLIS_FAILURE;
    #endif

    // Decline mixed-datatype computations: A and B must have the same
    // datatype as C, and the computation precision of C must equal the
    // precision of C.
    const num_t dt_c = bli_obj_dt( c );

    if ( dt_c != bli_obj_dt( a ) ) return BLIS_FAILURE;
    if ( dt_c != bli_obj_dt( b ) ) return BLIS_FAILURE;
    if ( bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) return BLIS_FAILURE;

    // Obtain a valid (native) context from the gks if the caller did not
    // supply one. This must happen before the context is consulted below.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_cntx();
    }

    // Decline if the sup thresholds are not met. C is square for gemmt, so
    // its length is passed as both the m and the n dimension. The storage
    // preference of the microkernel is irrelevant here because the same
    // query would be made in either case.
    const dim_t m = bli_obj_length( c );
    const dim_t k = bli_obj_width_after_trans( a );

    if ( !bli_cntx_l3_sup_thresh_is_met( dt_c, m, m, k, cntx ) )
    {
        return BLIS_FAILURE;
    }

    // Always work on a local runtime object: either a copy of the one that
    // was passed in, or one initialized from the global settings.
    rntm_t rntm_copy;

    if ( rntm != NULL )
    {
        rntm_copy = *rntm;
    }
    else
    {
        bli_rntm_init_from_global( &rntm_copy );
    }
    rntm = &rntm_copy;

    // At this point the thresholds are known to be satisfied, so the
    // small/unpacked handler should be called.
    // NOTE: The sup handler is free to enforce a stricter threshold regime
    // if it so chooses, in which case it can/should return BLIS_FAILURE.

    // Query the small/unpacked handler from the context and invoke it.
    gemmtsup_oft handler = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx );

    return handler( alpha, a, b, beta, c, cntx, rntm );
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -43,3 +43,14 @@ err_t bli_gemmsup
rntm_t* rntm
);
// Front-end for the small/unpacked (sup) handling of gemmt. Takes the
// object-API operands (alpha, a, b, beta, c) plus a context and a runtime
// object, and returns an err_t status code.
err_t bli_gemmtsup
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -80,7 +80,10 @@ err_t bli_gemmsup_int
// Don't use the small/unpacked implementation if one of the matrices
// uses general stride.
if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
if ( stor_id == BLIS_XXX )
{
return BLIS_FAILURE;
}
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
stor_id == BLIS_RRC ||
@@ -240,3 +243,192 @@ err_t bli_gemmsup_int
return BLIS_SUCCESS;
}
// -----------------------------------------------------------------------------
err_t bli_gemmtsup_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
    // Variant chooser for the small/unpacked (sup) gemmt code path.
    //
    // Based on the storage combination of C, A, and B and on the row/column
    // preference of the sup kernel, this function picks between the
    // block-panel (var2m) and panel-block (var1n) algorithms and, if the
    // thread factorization is automatic, recomputes the jc/ic ways of
    // parallelism.
    //
    // Returns BLIS_SUCCESS only if a variant actually computed the result.
    // All gemmtsup variant calls are currently compiled out (#if 0), so the
    // function reports BLIS_FAILURE; it must not claim success for a result
    // it never computed, since a caller honoring the status would then skip
    // the conventional implementation and leave C unmodified. Each disabled
    // variant call sets the status to BLIS_SUCCESS, so enabling a variant
    // automatically restores the success return.
    err_t status = BLIS_FAILURE;

    const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

    // Don't use the small/unpacked implementation if one of the matrices
    // uses general stride.
    if ( stor_id == BLIS_XXX )
    {
        return BLIS_FAILURE;
    }

    const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
                                      stor_id == BLIS_RRC ||
                                      stor_id == BLIS_RCR ||
                                      stor_id == BLIS_CRR );
    const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

    const num_t dt         = bli_obj_dt( c );
    const bool  row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

    // The "primary" case is the one in which the storage combination matches
    // the kernel's preference, so no transposition of the operation is used.
    const bool  is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
                                        : is_rcc_crc_ccr_ccc );

    // C is square for gemmt, so n is taken to be equal to m.
    const dim_t m = bli_obj_length( c );
    const dim_t n = m;

    const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    const bool  auto_factor = bli_rntm_auto_factor( rntm );
    const dim_t n_threads   = bli_rntm_num_threads( rntm );

    bool  use_bp = TRUE;
    dim_t jc_new;
    dim_t ic_new;

    if ( is_primary )
    {
        // This branch handles:
        //  - rrr rrc rcr crr for row-preferential kernels
        //  - rcc crc ccr ccc for column-preferential kernels

        const dim_t mu = m / MR;
        const dim_t nu = n / NR;

        // Decide which algorithm to use (block-panel var2m or panel-block
        // var1n) based on the number of micropanels in the m and n dimensions.
        // Also, recalculate the automatic thread factorization.
        if ( mu >= nu ) use_bp = TRUE;
        else /* if ( mu < nu ) */ use_bp = FALSE;

        // If the parallel thread factorization was automatic, we update it
        // with a new factorization based on the matrix dimensions in units
        // of micropanels.
        if ( auto_factor )
        {
            if ( use_bp )
            {
                // In the block-panel algorithm, the m dimension is parallelized
                // with ic_nt and the n dimension is parallelized with jc_nt.
                bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
            }
            else // if ( !use_bp )
            {
                // In the panel-block algorithm, the m dimension is parallelized
                // with jc_nt and the n dimension is parallelized with ic_nt.
                bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
            }

            // Update the ways of parallelism for the jc and ic loops, and then
            // update the current thread's root thrinfo_t node according to the
            // new ways of parallelism value for the jc loop.
            bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
            bli_l3_sup_thrinfo_update_root( rntm, thread );
        }

        if ( use_bp )
        {
            #ifdef TRACEVAR
            if ( bli_thread_am_ochief( thread ) )
                printf( "bli_l3_sup_int(): var2m primary\n" );
            #endif
            // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
            #if 0
            bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
                                    alpha, a, b, beta, c,
                                    stor_id, cntx, rntm, thread );
            status = BLIS_SUCCESS;
            #endif
        }
        else // use_pb
        {
            #ifdef TRACEVAR
            if ( bli_thread_am_ochief( thread ) )
                printf( "bli_l3_sup_int(): var1n primary\n" );
            #endif
            // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
            #if 0
            bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
                                    alpha, a, b, beta, c,
                                    stor_id, cntx, rntm, thread );
            status = BLIS_SUCCESS;
            #endif
            // *requires nudging of nc up to be a multiple of mr.
        }
    }
    else
    {
        // This branch handles:
        //  - rrr rrc rcr crr for column-preferential kernels
        //  - rcc crc ccr ccc for row-preferential kernels

        const dim_t mu = n / MR; // the n becomes m after a transposition
        const dim_t nu = m / NR; // the m becomes n after a transposition

        // Decide which algorithm to use (block-panel var2m or panel-block
        // var1n) based on the number of micropanels in the m and n dimensions.
        // Also, recalculate the automatic thread factorization.
        if ( mu >= nu ) use_bp = TRUE;
        else /* if ( mu < nu ) */ use_bp = FALSE;

        // If the parallel thread factorization was automatic, we update it
        // with a new factorization based on the matrix dimensions in units
        // of micropanels.
        if ( auto_factor )
        {
            if ( use_bp )
            {
                // In the block-panel algorithm, the m dimension is parallelized
                // with ic_nt and the n dimension is parallelized with jc_nt.
                bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
            }
            else // if ( !use_bp )
            {
                // In the panel-block algorithm, the m dimension is parallelized
                // with jc_nt and the n dimension is parallelized with ic_nt.
                bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
            }

            // Update the ways of parallelism for the jc and ic loops, and then
            // update the current thread's root thrinfo_t node according to the
            // new ways of parallelism value for the jc loop.
            bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
            bli_l3_sup_thrinfo_update_root( rntm, thread );
        }

        if ( use_bp )
        {
            #ifdef TRACEVAR
            if ( bli_thread_am_ochief( thread ) )
                printf( "bli_l3_sup_int(): var2m non-primary\n" );
            #endif
            // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
            #if 0
            bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
                                    alpha, a, b, beta, c,
                                    stor_id, cntx, rntm, thread );
            status = BLIS_SUCCESS;
            #endif
        }
        else // use_pb
        {
            #ifdef TRACEVAR
            if ( bli_thread_am_ochief( thread ) )
                printf( "bli_l3_sup_int(): var1n non-primary\n" );
            #endif
            // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
            #if 0
            bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
                                    alpha, a, b, beta, c,
                                    stor_id, cntx, rntm, thread );
            status = BLIS_SUCCESS;
            #endif
            // *requires nudging of mc up to be a multiple of nr.
        }
    }

    // Report success only if one of the variants above actually ran, so that
    // the caller knows whether the solution was computed.
    return status;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -43,3 +43,15 @@ err_t bli_gemmsup_int
rntm_t* rntm,
thrinfo_t* thread
);
// Variant chooser for the small/unpacked (sup) gemmt code path. In addition
// to the operands, context, and runtime object, it receives the calling
// thread's thrinfo_t node. Returns an err_t status code.
err_t bli_gemmtsup_int
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019-20, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -57,6 +57,6 @@ typedef err_t (*PASTECH(opname,_oft)) \
);
GENTDEF( gemmsup )
GENTDEF( gemmtsup )
#endif

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -106,3 +106,69 @@ err_t bli_gemmsup_ref
);
}
// -----------------------------------------------------------------------------
err_t bli_gemmtsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    // This function implements the default gemmtsup handler. If you are a
    // BLIS developer and wish to use a different gemmtsup handler, please
    // register a different function pointer in the context in your
    // sub-configuration's bli_cntx_init_*() function.
    //
    // It checks the operands (when error checking is enabled), sets the ways
    // of parallelism in the runtime object, and then runs bli_gemmtsup_int()
    // through the sup thread decorator, returning the decorator's status.

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_gemmt_check( alpha, a, b, beta, c, cntx );

#if 0
    // NOTE: This special case handling is done within the variants.
    // The block is kept for reference only. Both early exits return
    // BLIS_SUCCESS, since this function returns err_t and a bare 'return;'
    // would not compile if the block were ever re-enabled.

    // If alpha is zero, scale by beta and return.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( beta, c );
        return BLIS_SUCCESS;
    }

    // If A or B has a zero dimension, scale C by beta and return early.
    if ( bli_obj_has_zero_dim( a ) ||
         bli_obj_has_zero_dim( b ) )
    {
        bli_scalm( beta, c );
        return BLIS_SUCCESS;
    }
#endif

    // Parse and interpret the contents of the rntm_t object to properly
    // set the ways of parallelism for each loop.
    // NOTE(review): the third dimension is bli_obj_width( a ), which does
    // not account for a transposition of A, whereas bli_gemmtsup() uses
    // bli_obj_width_after_trans( a ) for k -- confirm this is intended.
    bli_rntm_set_ways_from_rntm_sup
    (
      bli_obj_length( c ),
      bli_obj_width( c ),
      bli_obj_width( a ),
      rntm
    );

    return
    bli_l3_sup_thread_decorator
    (
      bli_gemmtsup_int,
      BLIS_GEMMT, // operation family id
      alpha,
      a,
      b,
      beta,
      c,
      cntx,
      rntm
    );
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -43,3 +43,14 @@ err_t bli_gemmsup_ref
rntm_t* rntm
);
// Reference (default) handler for the small/unpacked (sup) gemmt code path.
// Takes the object-API operands plus a context and a runtime object, and
// returns an err_t status code.
err_t bli_gemmtsup_ref
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -100,7 +100,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
}
INSERT_GENTFUNC_BASIC0( gemm )
INSERT_GENTFUNC_BASIC0( gemmt )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, struca ) \

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -56,7 +57,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
);
INSERT_GENTPROT_BASIC0( gemm )
INSERT_GENTPROT_BASIC0( gemmt )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -53,6 +53,26 @@ void bli_gemm_front
obj_t b_local;
obj_t c_local;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( c ) )
{
return;
}
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
// and return early.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
bli_obj_has_zero_dim( a ) ||
bli_obj_has_zero_dim( b ) )
{
bli_scalm( beta, c );
return;
}
#if 0
#ifdef BLIS_ENABLE_SMALL_MATRIX
// Only handle small problems separately for homogeneous datatypes.
@@ -60,23 +80,12 @@ void bli_gemm_front
bli_obj_dt( a ) == bli_obj_dt( c ) &&
bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
{
gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
if ( status == BLIS_SUCCESS ) return;
}
#endif
#endif
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
// If alpha is zero, scale by beta and return.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
bli_scalm( beta, c );
return;
}
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( a, &a_local );
bli_obj_alias_to( b, &b_local );

View File

@@ -58,15 +58,18 @@ void bli_gemm_int
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( c ) ) return;
if ( bli_obj_has_zero_dim( c ) )
{
return;
}
// If A or B has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( a ) ||
bli_obj_has_zero_dim( b ) )
{
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
return;
}
@@ -78,9 +81,9 @@ void bli_gemm_int
// This should never execute.
bli_abort();
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
return;
}
@@ -93,14 +96,14 @@ void bli_gemm_int
// to B.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
// If beta is non-unit, typecast and apply it to the scalar attached
// to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Create the next node in the thrinfo_t structure.
@@ -129,7 +132,7 @@ void bli_gemm_int
cntx,
rntm,
cntl,
thread
thread
);
}

36
frame/3/gemmt/bli_gemmt.h Normal file
View File

@@ -0,0 +1,36 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_gemmt_front.h"

View File

@@ -0,0 +1,142 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_gemmt_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    // Front-end for the conventional gemmt code path: validates the
    // operands, handles the trivial cases, prepares local aliases of A, B,
    // and C, and finally launches the internal back-end through the level-3
    // thread decorator.

    bli_init_once();

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
    {
        bli_gemmt_check( alpha, a, b, beta, c, cntx );
    }

    // A C with a zero dimension leaves nothing to compute.
    if ( bli_obj_has_zero_dim( c ) ) return;

    // If alpha is zero, or if A or B has a zero dimension, the only work
    // left is scaling C by beta.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
         bli_obj_has_zero_dim( a ) ||
         bli_obj_has_zero_dim( b ) )
    {
        bli_scalm( beta, c );
        return;
    }

    // Work on aliases of A, B, and C so that transformations can be applied
    // without touching the caller's objects.
    obj_t a_local;
    obj_t b_local;
    obj_t c_local;

    bli_obj_alias_to( a, &a_local );
    bli_obj_alias_to( b, &b_local );
    bli_obj_alias_to( c, &c_local );
    bli_obj_set_as_root( &c_local );

    // An optimization: if the storage of C (by rows or by columns) is the
    // opposite of what the micro-kernel prefers, transpose the entire
    // operation so that the micro-kernel can access elements of C in its
    // preferred manner. This means swapping A and B and inducing a
    // transposition on all three operands.
    if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
    {
        bli_obj_swap( &a_local, &b_local );

        bli_obj_induce_trans( &a_local );
        bli_obj_induce_trans( &b_local );
        bli_obj_induce_trans( &c_local );
    }

    // Parse and interpret the contents of the rntm_t object to properly
    // set the ways of parallelism for each loop, and then make any
    // additional modifications necessary for the current operation.
    const dim_t m_dim = bli_obj_length( &c_local );
    const dim_t n_dim = bli_obj_width( &c_local );
    const dim_t k_dim = bli_obj_width( &a_local );

    bli_rntm_set_ways_for_op
    (
      BLIS_GEMM,
      BLIS_LEFT, // ignored for gemm/hemm/symm/gemmt
      m_dim,
      n_dim,
      k_dim,
      rntm
    );

    // A sort of hack for communicating the desired pack schemas for A and B
    // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
    // bli_l3_cntl_create_if()). This allows us to access the schemas from
    // the control tree, which hopefully reduces some confusion, particularly
    // in bli_packm_init(). Native execution uses fixed row-panel/col-panel
    // schemas; otherwise the schemas are read from the context.
    pack_t schema_a;
    pack_t schema_b;

    if ( bli_cntx_method( cntx ) == BLIS_NAT )
    {
        schema_a = BLIS_PACKED_ROW_PANELS;
        schema_b = BLIS_PACKED_COL_PANELS;
    }
    else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
    {
        schema_a = bli_cntx_schema_a_block( cntx );
        schema_b = bli_cntx_schema_b_panel( cntx );
    }

    bli_obj_set_pack_schema( schema_a, &a_local );
    bli_obj_set_pack_schema( schema_b, &b_local );

    // Invoke the internal back-end via the thread handler.
    bli_l3_thread_decorator
    (
      bli_gemm_int,
      BLIS_HERK, // operation family id (gemmt uses 'herk' family)
      alpha, &a_local, &b_local,
      beta,  &c_local,
      cntx, rntm, cntl
    );
}

View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Front-end for the conventional (non-sup) gemmt implementation. Takes the
// object-API operands plus a context, a runtime object, and a control tree
// pointer; returns nothing.
void bli_gemmt_front
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);

View File

@@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level );
BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void );
void bli_print_msg( char* str, char* file, guint_t line );
BLIS_EXPORT_BLIS void bli_abort( void );
void bli_print_msg( char* str, char* file, guint_t line );
BLIS_EXPORT_BLIS void bli_abort( void );
char* bli_error_string_for_code( gint_t code );
char* bli_error_string_for_code( gint_t code );

View File

@@ -147,7 +147,14 @@ BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
BLIS_INLINE void bli_mem_clear( mem_t* mem )
{
    // Reset every field of the mem_t: NULL buffer, "cleared" buffer type,
    // NULL pool, and a size of zero.

#ifdef __cplusplus
    // When using C++, which is strongly typed, we avoid use of -1 as a
    // packbuf_t value since it will result in a compile-time error.
    const packbuf_t cleared_buf_type = BLIS_BUFFER_FOR_GEN_USE;
#else
    const packbuf_t cleared_buf_type = ( packbuf_t )-1;
#endif

    bli_mem_set_buffer( NULL, mem );
    bli_mem_set_buf_type( cleared_buf_type, mem );
    bli_mem_set_pool( NULL, mem );
    bli_mem_set_size( 0, mem );
}

234
frame/compat/bla_gemmt.c Normal file
View File

@@ -0,0 +1,234 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
// GENTFUNC, typed-API flavor (used when BLIS_BLAS3_CALLS_TAPI is defined).
// Expands to the definition of the Fortran-77-style BLAS routine named by
// PASTEF77(ch,blasname). The generated routine initializes BLIS, runs the
// BLAS parameter check, converts the BLAS character and integer arguments
// to BLIS values, and forwards the raw pointers and strides to the function
// named by PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) before finalizing BLIS.
// Only m and k are taken as dimensions.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* uploc, \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* k, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
) \
{ \
uplo_t blis_uploc; \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, k0; \
inc_t rs_a, cs_a; \
inc_t rs_b, cs_b; \
inc_t rs_c, cs_c; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
uploc, \
transa, \
transb, \
m, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. Each operand
gets a unit row stride and its leading dimension as column stride. */ \
rs_a = 1; \
cs_a = *lda; \
rs_b = 1; \
cs_b = *ldb; \
rs_c = 1; \
cs_c = *ldc; \
\
/* Call BLIS interface. The const qualifiers of the BLAS arguments are
cast away, and NULL is passed for the final two arguments (presumably
the context and runtime -- confirm against the typed API prototype). */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_uploc, \
blis_transa, \
blis_transb, \
m0, \
k0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
(ftype*)b, rs_b, cs_b, \
(ftype*)beta, \
(ftype*)c, rs_c, cs_c, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#else
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* uploc, \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* k, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
) \
{ \
uplo_t blis_uploc; \
trans_t blis_transa; \
trans_t blis_transb; \
dim_t m0, k0; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
uploc, \
transa, \
transb, \
m, \
k, \
lda, \
ldb, \
ldc \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *k, k0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
const inc_t rs_b = 1; \
const inc_t cs_b = *ldb; \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
\
const num_t dt = PASTEMAC(ch,type); \
\
const struc_t strucc = BLIS_SYMMETRIC; \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t m0_a, n0_a; \
dim_t m0_b, n0_b; \
\
bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
bli_set_dims_with_trans( blis_transb, k0, m0, &m0_b, &n0_b ); \
\
bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \
\
bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m0, m0, (ftype*)c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( blis_uploc, &co ); \
bli_obj_set_conjtrans( blis_transa, &ao ); \
bli_obj_set_conjtrans( blis_transb, &bo ); \
\
bli_obj_set_struc( strucc, &co ); \
\
PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
( \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#endif
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( gemmt, gemmt )
#endif

60
frame/compat/bla_gemmt.h Normal file
View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Declarations of the BLAS-compatible (Fortran-style) gemmt entry points.
// Every argument, scalars included, is passed by address.
//

#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc,  /* triangle selector for c */ \
       const f77_char* transa, /* transposition flag for a */ \
       const f77_char* transb, /* transposition flag for b */ \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

/* Emit the prototypes only when the BLAS compatibility layer is enabled. */
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -185,6 +186,7 @@
#include "bla_syr2k.h"
#include "bla_trmm.h"
#include "bla_trsm.h"
#include "bla_gemmt.h"
#include "bla_gemm_check.h"
#include "bla_hemm_check.h"
@@ -195,6 +197,7 @@
#include "bla_syr2k_check.h"
#include "bla_trmm_check.h"
#include "bla_trsm_check.h"
#include "bla_gemmt_check.h"
// -- Fortran-compatible APIs to BLIS functions --

View File

@@ -448,6 +448,11 @@ void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
float alpha, const float *A, f77_int lda,
float *B, f77_int ldb);
/* Single-precision real gemmt (BLAS extension): a gemm-like update applied
   only to the triangle of the square matrix C selected by Uplo. */
void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
f77_int N, f77_int K, float alpha, const float *A,
f77_int lda, const float *B, f77_int ldb,
float beta, float *C, f77_int ldc);
void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -478,6 +483,11 @@ void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
double alpha, const double *A, f77_int lda,
double *B, f77_int ldb);
/* Double-precision real gemmt (BLAS extension): a gemm-like update applied
   only to the triangle of the square matrix C selected by Uplo. */
void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
f77_int N, f77_int K, double alpha, const double *A,
f77_int lda, const double *B, f77_int ldb,
double beta, double *C, f77_int ldc);
void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -508,6 +518,11 @@ void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
const void *alpha, const void *A, f77_int lda,
void *B, f77_int ldb);
/* Single-precision complex gemmt (BLAS extension): a gemm-like update
   applied only to the triangle of the square matrix C selected by Uplo.
   Scalars and matrices are passed as untyped pointers. */
void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
f77_int N, f77_int K, const void *alpha, const void *A,
f77_int lda, const void *B, f77_int ldb,
const void *beta, void *C, f77_int ldc);
void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
@@ -538,6 +553,11 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
const void *alpha, const void *A, f77_int lda,
void *B, f77_int ldb);
/* Double-precision complex gemmt (BLAS extension): a gemm-like update
   applied only to the triangle of the square matrix C selected by Uplo.
   Scalars and matrices are passed as untyped pointers. */
void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
f77_int N, f77_int K, const void *alpha, const void *A,
f77_int lda, const void *B, f77_int ldb,
const void *beta, void *C, f77_int ldc);
/*

View File

@@ -0,0 +1,166 @@
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
/*
cblas_cgemmt.c
Based off of cblas_cgemm.c.
*/
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cblas.h"
#include "cblas_f77.h"
/*
   cblas_cgemmt

   CBLAS front-end for the single-precision complex gemmt operation.

   The CBLAS enum arguments are translated into the single-character flags
   expected by the Fortran-style routine F77_cgemmt, to which the call is
   then forwarded. Row-major storage is handled by requesting the opposite
   triangle and exchanging the roles of A and B (operands, leading
   dimensions and transposition flags) before calling F77_cgemmt.

   An unrecognized Order, Uplo, TransA or TransB value is reported through
   cblas_xerbla() with argument position 1, 2, 3 or 4 respectively, and the
   routine returns without calling F77_cgemmt. Only the first offending
   argument is reported. CBLAS_CallFromC and RowMajorStrg are reset to 0 on
   every exit path.

   alpha, A, B and beta are read-only, so they are cast to pointer-to-const
   rather than having their const qualifier cast away.
*/
void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc)
{
   /* Flags handed to the Fortran-style routine. */
   char UL, TA, TB;
   /* Flags derived from TransA and TransB exactly as the caller gave them. */
   char flagA, flagB;
   const int row_major = ( Order == CblasRowMajor );
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;

   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   /* RowMajorStrg must already hold its final value when any later
      cblas_xerbla() call is made, so set it before validating the flags. */
   if ( row_major )
   {
      RowMajorStrg = 1;
   }
   else if ( Order != CblasColMajor )
   {
      cblas_xerbla(1, "cblas_cgemmt", "Illegal Order setting, %d\n", Order);
      goto finish;
   }

   /* Row-major storage refers to the opposite triangle. */
   switch ( Uplo )
   {
      case CblasUpper: UL = ( row_major ? 'L' : 'U' ); break;
      case CblasLower: UL = ( row_major ? 'U' : 'L' ); break;
      default:
         cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto finish;
   }

   switch ( TransA )
   {
      case CblasTrans:     flagA = 'T'; break;
      case CblasConjTrans: flagA = 'C'; break;
      case CblasNoTrans:   flagA = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
         goto finish;
   }

   switch ( TransB )
   {
      case CblasTrans:     flagB = 'T'; break;
      case CblasConjTrans: flagB = 'C'; break;
      case CblasNoTrans:   flagB = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
         goto finish;
   }

   /* In the row-major case A and B trade places, and so do their flags. */
   if ( row_major ) { TA = flagB; TB = flagA; }
   else             { TA = flagA; TB = flagB; }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if ( row_major )
   {
      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K,
                 (const scomplex*)alpha,
                 (const scomplex*)B, &F77_ldb,
                 (const scomplex*)A, &F77_lda,
                 (const scomplex*)beta,
                 (scomplex*)C, &F77_ldc);
   }
   else
   {
      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K,
                 (const scomplex*)alpha,
                 (const scomplex*)A, &F77_lda,
                 (const scomplex*)B, &F77_ldb,
                 (const scomplex*)beta,
                 (scomplex*)C, &F77_ldc);
   }

finish:
   /* Common exit: restore the global CBLAS state. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
#endif

View File

@@ -0,0 +1,166 @@
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
/*
cblas_dgemmt.c
Based off of cblas_dgemm.c.
*/
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cblas.h"
#include "cblas_f77.h"
/*
   cblas_dgemmt

   CBLAS front-end for the double-precision real gemmt operation.

   The CBLAS enum arguments are translated into the single-character flags
   expected by the Fortran-style routine F77_dgemmt, to which the call is
   then forwarded. Row-major storage is handled by requesting the opposite
   triangle and exchanging the roles of A and B (operands, leading
   dimensions and transposition flags) before calling F77_dgemmt.

   An unrecognized Order, Uplo, TransA or TransB value is reported through
   cblas_xerbla() with argument position 1, 2, 3 or 4 respectively, and the
   routine returns without calling F77_dgemmt. Only the first offending
   argument is reported. CBLAS_CallFromC and RowMajorStrg are reset to 0 on
   every exit path.
*/
void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  double alpha, const double *A,
                  f77_int lda, const double *B, f77_int ldb,
                  double beta, double *C, f77_int ldc)
{
   /* Flags handed to the Fortran-style routine. */
   char UL, TA, TB;
   /* Flags derived from TransA and TransB exactly as the caller gave them. */
   char flagA, flagB;
   const int row_major = ( Order == CblasRowMajor );
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;

   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   /* RowMajorStrg must already hold its final value when any later
      cblas_xerbla() call is made, so set it before validating the flags. */
   if ( row_major )
   {
      RowMajorStrg = 1;
   }
   else if ( Order != CblasColMajor )
   {
      cblas_xerbla(1, "cblas_dgemmt", "Illegal Order setting, %d\n", Order);
      goto finish;
   }

   /* Row-major storage refers to the opposite triangle. */
   switch ( Uplo )
   {
      case CblasUpper: UL = ( row_major ? 'L' : 'U' ); break;
      case CblasLower: UL = ( row_major ? 'U' : 'L' ); break;
      default:
         cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto finish;
   }

   switch ( TransA )
   {
      case CblasTrans:     flagA = 'T'; break;
      case CblasConjTrans: flagA = 'C'; break;
      case CblasNoTrans:   flagA = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
         goto finish;
   }

   switch ( TransB )
   {
      case CblasTrans:     flagB = 'T'; break;
      case CblasConjTrans: flagB = 'C'; break;
      case CblasNoTrans:   flagB = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
         goto finish;
   }

   /* In the row-major case A and B trade places, and so do their flags. */
   if ( row_major ) { TA = flagB; TB = flagA; }
   else             { TA = flagA; TB = flagB; }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if ( row_major )
   {
      F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha,
                 B, &F77_ldb,
                 A, &F77_lda,
                 &beta, C, &F77_ldc);
   }
   else
   {
      F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha,
                 A, &F77_lda,
                 B, &F77_ldb,
                 &beta, C, &F77_ldc);
   }

finish:
   /* Common exit: restore the global CBLAS state. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
#endif

View File

@@ -1,12 +1,46 @@
/*
* cblas_f77.h
* Written by Keita Teranishi
*
* Updated by Jeff Horner
* Merged cblas_f77.h and cblas_fortran_header.h
*
* (Heavily hacked down from the original)
*/
cblas_f77.h
Written by Keita Teranishi
Updated by Jeff Horner
Merged cblas_f77.h and cblas_fortran_header.h
(Heavily hacked down from the original)
*/
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CBLAS_F77_H
#define CBLAS_F77_H
@@ -163,5 +197,12 @@
#define F77_zsyr2k zsyr2k_
#define F77_ztrmm ztrmm_
#define F77_ztrsm ztrsm_
/*
* BLAS extensions: gemmt.
* Each F77_?gemmt name expands to the lowercase routine name followed by
* a trailing underscore.
*/
#define F77_sgemmt sgemmt_
#define F77_dgemmt dgemmt_
#define F77_cgemmt cgemmt_
#define F77_zgemmt zgemmt_
#endif /* CBLAS_F77_H */

View File

@@ -0,0 +1,166 @@
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
/*
cblas_sgemmt.c
Based off of cblas_sgemm.c.
*/
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cblas.h"
#include "cblas_f77.h"
/*
   cblas_sgemmt

   CBLAS front-end for the single-precision real gemmt operation.

   The CBLAS enum arguments are translated into the single-character flags
   expected by the Fortran-style routine F77_sgemmt, to which the call is
   then forwarded. Row-major storage is handled by requesting the opposite
   triangle and exchanging the roles of A and B (operands, leading
   dimensions and transposition flags) before calling F77_sgemmt.

   An unrecognized Order, Uplo, TransA or TransB value is reported through
   cblas_xerbla() with argument position 1, 2, 3 or 4 respectively, and the
   routine returns without calling F77_sgemmt. Only the first offending
   argument is reported. CBLAS_CallFromC and RowMajorStrg are reset to 0 on
   every exit path.
*/
void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  float alpha, const float *A,
                  f77_int lda, const float *B, f77_int ldb,
                  float beta, float *C, f77_int ldc)
{
   /* Flags handed to the Fortran-style routine. */
   char UL, TA, TB;
   /* Flags derived from TransA and TransB exactly as the caller gave them. */
   char flagA, flagB;
   const int row_major = ( Order == CblasRowMajor );
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;

   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   /* RowMajorStrg must already hold its final value when any later
      cblas_xerbla() call is made, so set it before validating the flags. */
   if ( row_major )
   {
      RowMajorStrg = 1;
   }
   else if ( Order != CblasColMajor )
   {
      cblas_xerbla(1, "cblas_sgemmt", "Illegal Order setting, %d\n", Order);
      goto finish;
   }

   /* Row-major storage refers to the opposite triangle. */
   switch ( Uplo )
   {
      case CblasUpper: UL = ( row_major ? 'L' : 'U' ); break;
      case CblasLower: UL = ( row_major ? 'U' : 'L' ); break;
      default:
         cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto finish;
   }

   switch ( TransA )
   {
      case CblasTrans:     flagA = 'T'; break;
      case CblasConjTrans: flagA = 'C'; break;
      case CblasNoTrans:   flagA = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
         goto finish;
   }

   switch ( TransB )
   {
      case CblasTrans:     flagB = 'T'; break;
      case CblasConjTrans: flagB = 'C'; break;
      case CblasNoTrans:   flagB = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
         goto finish;
   }

   /* In the row-major case A and B trade places, and so do their flags. */
   if ( row_major ) { TA = flagB; TB = flagA; }
   else             { TA = flagA; TB = flagB; }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if ( row_major )
   {
      F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha,
                 B, &F77_ldb,
                 A, &F77_lda,
                 &beta, C, &F77_ldc);
   }
   else
   {
      F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha,
                 A, &F77_lda,
                 B, &F77_ldb,
                 &beta, C, &F77_ldc);
   }

finish:
   /* Common exit: restore the global CBLAS state. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
#endif

View File

@@ -0,0 +1,166 @@
#include "blis.h"
#ifdef BLIS_ENABLE_CBLAS
/*
cblas_zgemmt.c
Based off of cblas_zgemm.c.
*/
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cblas.h"
#include "cblas_f77.h"
/*
   cblas_zgemmt

   CBLAS front-end for the double-precision complex gemmt operation.

   The CBLAS enum arguments are translated into the single-character flags
   expected by the Fortran-style routine F77_zgemmt, to which the call is
   then forwarded. Row-major storage is handled by requesting the opposite
   triangle and exchanging the roles of A and B (operands, leading
   dimensions and transposition flags) before calling F77_zgemmt.

   An unrecognized Order, Uplo, TransA or TransB value is reported through
   cblas_xerbla() with argument position 1, 2, 3 or 4 respectively, and the
   routine returns without calling F77_zgemmt. Only the first offending
   argument is reported. CBLAS_CallFromC and RowMajorStrg are reset to 0 on
   every exit path.

   alpha, A, B and beta are read-only, so they are cast to pointer-to-const
   rather than having their const qualifier cast away.
*/
void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc)
{
   /* Flags handed to the Fortran-style routine. */
   char UL, TA, TB;
   /* Flags derived from TransA and TransB exactly as the caller gave them. */
   char flagA, flagB;
   const int row_major = ( Order == CblasRowMajor );
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;

   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   /* RowMajorStrg must already hold its final value when any later
      cblas_xerbla() call is made, so set it before validating the flags. */
   if ( row_major )
   {
      RowMajorStrg = 1;
   }
   else if ( Order != CblasColMajor )
   {
      cblas_xerbla(1, "cblas_zgemmt", "Illegal Order setting, %d\n", Order);
      goto finish;
   }

   /* Row-major storage refers to the opposite triangle. */
   switch ( Uplo )
   {
      case CblasUpper: UL = ( row_major ? 'L' : 'U' ); break;
      case CblasLower: UL = ( row_major ? 'U' : 'L' ); break;
      default:
         cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto finish;
   }

   switch ( TransA )
   {
      case CblasTrans:     flagA = 'T'; break;
      case CblasConjTrans: flagA = 'C'; break;
      case CblasNoTrans:   flagA = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
         goto finish;
   }

   switch ( TransB )
   {
      case CblasTrans:     flagB = 'T'; break;
      case CblasConjTrans: flagB = 'C'; break;
      case CblasNoTrans:   flagB = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
         goto finish;
   }

   /* In the row-major case A and B trade places, and so do their flags. */
   if ( row_major ) { TA = flagB; TB = flagA; }
   else             { TA = flagA; TB = flagB; }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if ( row_major )
   {
      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K,
                 (const dcomplex*)alpha,
                 (const dcomplex*)B, &F77_ldb,
                 (const dcomplex*)A, &F77_lda,
                 (const dcomplex*)beta,
                 (dcomplex*)C, &F77_ldc);
   }
   else
   {
      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K,
                 (const dcomplex*)alpha,
                 (const dcomplex*)A, &F77_lda,
                 (const dcomplex*)B, &F77_ldb,
                 (const dcomplex*)beta,
                 (dcomplex*)C, &F77_ldc);
   }

finish:
   /* Common exit: restore the global CBLAS state. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
#endif

View File

@@ -0,0 +1,92 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef BLIS_ENABLE_BLAS
/*
   bla_gemmt_check

   Argument validation for the BLAS-compatible gemmt wrappers. The macro
   expands to a brace-enclosed block that inspects the flag, dimension and
   leading-dimension arguments in order. On the first invalid argument it
   builds the routine name from dt_str and op_str, upper-cases it, passes
   the name and the argument position to xerbla, and then executes a bare
   `return;` in the enclosing function. When every check passes the block
   has no effect.

   The local names below are visible to the argument expressions, so they
   must not collide with identifiers the caller passes in.
*/
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
	/* Position of the first invalid argument; 0 while everything is valid. */ \
	f77_int info = 0; \
\
	/* Classify the transposition flags of a and b. */ \
	f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	/* Classify the triangle selector. */ \
	f77_int lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
	f77_int upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
	/* Row counts of a and b as stored; they bound lda and ldb from below. */ \
	f77_int nrowa = ( nota ? *m : *k ); \
	f77_int nrowb = ( notb ? *k : *m ); \
\
	/* The values assigned to info are positions in the gemmt argument list. */ \
	if      ( !( lower || upper ) )        info = 1; \
	else if ( !( nota || conja || ta ) )   info = 2; \
	else if ( !( notb || conjb || tb ) )   info = 3; \
	else if ( *m < 0 )                     info = 4; \
	else if ( *k < 0 )                     info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) info = 10; \
	else if ( *ldc < bli_max( 1, *m ) )    info = 13; \
\
	if ( info ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		/* Datatype prefix followed by the operation name, left-justified
		   in a field of at least five characters. */ \
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		/* Leaves the function into which the macro was expanded. */ \
		return; \
	} \
}
#endif

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2019, Advanced Micro Devices, Inc.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -250,9 +250,9 @@ CNTX_INIT_PROTS( generic )
// -- AMD64 architectures --
//#ifdef BLIS_KERNELS_ZEN2
//#include "bli_kernels_zen2.h"
//#endif
#ifdef BLIS_KERNELS_ZEN2
#include "bli_kernels_zen2.h"
#endif
#ifdef BLIS_KERNELS_ZEN
#include "bli_kernels_zen.h"
#endif

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -918,6 +918,7 @@ typedef enum
// bli_l3_ind.c to index into arrays.
//
BLIS_GEMM = 0,
BLIS_GEMMT,
BLIS_HEMM,
BLIS_HERK,
BLIS_HER2K,
@@ -931,7 +932,7 @@ typedef enum
BLIS_NOID
} opid_t;
#define BLIS_NUM_LEVEL3_OPS 10
#define BLIS_NUM_LEVEL3_OPS 11
// -- Blocksize ID type --

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -37,48 +37,49 @@
static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
{
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */
/* 3mh */ { bli_gemm3mh, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh,
/* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */
/* 3mh */ { bli_gemm3mh, NULL, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh,
bli_syrk3mh, bli_syr2k3mh, bli_trmm33mh, NULL, NULL },
/* 3m1 */ { bli_gemm3m1, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1,
/* 3m1 */ { bli_gemm3m1, NULL, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1,
bli_syrk3m1, bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 },
/* 4mh */ { bli_gemm4mh, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh,
/* 4mh */ { bli_gemm4mh, NULL, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh,
bli_syrk4mh, bli_syr2k4mh, bli_trmm34mh, NULL, NULL },
/* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL,
/* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL },
/* 4m1 */ { bli_gemm4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1,
/* 4m1 */ { bli_gemm4m1, NULL, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1,
bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 },
/* 1m */ { bli_gemm1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m,
/* 1m */ { bli_gemm1m, NULL, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m,
bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m },
/* nat */ { bli_gemmnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat,
/* nat */ { bli_gemmnat, bli_gemmtnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat,
bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat },
};
//
// NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2.
//
// BLIS provides APIs to modify this state during runtime. So, one application thread
// can modify the state, before another starts the corresponding BLIS operation.
// This is solved by making the induced method status array local to threads.
// BLIS provides APIs to modify this state during runtime. So, it's possible for one
// application thread to modify the state before another starts the corresponding
// BLIS operation. This is solved by making the induced method status array local to
// threads.
static BLIS_THREAD_LOCAL
bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
{
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */
/* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */
/* c z */
/* 3mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
/* 3mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 3m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
/* 3m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 4mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
/* 4mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 4mb */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
/* 4mb */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
/* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE},
/* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE},
{TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} },
};
@@ -99,6 +100,7 @@ bool PASTEMAC(opname,ind_has_avail)( num_t dt )
*/
GENFUNC( gemm, BLIS_GEMM )
GENFUNC( gemmt, BLIS_GEMMT )
GENFUNC( hemm, BLIS_HEMM )
GENFUNC( herk, BLIS_HERK )
GENFUNC( her2k, BLIS_HER2K )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -44,6 +45,7 @@ void_fp PASTEMAC(opname,ind_get_avail)( num_t dt );
/*bool PASTEMAC(opname,ind_has_avail)( num_t dt ); */
GENPROT( gemm )
GENPROT( gemmt )
GENPROT( hemm )
GENPROT( herk )
GENPROT( her2k )

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -67,6 +67,7 @@ void PASTEMAC(opname,imeth) \
}
GENFRONT( gemm, ind )
GENFRONT( gemmt, ind )
GENFRONT( her2k, ind )
GENFRONT( syr2k, ind )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -41,6 +42,7 @@
#define GENPROT( imeth ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
BLIS_EXPORT_BLIS void PASTEMAC(gemmt,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
BLIS_EXPORT_BLIS void PASTEMAC(herk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
BLIS_EXPORT_BLIS void PASTEMAC(her2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -41,7 +41,7 @@
// of executing one iteration of a for loop, plus the overhead of calling a
// function that does nothing (ie: the _cntx_init_stage() function).
// -- gemm/her2k/syr2k ---------------------------------------------------------
// -- gemm/her2k/syr2k/gemmt ---------------------------------------------------
#undef GENFRONT
#define GENFRONT( opname, cname, imeth ) \
@@ -80,6 +80,7 @@ void PASTEMAC(opname,imeth) \
#ifndef BLIS_ENABLE_SANDBOX
GENFRONT( gemm, gemm, nat )
#endif
GENFRONT( gemmt, gemm, nat )
GENFRONT( her2k, gemm, nat )
GENFRONT( syr2k, gemm, nat )

View File

@@ -80,46 +80,56 @@ void bli_sgemm_armv7a_int_4x4
// Vector for column 3
float32x4_t cv3;
if( rs_c == 1 )
if ( *beta != 0.0F )
{
// Load column 0
cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c );
// Load column 1
cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c );
// Load column 2
cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c );
// Load column 3
cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c );
}
if ( rs_c == 1 )
{
// Load column 0
cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c );
// Load column 1
cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c );
// Load column 2
cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c );
// Load column 3
cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c );
}
else
{
// Load column 0
cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
// Load column 1
cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
// Load column 2
cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
// Load column 3
cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
}
}
else
{
// Load column 0
cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
// Load column 1
cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
// Load column 2
cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
// Load column 3
cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
cv0 = vmovq_n_f32( 0.0 );
cv1 = vmovq_n_f32( 0.0 );
cv2 = vmovq_n_f32( 0.0 );
cv3 = vmovq_n_f32( 0.0 );
}
// Vector for accumulating column 0
@@ -142,15 +152,15 @@ void bli_sgemm_armv7a_int_4x4
// Initialize vector to 0.0
abv3 = vmovq_n_f32( 0.0 );
for ( i = 0; i < k_iter; ++i )
{
for ( i = 0; i < k_iter; ++i )
{
// Begin iter 0
av1 = vld1q_f32( a );
av1 = vld1q_f32( a );
__builtin_prefetch( a + 224 );
__builtin_prefetch( b + 224 );
bv1 = vld1q_f32( b );
bv1 = vld1q_f32( b );
abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
@@ -158,24 +168,24 @@ void bli_sgemm_armv7a_int_4x4
abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );
av2 = vld1q_f32( a+4 );
av2 = vld1q_f32( a+4 );
//__builtin_prefetch( a + 116 );
//__builtin_prefetch( b + 116 );
bv2 = vld1q_f32( b+4 );
bv2 = vld1q_f32( b+4 );
abv0 = vmlaq_lane_f32( abv0, av2, vget_low_f32(bv2), 0 );
abv1 = vmlaq_lane_f32( abv1, av2, vget_low_f32(bv2), 1 );
abv2 = vmlaq_lane_f32( abv2, av2, vget_high_f32(bv2), 0 );
abv3 = vmlaq_lane_f32( abv3, av2, vget_high_f32(bv2), 1 );
av3 = vld1q_f32( a+8 );
av3 = vld1q_f32( a+8 );
//__builtin_prefetch( a + 120 );
//__builtin_prefetch( b + 120 );
bv3 = vld1q_f32( b+8 );
bv3 = vld1q_f32( b+8 );
abv0 = vmlaq_lane_f32( abv0, av3, vget_low_f32(bv3), 0 );
abv1 = vmlaq_lane_f32( abv1, av3, vget_low_f32(bv3), 1 );
@@ -183,12 +193,12 @@ void bli_sgemm_armv7a_int_4x4
abv3 = vmlaq_lane_f32( abv3, av3, vget_high_f32(bv3), 1 );
av4 = vld1q_f32( a+12);
av4 = vld1q_f32( a+12);
//__builtin_prefetch( a + 124 );
//__builtin_prefetch( b + 124 );
bv4 = vld1q_f32( b+12);
bv4 = vld1q_f32( b+12);
abv0 = vmlaq_lane_f32( abv0, av4, vget_low_f32(bv4), 0 );
abv1 = vmlaq_lane_f32( abv1, av4, vget_low_f32(bv4), 1 );
@@ -197,71 +207,85 @@ void bli_sgemm_armv7a_int_4x4
a += 16;
b += 16;
}
a += 16;
b += 16;
}
for ( i = 0; i < k_left; ++i )
{
av1 = vld1q_f32( a );
for ( i = 0; i < k_left; ++i )
{
av1 = vld1q_f32( a );
__builtin_prefetch( a + 112 );
__builtin_prefetch( b + 112 );
bv1 = vld1q_f32( b );
bv1 = vld1q_f32( b );
abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 );
abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );
a += 4;
b += 4;
a += 4;
b += 4;
}
__builtin_prefetch( a_next );
__builtin_prefetch( b_next );
cv0 = vmulq_n_f32( cv0, *beta );
cv1 = vmulq_n_f32( cv1, *beta );
cv2 = vmulq_n_f32( cv2, *beta );
cv3 = vmulq_n_f32( cv3, *beta );
if ( *beta != 0.0F )
{
// Multiply C by beta and then accumulate alpha * A * B.
cv0 = vmulq_n_f32( cv0, *beta );
cv1 = vmulq_n_f32( cv1, *beta );
cv2 = vmulq_n_f32( cv2, *beta );
cv3 = vmulq_n_f32( cv3, *beta );
cv0 = vmlaq_f32( cv0, abv0, alphav );
cv1 = vmlaq_f32( cv1, abv1, alphav );
cv2 = vmlaq_f32( cv2, abv2, alphav );
cv3 = vmlaq_f32( cv3, abv3, alphav );
cv0 = vmlaq_f32( cv0, abv0, alphav );
cv1 = vmlaq_f32( cv1, abv1, alphav );
cv2 = vmlaq_f32( cv2, abv2, alphav );
cv3 = vmlaq_f32( cv3, abv3, alphav );
}
else
{
// Since beta = 0, skip straight to accumulating alpha * A * B.
// Note: C (cv?) was initialized to zero above.
cv0 = vmlaq_f32( cv0, abv0, alphav );
cv1 = vmlaq_f32( cv1, abv1, alphav );
cv2 = vmlaq_f32( cv2, abv2, alphav );
cv3 = vmlaq_f32( cv3, abv3, alphav );
}
if( rs_c == 1 )
if ( rs_c == 1 )
{
// Store column 0
vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 );
vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 );
// Store column 1
vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 );
vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 );
// Store column 2
vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 );
vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 );
// Store column 3
vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 );
vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 );
}
else{
else
{
// Store column 0
vst1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
vst1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
vst1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
vst1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
// Store column 1
vst1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
vst1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
vst1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
vst1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
// Store column 2
vst1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
vst1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
vst1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
vst1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
// Store column 3
vst1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
vst1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);

View File

@@ -0,0 +1,330 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
// -----------------------------------------------------------------------------
/*
   bli_scopyv_zen_int()

   Copies the n single-precision elements of vector x into vector y
   (y := x) using 256-bit AVX loads and stores.

   Parameters:
   - conjx: not referenced by this kernel.
   - n:     number of elements to copy. The kernel returns immediately
            when bli_zero_dim1( n ) is true.
   - x:     source vector, traversed with stride incx.
   - y:     destination vector, traversed with stride incy.
   - cntx:  not referenced by this kernel.

   When both strides are 1 the copy is performed with unaligned vector
   loads/stores in progressively smaller unrolled chunks (128, 64, 32,
   16 and 8 elements) followed by a scalar tail loop. Any other stride
   combination falls back to a scalar element-by-element loop.
*/
void bli_scopyv_zen_int
     (
       conj_t           conjx,
       dim_t            n,
       float*  restrict x, inc_t incx,
       float*  restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
    // Number of floats held by one __m256 register.
    const dim_t num_elem_per_reg = 8;
    dim_t       i = 0;
    __m256      xv[16];

    // If the vector dimension is zero return early.
    if ( bli_zero_dim1( n ) ) return;

    if ( incx == 1 && incy == 1 )
    {
        // Each loop bound clears the low bits of n, i.e. rounds n down to
        // a multiple of that loop's chunk size, and i carries over from
        // one loop to the next. For example, with n = 255:
        //   n & ~0x7F = 128: elements [  0, 128) are copied by the 1st loop
        //   n & ~0x3F = 192: elements [128, 192) are copied by the 2nd loop
        //   n & ~0x1F = 224: elements [192, 224) are copied by the 3rd loop
        //   n & ~0x0F = 240: elements [224, 240) are copied by the 4th loop
        //   n & ~0x07 = 248: elements [240, 248) are copied by the 5th loop
        // and the remaining elements [248, 255) by the scalar tail loop.

        // 128 elements (16 registers) per iteration.
        for ( i = 0; i < (n & (~0x7F)); i += 128 )
        {
            xv[0]  = _mm256_loadu_ps(x + num_elem_per_reg * 0);
            xv[1]  = _mm256_loadu_ps(x + num_elem_per_reg * 1);
            xv[2]  = _mm256_loadu_ps(x + num_elem_per_reg * 2);
            xv[3]  = _mm256_loadu_ps(x + num_elem_per_reg * 3);
            xv[4]  = _mm256_loadu_ps(x + num_elem_per_reg * 4);
            xv[5]  = _mm256_loadu_ps(x + num_elem_per_reg * 5);
            xv[6]  = _mm256_loadu_ps(x + num_elem_per_reg * 6);
            xv[7]  = _mm256_loadu_ps(x + num_elem_per_reg * 7);
            xv[8]  = _mm256_loadu_ps(x + num_elem_per_reg * 8);
            xv[9]  = _mm256_loadu_ps(x + num_elem_per_reg * 9);
            xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
            xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
            xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
            xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
            xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
            xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);

            _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
            _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
            _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
            _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
            _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
            _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
            _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
            _mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
            _mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
            _mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
            _mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
            _mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
            _mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
            _mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
            _mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);

            y += 128;
            x += 128;
        }

        // 64 elements (8 registers) per iteration.
        for ( ; i < (n & (~0x3F)); i += 64 )
        {
            xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
            xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
            xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
            xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
            xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
            xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
            xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
            xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);

            _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
            _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
            _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
            _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
            _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
            _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
            _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);

            y += 64;
            x += 64;
        }

        // 32 elements (4 registers) per iteration.
        for ( ; i < (n & (~0x1F)); i += 32 )
        {
            xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
            xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
            xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
            xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);

            _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
            _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
            _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);

            y += 32;
            x += 32;
        }

        // 16 elements (2 registers) per iteration.
        for ( ; i < (n & (~0x0F)); i += 16 )
        {
            xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
            xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);

            _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);

            y += 16;
            x += 16;
        }

        // 8 elements (1 register) per iteration.
        for ( ; i < (n & (~0x07)); i += 8 )
        {
            xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);

            _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);

            y += 8;
            x += 8;
        }

        // Scalar tail: fewer than 8 elements remain.
        for ( ; i < n; ++i )
        {
            *y++ = *x++;
        }
    }
    else
    {
        // General-stride fallback: copy one element at a time. The
        // function-scope index is reused here rather than declaring a
        // second, shadowing 'i'.
        for ( i = 0; i < n; ++i )
        {
            *y = *x;

            x += incx;
            y += incy;
        }
    }
}
// -----------------------------------------------------------------------------
/*
   bli_dcopyv_zen_int()

   Copies the n double-precision elements of vector x into vector y
   (y := x) using 256-bit AVX loads and stores.

   Parameters:
   - conjx: not referenced by this kernel.
   - n:     number of elements to copy. The kernel returns immediately
            when bli_zero_dim1( n ) is true.
   - x:     source vector, traversed with stride incx.
   - y:     destination vector, traversed with stride incy.
   - cntx:  not referenced by this kernel.

   When both strides are 1 the copy is performed with unaligned vector
   loads/stores in progressively smaller unrolled chunks (64, 32, 16, 8
   and 4 elements) followed by a scalar tail loop. Any other stride
   combination falls back to a scalar element-by-element loop.
*/
void bli_dcopyv_zen_int
     (
       conj_t           conjx,
       dim_t            n,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
    // Number of doubles held by one __m256d register.
    const dim_t num_elem_per_reg = 4;
    dim_t       i = 0;
    __m256d     xv[16];

    // If the vector dimension is zero return early.
    if ( bli_zero_dim1( n ) ) return;

    if ( incx == 1 && incy == 1 )
    {
        // Each loop bound clears the low bits of n, i.e. rounds n down to
        // a multiple of that loop's chunk size, and i carries over from
        // one loop to the next, so every loop picks up where the previous
        // (larger-chunk) loop stopped.

        // 64 elements (16 registers) per iteration.
        for ( i = 0; i < (n & (~0x3F)); i += 64 )
        {
            xv[0]  = _mm256_loadu_pd(x + num_elem_per_reg * 0);
            xv[1]  = _mm256_loadu_pd(x + num_elem_per_reg * 1);
            xv[2]  = _mm256_loadu_pd(x + num_elem_per_reg * 2);
            xv[3]  = _mm256_loadu_pd(x + num_elem_per_reg * 3);
            xv[4]  = _mm256_loadu_pd(x + num_elem_per_reg * 4);
            xv[5]  = _mm256_loadu_pd(x + num_elem_per_reg * 5);
            xv[6]  = _mm256_loadu_pd(x + num_elem_per_reg * 6);
            xv[7]  = _mm256_loadu_pd(x + num_elem_per_reg * 7);
            xv[8]  = _mm256_loadu_pd(x + num_elem_per_reg * 8);
            xv[9]  = _mm256_loadu_pd(x + num_elem_per_reg * 9);
            xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
            xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
            xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
            xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
            xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
            xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);

            _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
            _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
            _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
            _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
            _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
            _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
            _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
            _mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
            _mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
            _mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
            _mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
            _mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
            _mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
            _mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
            _mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);

            y += num_elem_per_reg * 16;
            x += num_elem_per_reg * 16;
        }

        // 32 elements (8 registers) per iteration.
        for ( ; i < (n & (~0x1F)); i += 32 )
        {
            xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
            xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
            xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
            xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
            xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
            xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
            xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
            xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);

            _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
            _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
            _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
            _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
            _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
            _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
            _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);

            y += num_elem_per_reg * 8;
            x += num_elem_per_reg * 8;
        }

        // 16 elements (4 registers) per iteration.
        for ( ; i < (n & (~0xF)); i += 16 )
        {
            xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
            xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
            xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
            xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);

            _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
            _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
            _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);

            y += num_elem_per_reg * 4;
            x += num_elem_per_reg * 4;
        }

        // 8 elements (2 registers) per iteration.
        for ( ; i < (n & (~0x07)); i += 8 )
        {
            xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
            xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);

            _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
            _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);

            y += num_elem_per_reg * 2;
            x += num_elem_per_reg * 2;
        }

        // 4 elements (1 register) per iteration.
        for ( ; i < (n & (~0x03)); i += 4 )
        {
            xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);

            _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);

            y += num_elem_per_reg;
            x += num_elem_per_reg;
        }

        // Scalar tail: fewer than 4 elements remain.
        for ( ; i < n; ++i )
        {
            *y++ = *x++;
        }
    }
    else
    {
        // General-stride fallback: copy one element at a time.
        for ( i = 0; i < n; ++i )
        {
            *y = *x;

            x += incx;
            y += incy;
        }
    }
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -73,11 +73,11 @@ void bli_sdotv_zen_int10
float* restrict x0;
float* restrict y0;
float rho0;
float rho0 = 0.0;
__m256 xv[10];
__m256 yv[10];
v8sf_t rhov[2];
v8sf_t rhov[10];
// If the vector dimension is zero, return early.
if ( bli_zero_dim1( n ) )
@@ -96,8 +96,16 @@ void bli_sdotv_zen_int10
{
rhov[0].v = _mm256_setzero_ps();
rhov[1].v = _mm256_setzero_ps();
rhov[2].v = _mm256_setzero_ps();
rhov[3].v = _mm256_setzero_ps();
rhov[4].v = _mm256_setzero_ps();
rhov[5].v = _mm256_setzero_ps();
rhov[6].v = _mm256_setzero_ps();
rhov[7].v = _mm256_setzero_ps();
rhov[8].v = _mm256_setzero_ps();
rhov[9].v = _mm256_setzero_ps();
for ( i = 0; (i + 79) < n; i += 80 )
for ( i = 0 ; (i + 79) < n; i += 80 )
{
// 80 elements will be processed per loop; 10 FMAs will run per loop.
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -124,19 +132,25 @@ void bli_sdotv_zen_int10
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[1].v );
rhov[0].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[1].v );
rhov[0].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[1].v );
rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v );
rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v );
rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v );
rhov[5].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[5].v );
rhov[6].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[6].v );
rhov[7].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[7].v );
rhov[8].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[8].v );
rhov[9].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[9].v );
x0 += 10*n_elem_per_reg;
y0 += 10*n_elem_per_reg;
}
rhov[0].v += rhov[5].v;
rhov[1].v += rhov[6].v;
rhov[2].v += rhov[7].v;
rhov[3].v += rhov[8].v;
rhov[4].v += rhov[9].v;
for ( ; (i + 39) < n; i += 40 )
{
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -153,34 +167,17 @@ void bli_sdotv_zen_int10
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v );
rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v );
rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v );
x0 += 5*n_elem_per_reg;
y0 += 5*n_elem_per_reg;
}
for ( ; (i + 31) < n; i += 32 )
{
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
x0 += 4*n_elem_per_reg;
y0 += 4*n_elem_per_reg;
}
rhov[0].v += rhov[2].v;
rhov[1].v += rhov[3].v;
rhov[0].v += rhov[4].v;
for ( ; (i + 15) < n; i += 16 )
{
@@ -197,6 +194,8 @@ void bli_sdotv_zen_int10
y0 += 2*n_elem_per_reg;
}
rhov[0].v += rhov[1].v;
for ( ; (i + 7) < n; i += 8 )
{
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
@@ -211,19 +210,15 @@ void bli_sdotv_zen_int10
for ( ; (i + 0) < n; i += 1 )
{
rhov[0].f[0] += x0[i] * y0[i];
rho0 += (*x0) * (*y0);
x0 += 1;
y0 += 1;
}
v8sf_t onev;
onev.v = _mm256_set1_ps( 1.0f );
rhov[0].v = _mm256_dp_ps( rhov[0].v, onev.v, 0xf1 );
rhov[1].v = _mm256_dp_ps( rhov[1].v, onev.v, 0xf1 );
// Manually add the results from above to finish the sum.
rho0 += rhov[0].f[0] + rhov[0].f[4];
rho0 += rhov[1].f[0] + rhov[1].f[4];
rho0 += rhov[0].f[0] + rhov[0].f[1] +
rhov[0].f[2] + rhov[0].f[3] +
rhov[0].f[4] + rhov[0].f[5] +
rhov[0].f[6] + rhov[0].f[7];
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
@@ -269,11 +264,11 @@ void bli_ddotv_zen_int10
double* restrict x0;
double* restrict y0;
double rho0;
double rho0 = 0.0;
__m256d xv[10];
__m256d yv[10];
v4df_t rhov[2];
v4df_t rhov[10];
// If the vector dimension is zero, return early.
if ( bli_zero_dim1( n ) )
@@ -292,6 +287,14 @@ void bli_ddotv_zen_int10
{
rhov[0].v = _mm256_setzero_pd();
rhov[1].v = _mm256_setzero_pd();
rhov[2].v = _mm256_setzero_pd();
rhov[3].v = _mm256_setzero_pd();
rhov[4].v = _mm256_setzero_pd();
rhov[5].v = _mm256_setzero_pd();
rhov[6].v = _mm256_setzero_pd();
rhov[7].v = _mm256_setzero_pd();
rhov[8].v = _mm256_setzero_pd();
rhov[9].v = _mm256_setzero_pd();
for ( i = 0; (i + 39) < n; i += 40 )
{
@@ -320,19 +323,25 @@ void bli_ddotv_zen_int10
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[1].v );
rhov[0].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[1].v );
rhov[0].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[1].v );
rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v );
rhov[5].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[5].v );
rhov[6].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[6].v );
rhov[7].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[7].v );
rhov[8].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[8].v );
rhov[9].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[9].v );
x0 += 10*n_elem_per_reg;
y0 += 10*n_elem_per_reg;
}
rhov[0].v += rhov[5].v;
rhov[1].v += rhov[6].v;
rhov[2].v += rhov[7].v;
rhov[3].v += rhov[8].v;
rhov[4].v += rhov[9].v;
for ( ; (i + 19) < n; i += 20 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -349,14 +358,16 @@ void bli_ddotv_zen_int10
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v );
x0 += 5*n_elem_per_reg;
y0 += 5*n_elem_per_reg;
}
rhov[0].v += rhov[4].v;
for ( ; (i + 15) < n; i += 16 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -371,13 +382,16 @@ void bli_ddotv_zen_int10
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
x0 += 4*n_elem_per_reg;
y0 += 4*n_elem_per_reg;
}
rhov[0].v += rhov[2].v;
rhov[1].v += rhov[3].v;
for ( ; (i + 7) < n; i += 8 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -393,6 +407,8 @@ void bli_ddotv_zen_int10
y0 += 2*n_elem_per_reg;
}
rhov[0].v += rhov[1].v;
for ( ; (i + 3) < n; i += 4 )
{
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
@@ -407,12 +423,14 @@ void bli_ddotv_zen_int10
for ( ; (i + 0) < n; i += 1 )
{
rhov[0].d[0] += x0[i] * y0[i];
rho0 += (*x0) * (*y0);
x0 += 1;
y0 += 1;
}
// Manually add the results from above to finish the sum.
rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
rho0 += rhov[1].d[0] + rhov[1].d[1] + rhov[1].d[2] + rhov[1].d[3];
rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when

View File

@@ -80,9 +80,18 @@ void bli_sscalv_zen_int10
// If alpha is zero, use setv.
if ( PASTEMAC(s,eq0)( *alpha ) )
{
float* zero = bli_s0;
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
float* zero = bli_s0;
#ifdef BLIS_CONFIG_ZEN2
bli_ssetv_zen_int
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx
);
#else
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
f
(
BLIS_NO_CONJUGATE,
@@ -91,6 +100,7 @@ void bli_sscalv_zen_int10
x, incx,
cntx
);
#endif
return;
}
@@ -270,8 +280,18 @@ void bli_dscalv_zen_int10
// If alpha is zero, use setv.
if ( PASTEMAC(d,eq0)( *alpha ) )
{
double* zero = bli_d0;
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
double* zero = bli_d0;
#ifdef BLIS_CONFIG_ZEN2
bli_dsetv_zen_int
(
BLIS_NO_CONJUGATE,
n,
zero,
x, incx,
cntx
);
#else
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
f
(
@@ -281,6 +301,7 @@ void bli_dscalv_zen_int10
x, incx,
cntx
);
#endif
return;
}

View File

@@ -0,0 +1,228 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
// -----------------------------------------------------------------------------
/*
   bli_ssetv_zen_int

   Set every element of the single-precision vector x to the scalar
   pointed to by alpha.

   Parameters:
     conjalpha  Not referenced by this kernel.
     n          Number of elements to write.
     alpha      Address of the value to store.
     x, incx    Destination vector and the distance (in elements) between
                consecutive entries.
     cntx       Not referenced by this kernel.

   When incx is 1 the work is done with unaligned 256-bit stores (eight
   floats each). Blocks of 128 elements are written for as long as they
   fit, then at most one block each of 64, 32, 16 and 8 elements, and a
   scalar loop writes whatever remains. For example, with n = 255 the
   blocks cover [0,128), [128,192), [192,224), [224,240), [240,248) and
   the scalar loop finishes [248,255). Any other stride is handled one
   element at a time.
*/
void bli_ssetv_zen_int
     (
       conj_t           conjalpha,
       dim_t            n,
       float*  restrict alpha,
       float*  restrict x, inc_t incx,
       cntx_t* restrict cntx
     )
{
    // An empty vector needs no work.
    if ( bli_zero_dim1( n ) ) return;

    // General stride: plain scalar stores.
    if ( incx != 1 )
    {
        for ( dim_t k = 0; k < n; ++k )
        {
            *x = *alpha;
            x += incx;
        }
        return;
    }

    // Unit stride from here on.
    const dim_t  vlen   = 8;                              // floats per 256-bit register
    const __m256 av     = _mm256_broadcast_ss( alpha );   // alpha replicated in every lane
    dim_t        n_left = n;                              // elements still to be written

    // 16 registers (128 elements) per pass.
    while ( n_left >= 128 )
    {
        _mm256_storeu_ps( x +  0*vlen, av );
        _mm256_storeu_ps( x +  1*vlen, av );
        _mm256_storeu_ps( x +  2*vlen, av );
        _mm256_storeu_ps( x +  3*vlen, av );
        _mm256_storeu_ps( x +  4*vlen, av );
        _mm256_storeu_ps( x +  5*vlen, av );
        _mm256_storeu_ps( x +  6*vlen, av );
        _mm256_storeu_ps( x +  7*vlen, av );
        _mm256_storeu_ps( x +  8*vlen, av );
        _mm256_storeu_ps( x +  9*vlen, av );
        _mm256_storeu_ps( x + 10*vlen, av );
        _mm256_storeu_ps( x + 11*vlen, av );
        _mm256_storeu_ps( x + 12*vlen, av );
        _mm256_storeu_ps( x + 13*vlen, av );
        _mm256_storeu_ps( x + 14*vlen, av );
        _mm256_storeu_ps( x + 15*vlen, av );

        x      += 128;
        n_left -= 128;
    }

    // Fewer than 128 elements remain, so each smaller block occurs at
    // most once.
    if ( n_left >= 64 )
    {
        _mm256_storeu_ps( x + 0*vlen, av );
        _mm256_storeu_ps( x + 1*vlen, av );
        _mm256_storeu_ps( x + 2*vlen, av );
        _mm256_storeu_ps( x + 3*vlen, av );
        _mm256_storeu_ps( x + 4*vlen, av );
        _mm256_storeu_ps( x + 5*vlen, av );
        _mm256_storeu_ps( x + 6*vlen, av );
        _mm256_storeu_ps( x + 7*vlen, av );

        x      += 64;
        n_left -= 64;
    }

    if ( n_left >= 32 )
    {
        _mm256_storeu_ps( x + 0*vlen, av );
        _mm256_storeu_ps( x + 1*vlen, av );
        _mm256_storeu_ps( x + 2*vlen, av );
        _mm256_storeu_ps( x + 3*vlen, av );

        x      += 32;
        n_left -= 32;
    }

    if ( n_left >= 16 )
    {
        _mm256_storeu_ps( x + 0*vlen, av );
        _mm256_storeu_ps( x + 1*vlen, av );

        x      += 16;
        n_left -= 16;
    }

    if ( n_left >= 8 )
    {
        _mm256_storeu_ps( x, av );

        x      += 8;
        n_left -= 8;
    }

    // Scalar clean-up for the last (fewer than eight) elements.
    for ( ; n_left > 0; --n_left )
    {
        *x++ = *alpha;
    }
}
/*
   bli_dsetv_zen_int

   Set every element of the double-precision vector x to the scalar
   pointed to by alpha.

   Parameters:
     conjalpha  Not referenced by this kernel.
     n          Number of elements to write.
     alpha      Address of the value to store.
     x, incx    Destination vector and the distance (in elements) between
                consecutive entries.
     cntx       Not referenced by this kernel.

   When incx is 1 the work is done with unaligned 256-bit stores (four
   doubles each). Blocks of 64 elements are written for as long as they
   fit, then at most one block each of 32, 16, 8 and 4 elements, and a
   scalar loop writes whatever remains. Any other stride is handled one
   element at a time.
*/
void bli_dsetv_zen_int
     (
       conj_t           conjalpha,
       dim_t            n,
       double* restrict alpha,
       double* restrict x, inc_t incx,
       cntx_t* restrict cntx
     )
{
    // An empty vector needs no work.
    if ( bli_zero_dim1( n ) ) return;

    // General stride: plain scalar stores.
    if ( incx != 1 )
    {
        for ( dim_t k = 0; k < n; ++k )
        {
            *x = *alpha;
            x += incx;
        }
        return;
    }

    // Unit stride from here on.
    const dim_t   vlen   = 4;                              // doubles per 256-bit register
    const __m256d av     = _mm256_broadcast_sd( alpha );   // alpha replicated in every lane
    dim_t         n_left = n;                              // elements still to be written

    // 16 registers (64 elements) per pass.
    while ( n_left >= 64 )
    {
        _mm256_storeu_pd( x +  0*vlen, av );
        _mm256_storeu_pd( x +  1*vlen, av );
        _mm256_storeu_pd( x +  2*vlen, av );
        _mm256_storeu_pd( x +  3*vlen, av );
        _mm256_storeu_pd( x +  4*vlen, av );
        _mm256_storeu_pd( x +  5*vlen, av );
        _mm256_storeu_pd( x +  6*vlen, av );
        _mm256_storeu_pd( x +  7*vlen, av );
        _mm256_storeu_pd( x +  8*vlen, av );
        _mm256_storeu_pd( x +  9*vlen, av );
        _mm256_storeu_pd( x + 10*vlen, av );
        _mm256_storeu_pd( x + 11*vlen, av );
        _mm256_storeu_pd( x + 12*vlen, av );
        _mm256_storeu_pd( x + 13*vlen, av );
        _mm256_storeu_pd( x + 14*vlen, av );
        _mm256_storeu_pd( x + 15*vlen, av );

        x      += 16*vlen;
        n_left -= 64;
    }

    // Fewer than 64 elements remain, so each smaller block occurs at
    // most once.
    if ( n_left >= 32 )
    {
        _mm256_storeu_pd( x + 0*vlen, av );
        _mm256_storeu_pd( x + 1*vlen, av );
        _mm256_storeu_pd( x + 2*vlen, av );
        _mm256_storeu_pd( x + 3*vlen, av );
        _mm256_storeu_pd( x + 4*vlen, av );
        _mm256_storeu_pd( x + 5*vlen, av );
        _mm256_storeu_pd( x + 6*vlen, av );
        _mm256_storeu_pd( x + 7*vlen, av );

        x      += 8*vlen;
        n_left -= 32;
    }

    if ( n_left >= 16 )
    {
        _mm256_storeu_pd( x + 0*vlen, av );
        _mm256_storeu_pd( x + 1*vlen, av );
        _mm256_storeu_pd( x + 2*vlen, av );
        _mm256_storeu_pd( x + 3*vlen, av );

        x      += 4*vlen;
        n_left -= 16;
    }

    if ( n_left >= 8 )
    {
        _mm256_storeu_pd( x + 0*vlen, av );
        _mm256_storeu_pd( x + 1*vlen, av );

        x      += 2*vlen;
        n_left -= 8;
    }

    if ( n_left >= 4 )
    {
        _mm256_storeu_pd( x, av );

        x      += vlen;
        n_left -= 4;
    }

    // Scalar clean-up for the last (fewer than four) elements.
    for ( ; n_left > 0; --n_left )
    {
        *x++ = *alpha;
    }
}

View File

@@ -0,0 +1,344 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
/* Union giving two views of one 256-bit AVX register that holds eight
   single-precision values: `v` is the whole-register view and `f` is
   the per-element array view. The array member requests 64-byte
   alignment.
   NOTE(review): neither union is referenced by the swapv kernels
   visible in this file -- confirm whether they are still needed here. */
typedef union
{
__m256 v;
float f[8] __attribute__((aligned(64)));
} v8sf_t;
/* Union giving two views of one 256-bit AVX register that holds four
   double-precision values: `v` is the whole-register view and `d` is
   the per-element array view. The array member requests 64-byte
   alignment. */
typedef union
{
__m256d v;
double d[4] __attribute__((aligned(64)));
} v4df_t;
// -----------------------------------------------------------------------------
/*
   bli_sswapv_zen_int8

   Exchange the contents of the single-precision vectors x and y.

   Parameters:
     n        Number of elements to exchange.
     x, incx  First vector and the distance (in elements) between its
              consecutive entries.
     y, incy  Second vector and the distance (in elements) between its
              consecutive entries.
     cntx     Not referenced by this kernel.

   When both strides are 1 the exchange is done with unaligned 256-bit
   loads and stores (eight floats per register): blocks of 64 elements
   for as long as they fit, then at most one block each of 32, 16 and 8
   elements, then a scalar loop for the remainder. Within a block all
   loads are issued before any store. Any other stride combination is
   handled one element at a time.
*/
void bli_sswapv_zen_int8
     (
       dim_t            n,
       float*  restrict x, inc_t incx,
       float*  restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
    const dim_t vlen = 8;   // floats per 256-bit register

    // An empty vector needs no work.
    if ( bli_zero_dim1( n ) ) return;

    float* restrict xp = x;
    float* restrict yp = y;

    // General stride: swap one element pair at a time.
    if ( incx != 1 || incy != 1 )
    {
        for ( dim_t k = 0; k < n; ++k )
        {
            PASTEMAC(s,swaps)( (*xp), (*yp) );

            xp += incx;
            yp += incy;
        }
        return;
    }

    // Unit stride from here on.
    __m256 xr[8];
    __m256 yr[8];
    dim_t  n_left = n;   // elements still to be exchanged

    // Eight registers (64 elements) from each vector per pass.
    for ( ; n_left >= 64; n_left -= 64 )
    {
        xr[0] = _mm256_loadu_ps( xp + 0*vlen );
        yr[0] = _mm256_loadu_ps( yp + 0*vlen );
        xr[1] = _mm256_loadu_ps( xp + 1*vlen );
        yr[1] = _mm256_loadu_ps( yp + 1*vlen );
        xr[2] = _mm256_loadu_ps( xp + 2*vlen );
        yr[2] = _mm256_loadu_ps( yp + 2*vlen );
        xr[3] = _mm256_loadu_ps( xp + 3*vlen );
        yr[3] = _mm256_loadu_ps( yp + 3*vlen );
        xr[4] = _mm256_loadu_ps( xp + 4*vlen );
        yr[4] = _mm256_loadu_ps( yp + 4*vlen );
        xr[5] = _mm256_loadu_ps( xp + 5*vlen );
        yr[5] = _mm256_loadu_ps( yp + 5*vlen );
        xr[6] = _mm256_loadu_ps( xp + 6*vlen );
        yr[6] = _mm256_loadu_ps( yp + 6*vlen );
        xr[7] = _mm256_loadu_ps( xp + 7*vlen );
        yr[7] = _mm256_loadu_ps( yp + 7*vlen );

        _mm256_storeu_ps( xp + 0*vlen, yr[0] );
        _mm256_storeu_ps( yp + 0*vlen, xr[0] );
        _mm256_storeu_ps( xp + 1*vlen, yr[1] );
        _mm256_storeu_ps( yp + 1*vlen, xr[1] );
        _mm256_storeu_ps( xp + 2*vlen, yr[2] );
        _mm256_storeu_ps( yp + 2*vlen, xr[2] );
        _mm256_storeu_ps( xp + 3*vlen, yr[3] );
        _mm256_storeu_ps( yp + 3*vlen, xr[3] );
        _mm256_storeu_ps( xp + 4*vlen, yr[4] );
        _mm256_storeu_ps( yp + 4*vlen, xr[4] );
        _mm256_storeu_ps( xp + 5*vlen, yr[5] );
        _mm256_storeu_ps( yp + 5*vlen, xr[5] );
        _mm256_storeu_ps( xp + 6*vlen, yr[6] );
        _mm256_storeu_ps( yp + 6*vlen, xr[6] );
        _mm256_storeu_ps( xp + 7*vlen, yr[7] );
        _mm256_storeu_ps( yp + 7*vlen, xr[7] );

        xp += 8*vlen;
        yp += 8*vlen;
    }

    // Fewer than 64 elements remain, so each smaller block occurs at
    // most once.
    if ( n_left >= 32 )
    {
        xr[0] = _mm256_loadu_ps( xp + 0*vlen );
        yr[0] = _mm256_loadu_ps( yp + 0*vlen );
        xr[1] = _mm256_loadu_ps( xp + 1*vlen );
        yr[1] = _mm256_loadu_ps( yp + 1*vlen );
        xr[2] = _mm256_loadu_ps( xp + 2*vlen );
        yr[2] = _mm256_loadu_ps( yp + 2*vlen );
        xr[3] = _mm256_loadu_ps( xp + 3*vlen );
        yr[3] = _mm256_loadu_ps( yp + 3*vlen );

        _mm256_storeu_ps( xp + 0*vlen, yr[0] );
        _mm256_storeu_ps( yp + 0*vlen, xr[0] );
        _mm256_storeu_ps( xp + 1*vlen, yr[1] );
        _mm256_storeu_ps( yp + 1*vlen, xr[1] );
        _mm256_storeu_ps( xp + 2*vlen, yr[2] );
        _mm256_storeu_ps( yp + 2*vlen, xr[2] );
        _mm256_storeu_ps( xp + 3*vlen, yr[3] );
        _mm256_storeu_ps( yp + 3*vlen, xr[3] );

        xp     += 4*vlen;
        yp     += 4*vlen;
        n_left -= 32;
    }

    if ( n_left >= 16 )
    {
        xr[0] = _mm256_loadu_ps( xp + 0*vlen );
        yr[0] = _mm256_loadu_ps( yp + 0*vlen );
        xr[1] = _mm256_loadu_ps( xp + 1*vlen );
        yr[1] = _mm256_loadu_ps( yp + 1*vlen );

        _mm256_storeu_ps( xp + 0*vlen, yr[0] );
        _mm256_storeu_ps( yp + 0*vlen, xr[0] );
        _mm256_storeu_ps( xp + 1*vlen, yr[1] );
        _mm256_storeu_ps( yp + 1*vlen, xr[1] );

        xp     += 2*vlen;
        yp     += 2*vlen;
        n_left -= 16;
    }

    if ( n_left >= 8 )
    {
        xr[0] = _mm256_loadu_ps( xp );
        yr[0] = _mm256_loadu_ps( yp );

        _mm256_storeu_ps( xp, yr[0] );
        _mm256_storeu_ps( yp, xr[0] );

        xp     += vlen;
        yp     += vlen;
        n_left -= 8;
    }

    // Scalar clean-up for the last (fewer than eight) elements.
    for ( ; n_left > 0; --n_left )
    {
        PASTEMAC(s,swaps)( (*xp), (*yp) );

        ++xp;
        ++yp;
    }
}
//--------------------------------------------------------------------------------
/*
   bli_dswapv_zen_int8

   Exchange the contents of the double-precision vectors x and y.

   Parameters:
     n        Number of elements to exchange.
     x, incx  First vector and the distance (in elements) between its
              consecutive entries.
     y, incy  Second vector and the distance (in elements) between its
              consecutive entries.
     cntx     Not referenced by this kernel.

   When both strides are 1 the exchange is done with unaligned 256-bit
   loads and stores (four doubles per register): blocks of 32 elements
   for as long as they fit, then at most one block each of 16, 8 and 4
   elements, then a scalar loop for the remainder. Within a block all
   loads are issued before any store. Any other stride combination is
   handled one element at a time.
*/
void bli_dswapv_zen_int8
     (
       dim_t            n,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
    const dim_t vlen = 4;   // doubles per 256-bit register

    // An empty vector needs no work.
    if ( bli_zero_dim1( n ) ) return;

    double* restrict xp = x;
    double* restrict yp = y;

    // General stride: swap one element pair at a time.
    if ( incx != 1 || incy != 1 )
    {
        for ( dim_t k = 0; k < n; ++k )
        {
            PASTEMAC(d,swaps)( (*xp), (*yp) );

            xp += incx;
            yp += incy;
        }
        return;
    }

    // Unit stride from here on.
    __m256d xr[8];
    __m256d yr[8];
    dim_t   n_left = n;   // elements still to be exchanged

    // Eight registers (32 elements) from each vector per pass.
    for ( ; n_left >= 32; n_left -= 32 )
    {
        xr[0] = _mm256_loadu_pd( xp + 0*vlen );
        yr[0] = _mm256_loadu_pd( yp + 0*vlen );
        xr[1] = _mm256_loadu_pd( xp + 1*vlen );
        yr[1] = _mm256_loadu_pd( yp + 1*vlen );
        xr[2] = _mm256_loadu_pd( xp + 2*vlen );
        yr[2] = _mm256_loadu_pd( yp + 2*vlen );
        xr[3] = _mm256_loadu_pd( xp + 3*vlen );
        yr[3] = _mm256_loadu_pd( yp + 3*vlen );
        xr[4] = _mm256_loadu_pd( xp + 4*vlen );
        yr[4] = _mm256_loadu_pd( yp + 4*vlen );
        xr[5] = _mm256_loadu_pd( xp + 5*vlen );
        yr[5] = _mm256_loadu_pd( yp + 5*vlen );
        xr[6] = _mm256_loadu_pd( xp + 6*vlen );
        yr[6] = _mm256_loadu_pd( yp + 6*vlen );
        xr[7] = _mm256_loadu_pd( xp + 7*vlen );
        yr[7] = _mm256_loadu_pd( yp + 7*vlen );

        _mm256_storeu_pd( xp + 0*vlen, yr[0] );
        _mm256_storeu_pd( yp + 0*vlen, xr[0] );
        _mm256_storeu_pd( xp + 1*vlen, yr[1] );
        _mm256_storeu_pd( yp + 1*vlen, xr[1] );
        _mm256_storeu_pd( xp + 2*vlen, yr[2] );
        _mm256_storeu_pd( yp + 2*vlen, xr[2] );
        _mm256_storeu_pd( xp + 3*vlen, yr[3] );
        _mm256_storeu_pd( yp + 3*vlen, xr[3] );
        _mm256_storeu_pd( xp + 4*vlen, yr[4] );
        _mm256_storeu_pd( yp + 4*vlen, xr[4] );
        _mm256_storeu_pd( xp + 5*vlen, yr[5] );
        _mm256_storeu_pd( yp + 5*vlen, xr[5] );
        _mm256_storeu_pd( xp + 6*vlen, yr[6] );
        _mm256_storeu_pd( yp + 6*vlen, xr[6] );
        _mm256_storeu_pd( xp + 7*vlen, yr[7] );
        _mm256_storeu_pd( yp + 7*vlen, xr[7] );

        xp += 8*vlen;
        yp += 8*vlen;
    }

    // Fewer than 32 elements remain, so each smaller block occurs at
    // most once.
    if ( n_left >= 16 )
    {
        xr[0] = _mm256_loadu_pd( xp + 0*vlen );
        yr[0] = _mm256_loadu_pd( yp + 0*vlen );
        xr[1] = _mm256_loadu_pd( xp + 1*vlen );
        yr[1] = _mm256_loadu_pd( yp + 1*vlen );
        xr[2] = _mm256_loadu_pd( xp + 2*vlen );
        yr[2] = _mm256_loadu_pd( yp + 2*vlen );
        xr[3] = _mm256_loadu_pd( xp + 3*vlen );
        yr[3] = _mm256_loadu_pd( yp + 3*vlen );

        _mm256_storeu_pd( xp + 0*vlen, yr[0] );
        _mm256_storeu_pd( yp + 0*vlen, xr[0] );
        _mm256_storeu_pd( xp + 1*vlen, yr[1] );
        _mm256_storeu_pd( yp + 1*vlen, xr[1] );
        _mm256_storeu_pd( xp + 2*vlen, yr[2] );
        _mm256_storeu_pd( yp + 2*vlen, xr[2] );
        _mm256_storeu_pd( xp + 3*vlen, yr[3] );
        _mm256_storeu_pd( yp + 3*vlen, xr[3] );

        xp     += 4*vlen;
        yp     += 4*vlen;
        n_left -= 16;
    }

    if ( n_left >= 8 )
    {
        xr[0] = _mm256_loadu_pd( xp + 0*vlen );
        yr[0] = _mm256_loadu_pd( yp + 0*vlen );
        xr[1] = _mm256_loadu_pd( xp + 1*vlen );
        yr[1] = _mm256_loadu_pd( yp + 1*vlen );

        _mm256_storeu_pd( xp + 0*vlen, yr[0] );
        _mm256_storeu_pd( yp + 0*vlen, xr[0] );
        _mm256_storeu_pd( xp + 1*vlen, yr[1] );
        _mm256_storeu_pd( yp + 1*vlen, xr[1] );

        xp     += 2*vlen;
        yp     += 2*vlen;
        n_left -= 8;
    }

    if ( n_left >= 4 )
    {
        xr[0] = _mm256_loadu_pd( xp );
        yr[0] = _mm256_loadu_pd( yp );

        _mm256_storeu_pd( xp, yr[0] );
        _mm256_storeu_pd( yp, xr[0] );

        xp     += vlen;
        yp     += vlen;
        n_left -= 4;
    }

    // Scalar clean-up for the last (fewer than four) elements.
    for ( ; n_left > 0; --n_left )
    {
        PASTEMAC(d,swaps)( (*xp), (*yp) );

        ++xp;
        ++yp;
    }
}

View File

@@ -4,8 +4,8 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,8 +4,8 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -32,6 +33,13 @@
*/
// -- level-1m --
PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
PACKM_KER_PROT(double, d, packm_6xk_gen_zen)
PACKM_KER_PROT(double, d, packm_8xk_nn_zen)
PACKM_KER_PROT(double, d, packm_6xk_nn_zen)
// -- level-1v --
// amaxv (intrinsics)
@@ -42,17 +50,17 @@ AMAXV_KER_PROT( double, d, amaxv_zen_int )
AXPYV_KER_PROT( float, s, axpyv_zen_int )
AXPYV_KER_PROT( double, d, axpyv_zen_int )
// axpyv (intrinsics unrolled x10)
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
AXPYV_KER_PROT( double, d, axpyv_zen_int10 )
// axpyv (intrinsics unrolled x10)
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
AXPYV_KER_PROT( double, d, axpyv_zen_int10 )
// dotv (intrinsics)
DOTV_KER_PROT( float, s, dotv_zen_int )
DOTV_KER_PROT( double, d, dotv_zen_int )
// dotv (intrinsics, unrolled x10)
DOTV_KER_PROT( float, s, dotv_zen_int10 )
DOTV_KER_PROT( double, d, dotv_zen_int10 )
// dotv (intrinsics, unrolled x10)
DOTV_KER_PROT( float, s, dotv_zen_int10 )
DOTV_KER_PROT( double, d, dotv_zen_int10 )
// dotxv (intrinsics)
DOTXV_KER_PROT( float, s, dotxv_zen_int )
@@ -62,9 +70,21 @@ DOTXV_KER_PROT( double, d, dotxv_zen_int )
SCALV_KER_PROT( float, s, scalv_zen_int )
SCALV_KER_PROT( double, d, scalv_zen_int )
// scalv (intrinsics unrolled x10)
SCALV_KER_PROT( float, s, scalv_zen_int10 )
SCALV_KER_PROT( double, d, scalv_zen_int10 )
// scalv (intrinsics unrolled x10)
SCALV_KER_PROT( float, s, scalv_zen_int10 )
SCALV_KER_PROT( double, d, scalv_zen_int10 )
// swapv (intrinsics)
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
SWAPV_KER_PROT(double, d, swapv_zen_int8 )
// copyv (intrinsics)
COPYV_KER_PROT( float, s, copyv_zen_int )
COPYV_KER_PROT( double, d, copyv_zen_int )
//
SETV_KER_PROT(float, s, setv_zen_int)
SETV_KER_PROT(double, d, setv_zen_int)
// -- level-1f --
@@ -76,3 +96,106 @@ AXPYF_KER_PROT( double, d, axpyf_zen_int_8 )
DOTXF_KER_PROT( float, s, dotxf_zen_int_8 )
DOTXF_KER_PROT( double, d, dotxf_zen_int_8 )
// -- level-3 sup --------------------------------------------------------------
// Prototypes for the gemmsup kernels, one GEMMSUP_KER_PROT line per
// kernel: C type, type character, kernel name. The trailing MxN in each
// name is presumably the register block handled by that kernel, and an
// 'm' / 'n' suffix marks the variants labelled below as looping in the
// m or n dimension -- TODO confirm against the kernel sources.
// NOTE(review): the commit message says the AMD-contributed sup kernels
// are not yet used; these declarations only make them visible.
// sgemmsup_rv
//GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 )
// sgemmsup_r (?x1 shapes; these names carry _ref_ rather than _asm_)
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 )
// sgemmsup_rv (mkernel in m dim)
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m )
// sgemmsup_rv (mkernel in n dim)
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n )
// sgemmsup_rd
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2)
// sgemmsup_rd (names with an 'm' suffix)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m)
// sgemmsup_rd (names with an 'n' suffix)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n)
// cgemmsup_rv (scomplex)
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 )
// zgemmsup_rv (dcomplex)
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 )
// cgemmsup_rv (mkernel in n dim)
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 )
// zgemmsup_rv (mkernel in n dim)
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 )

View File

@@ -1,4 +0,0 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore

View File

@@ -0,0 +1,599 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "immintrin.h"
#include "blis.h"
/* Union giving two views of one 256-bit AVX register that holds eight
   single-precision values: `v` is the whole-register view used with
   the intrinsics and `f` is the per-element array view. The array
   member requests 64-byte alignment. */
typedef union
{
__m256 v;
float f[8] __attribute__((aligned(64)));
} v8sf_t;
/* Union giving two views of one 256-bit AVX register that holds four
   double-precision values: `v` is the whole-register view used with
   the intrinsics and `d` is the per-element array view. The array
   member requests 64-byte alignment. */
typedef union
{
__m256d v;
double d[4] __attribute__((aligned(64)));
} v4df_t;
/*
   bli_saxpyf_zen_int_5

   Single-precision fused axpy with a fusing factor of five:

     y := y + alpha * A * x

   where column j of A starts at a + j*lda and consecutive entries of a
   column are inca elements apart.

   Parameters:
     conja    Forwarded to the axpyv kernel on the fallback path; not
              referenced on the fused path.
     conjx    Passed to bli_scopycjs on the fallback path; not referenced
              on the fused path.
     m        Length of y and of each column of A.
     b_n      Number of columns of A (and entries of x).
     alpha    Address of the scalar multiplier.
     a        Matrix A, with element stride inca and column stride lda.
     x, incx  Vector x and its element stride.
     y, incy  Vector y (updated in place) and its element stride.
     cntx     Context; queried for the axpyv kernel on the fallback path
              when BLIS_CONFIG_ZEN2 is not defined, and forwarded to the
              axpyv kernel.

   If b_n differs from five the operation is carried out as b_n separate
   axpyv calls. Otherwise the five columns are processed together: with
   inca == incy == 1 the rows are handled 16 and then 8 at a time using
   256-bit FMA instructions, and any remaining rows (or every row, for
   other strides) are handled by a scalar loop.
*/
void bli_saxpyf_zen_int_5
     (
       conj_t           conja,
       conj_t           conjx,
       dim_t            m,
       dim_t            b_n,
       float*  restrict alpha,
       float*  restrict a, inc_t inca, inc_t lda,
       float*  restrict x, inc_t incx,
       float*  restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
    const dim_t fuse_fac       = 5;
    const dim_t n_elem_per_reg = 8;   // floats per 256-bit register

    // Nothing to do for an empty problem or a zero scaling factor.
    if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;

    // Fallback: when the column count is not the fusing factor, apply
    // one axpyv per column, y += (alpha * x[j]) * A(:,j).
    if ( b_n != fuse_fac )
    {
#ifdef BLIS_CONFIG_ZEN2
        // zen2 builds always use the unrolled zen axpyv kernel.
        saxpyv_ker_ft axpyv_fp = bli_saxpyv_zen_int10;
#else
        // Otherwise use whichever axpyv kernel the context provides.
        saxpyv_ker_ft axpyv_fp = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
#endif

        for ( dim_t j = 0; j < b_n; ++j )
        {
            float* a_col = a + j*lda;
            float* chi   = x + j*incx;
            float  alpha_chi;

            bli_scopycjs( conjx, *chi, alpha_chi );
            bli_sscals( *alpha, alpha_chi );

            axpyv_fp
            (
              conja,
              m,
              &alpha_chi,
              a_col, inca,
              y, incy,
              cntx
            );
        }

        return;
    }

    // From here on b_n is exactly the fusing factor.

    // One running pointer per column of A, plus one for y.
    float* restrict col0 = a + 0*lda;
    float* restrict col1 = a + 1*lda;
    float* restrict col2 = a + 2*lda;
    float* restrict col3 = a + 3*lda;
    float* restrict col4 = a + 4*lda;
    float* restrict yp   = y;

    // Fetch the five entries of x and fold alpha into each of them.
    float ax0 = *( x + 0*incx );
    float ax1 = *( x + 1*incx );
    float ax2 = *( x + 2*incx );
    float ax3 = *( x + 3*incx );
    float ax4 = *( x + 4*incx );

    bli_sscals( *alpha, ax0 );
    bli_sscals( *alpha, ax1 );
    bli_sscals( *alpha, ax2 );
    bli_sscals( *alpha, ax3 );
    bli_sscals( *alpha, ax4 );

    // Replicate each (alpha * x[j]) across a full vector register.
    const __m256 ax0v = _mm256_broadcast_ss( &ax0 );
    const __m256 ax1v = _mm256_broadcast_ss( &ax1 );
    const __m256 ax2v = _mm256_broadcast_ss( &ax2 );
    const __m256 ax3v = _mm256_broadcast_ss( &ax3 );
    const __m256 ax4v = _mm256_broadcast_ss( &ax4 );

    dim_t m_left = m;   // rows of y still to be updated

    // The vector loops require contiguous columns and a contiguous y.
    if ( inca == 1 && incy == 1 )
    {
        // Two registers (16 rows) of y per pass.
        for ( ; m_left >= 16; m_left -= 16 )
        {
            __m256 y_lo = _mm256_loadu_ps( yp );
            __m256 y_hi = _mm256_loadu_ps( yp + n_elem_per_reg );

            // Accumulate the columns in order 0..4.
            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col0 ),                  ax0v, y_lo );
            y_hi = _mm256_fmadd_ps( _mm256_loadu_ps( col0 + n_elem_per_reg ), ax0v, y_hi );

            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col1 ),                  ax1v, y_lo );
            y_hi = _mm256_fmadd_ps( _mm256_loadu_ps( col1 + n_elem_per_reg ), ax1v, y_hi );

            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col2 ),                  ax2v, y_lo );
            y_hi = _mm256_fmadd_ps( _mm256_loadu_ps( col2 + n_elem_per_reg ), ax2v, y_hi );

            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col3 ),                  ax3v, y_lo );
            y_hi = _mm256_fmadd_ps( _mm256_loadu_ps( col3 + n_elem_per_reg ), ax3v, y_hi );

            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col4 ),                  ax4v, y_lo );
            y_hi = _mm256_fmadd_ps( _mm256_loadu_ps( col4 + n_elem_per_reg ), ax4v, y_hi );

            _mm256_storeu_ps( yp,                  y_lo );
            _mm256_storeu_ps( yp + n_elem_per_reg, y_hi );

            yp   += 2*n_elem_per_reg;
            col0 += 2*n_elem_per_reg;
            col1 += 2*n_elem_per_reg;
            col2 += 2*n_elem_per_reg;
            col3 += 2*n_elem_per_reg;
            col4 += 2*n_elem_per_reg;
        }

        // Fewer than 16 rows remain, so at most one 8-row block is left.
        if ( m_left >= 8 )
        {
            __m256 y_lo = _mm256_loadu_ps( yp );

            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col0 ), ax0v, y_lo );
            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col1 ), ax1v, y_lo );
            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col2 ), ax2v, y_lo );
            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col3 ), ax3v, y_lo );
            y_lo = _mm256_fmadd_ps( _mm256_loadu_ps( col4 ), ax4v, y_lo );

            _mm256_storeu_ps( yp, y_lo );

            yp   += n_elem_per_reg;
            col0 += n_elem_per_reg;
            col1 += n_elem_per_reg;
            col2 += n_elem_per_reg;
            col3 += n_elem_per_reg;
            col4 += n_elem_per_reg;

            m_left -= 8;
        }
    }

    // Scalar loop: finishes the leftover rows after the vector loops
    // (where inca and incy are both 1) and handles every row when the
    // strides are not both 1. The running sum is carried in a double and
    // narrowed back to float on the store; each product is formed from
    // two floats.
    for ( ; m_left > 0; --m_left )
    {
        double acc = *yp;

        acc += ax0 * (*col0);
        acc += ax1 * (*col1);
        acc += ax2 * (*col2);
        acc += ax3 * (*col3);
        acc += ax4 * (*col4);

        *yp = acc;

        col0 += inca;
        col1 += inca;
        col2 += inca;
        col3 += inca;
        col4 += inca;
        yp   += incy;
    }
}
// -----------------------------------------------------------------------------
// Double-precision fused axpy ("axpyf") kernel with a fusing factor of 5.
//
// When b_n == 5 the kernel computes, for i = 0..m-1,
//
//   y[i*incy] += sum_{j=0..4} ( alpha * x[j*incx] ) * a[i*inca + j*lda]
//
// using 256-bit FMA instructions when inca == 1 and incy == 1, and a scalar
// loop otherwise. When b_n != 5 the same update is carried out as b_n
// separate calls to a double-precision axpyv kernel.
//
// Parameters:
//   conja  - forwarded to the axpyv kernel in the b_n != 5 fallback; it is
//            not referenced in the fused (b_n == 5) code path.
//   conjx  - passed to bli_dcopycjs() in the b_n != 5 fallback; it is not
//            referenced in the fused (b_n == 5) code path.
//   m      - number of elements of y (and of each column of a) to process.
//   b_n    - number of columns of a / elements of x to fuse.
//   alpha  - pointer to the scalar applied to every element of x.
//   a      - base address of the matrix; column j starts at a + j*lda and
//            consecutive elements of a column are inca apart.
//   x      - input vector with element stride incx.
//   y      - vector updated in place, with element stride incy.
//   cntx   - context; used to look up the axpyv kernel in non-ZEN2 builds and
//            forwarded to the axpyv kernel in the fallback path.
void bli_daxpyf_zen_int_5
(
conj_t conja,
conj_t conjx,
dim_t m,
dim_t b_n,
double* restrict alpha,
double* restrict a, inc_t inca, inc_t lda,
double* restrict x, inc_t incx,
double* restrict y, inc_t incy,
cntx_t* restrict cntx
)
{
// fuse_fac: number of columns handled by the fused code path.
// n_elem_per_reg: doubles per 256-bit register.
// n_iter_unroll: registers of y processed per iteration of the main loop.
const dim_t fuse_fac = 5;
const dim_t n_elem_per_reg = 4;
const dim_t n_iter_unroll = 2;
dim_t i;
// Cursors into the five columns of a and into y.
double* restrict a0;
double* restrict a1;
double* restrict a2;
double* restrict a3;
double* restrict a4;
double* restrict y0;
// chi?v: broadcast copies of the scaled x elements.
// a0?v / a1?v: first / second register loaded from column ?.
// y0v / y1v: first / second register of y.
v4df_t chi0v, chi1v, chi2v, chi3v;
v4df_t chi4v;
v4df_t a00v, a01v, a02v, a03v;
v4df_t a04v;
v4df_t a10v, a11v, a12v, a13v;
v4df_t a14v;
v4df_t y0v, y1v;
// Scalar copies of the x elements (scaled by alpha below).
double chi0, chi1, chi2, chi3;
double chi4;
// If either dimension is zero, or if alpha is zero, return early.
if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
// If b_n is not equal to the fusing factor, then perform the entire
// operation as a loop over axpyv.
if ( b_n != fuse_fac )
{
#ifdef BLIS_CONFIG_ZEN2
// ZEN2 builds call the zen axpyv kernel directly instead of querying the
// context for it.
for ( i = 0; i < b_n; ++i )
{
// Note: a1 and chi1 declared here shadow the function-scope variables
// of the same names for the duration of this loop body.
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
// alpha_chi1 = alpha * chi1 (chi1 copied subject to conjx).
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
bli_daxpyv_zen_int10
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#else
// Other builds use whichever double-precision axpyv kernel is registered
// in the context.
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
for ( i = 0; i < b_n; ++i )
{
// Note: a1 and chi1 declared here shadow the function-scope variables
// of the same names for the duration of this loop body.
double* a1 = a + (0 )*inca + (i )*lda;
double* chi1 = x + (i )*incx;
double* y1 = y + (0 )*incy;
double alpha_chi1;
// alpha_chi1 = alpha * chi1 (chi1 copied subject to conjx).
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
bli_dscals( *alpha, alpha_chi1 );
f
(
conja,
m,
&alpha_chi1,
a1, inca,
y1, incy,
cntx
);
}
#endif
return;
}
// At this point, we know that b_n is exactly equal to the fusing factor.
// Point a0..a4 at the first element of each of the five columns.
a0 = a + 0*lda;
a1 = a + 1*lda;
a2 = a + 2*lda;
a3 = a + 3*lda;
a4 = a + 4*lda;
y0 = y;
chi0 = *( x + 0*incx );
chi1 = *( x + 1*incx );
chi2 = *( x + 2*incx );
chi3 = *( x + 3*incx );
chi4 = *( x + 4*incx );
// Scale each chi scalar by alpha.
bli_dscals( *alpha, chi0 );
bli_dscals( *alpha, chi1 );
bli_dscals( *alpha, chi2 );
bli_dscals( *alpha, chi3 );
bli_dscals( *alpha, chi4 );
// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
chi0v.v = _mm256_broadcast_sd( &chi0 );
chi1v.v = _mm256_broadcast_sd( &chi1 );
chi2v.v = _mm256_broadcast_sd( &chi2 );
chi3v.v = _mm256_broadcast_sd( &chi3 );
chi4v.v = _mm256_broadcast_sd( &chi4 );
// If there are vectorized iterations, perform them with vector
// instructions.
if ( inca == 1 && incy == 1 )
{
// Main loop: 8 elements of y per iteration
// (n_iter_unroll registers of n_elem_per_reg doubles each).
for ( i = 0; (i + 7) < m; i += 8 )
{
// Load the input values.
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
// perform : y += (alpha*chi_j) * a_j for each of the five columns,
// accumulating column by column into y0v and y1v.
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
// Store the output.
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
// Advance all cursors past the 8 elements just processed.
y0 += n_iter_unroll * n_elem_per_reg;
a0 += n_iter_unroll * n_elem_per_reg;
a1 += n_iter_unroll * n_elem_per_reg;
a2 += n_iter_unroll * n_elem_per_reg;
a3 += n_iter_unroll * n_elem_per_reg;
a4 += n_iter_unroll * n_elem_per_reg;
}
// Remainder loop: one register (4 elements) of y per iteration.
for( ; (i + 3) < m; i += 4 )
{
// Load the input values.
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
// perform : y += (alpha*chi_j) * a_j for each of the five columns.
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
// Store the output.
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
y0 += n_elem_per_reg;
a0 += n_elem_per_reg;
a1 += n_elem_per_reg;
a2 += n_elem_per_reg;
a3 += n_elem_per_reg;
a4 += n_elem_per_reg;
}
// If there are leftover iterations, perform them with scalar code.
for ( ; (i + 0) < m ; ++i )
{
double y0c = *y0;
const double a0c = *a0;
const double a1c = *a1;
const double a2c = *a2;
const double a3c = *a3;
const double a4c = *a4;
// chi0..chi4 were already scaled by alpha above.
y0c += chi0 * a0c;
y0c += chi1 * a1c;
y0c += chi2 * a2c;
y0c += chi3 * a3c;
y0c += chi4 * a4c;
*y0 = y0c;
a0 += 1;
a1 += 1;
a2 += 1;
a3 += 1;
a4 += 1;
y0 += 1;
}
}
else
{
// General-stride case: scalar loop that steps the column cursors by inca
// and the y cursor by incy.
for ( i = 0; (i + 0) < m ; ++i )
{
double y0c = *y0;
const double a0c = *a0;
const double a1c = *a1;
const double a2c = *a2;
const double a3c = *a3;
const double a4c = *a4;
// chi0..chi4 were already scaled by alpha above.
y0c += chi0 * a0c;
y0c += chi1 * a1c;
y0c += chi2 * a2c;
y0c += chi3 * a3c;
y0c += chi4 * a4c;
*y0 = y0c;
a0 += inca;
a1 += inca;
a2 += inca;
a3 += inca;
a4 += inca;
y0 += incy;
}
}
}

View File

@@ -0,0 +1,40 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- level-1f --
// Invocations of the AXPYF_KER_PROT macro (defined elsewhere) for the float
// ('s') and double ('d') variants of the axpyf_zen_int_5 kernel.
// NOTE(review): presumably these expand to the prototypes of
// bli_saxpyf_zen_int_5 and bli_daxpyf_zen_int_5 -- confirm against the macro
// definition.
AXPYF_KER_PROT( float, s, axpyf_zen_int_5 )
AXPYF_KER_PROT( double, d, axpyf_zen_int_5 )

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -463,7 +463,8 @@ void GENBARNAME(cntx_init)
// operation.
// Set the gemm slot to the default gemm sup handler.
vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref;
vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref;
vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref;
// -- Set level-3 small/unpacked micro-kernels and preferences -------------

View File

@@ -5,7 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
# Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -97,16 +97,11 @@ endif
BLAS_LIB_PATH := $(HOME)/flame/lib
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
MKL_LIB_PATH := ${MKLROOT}/lib/intel64
#ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
# OpenBLAS
OPENBLAS_LIB := $(BLAS_LIB_PATH)/libopenblas.a
# ATLAS
ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \
$(BLAS_LIB_PATH)/libatlas.a
# MKL
MKL_LIB := -L$(MKL_LIB_PATH) \
-lmkl_intel_lp64 \
@@ -114,18 +109,6 @@ MKL_LIB := -L$(MKL_LIB_PATH) \
-lmkl_sequential \
-lpthread -lm -ldl
# ESSL
# Note: ESSL is named differently for SMP and/or BG
#ESSL_TYPE := # This is the 32b library on POWER
#ESSL_TYPE := 6464 # This is the 64b library on POWER
#ESSL_TYPE := bg # This is the 32b single-threaded library on Blue Gene
#ESSL_TYPE := smpbg # This is the 32b multi-threaded library on Blue Gene
#ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a
# Accelerate
MAC_LIB := -framework Accelerate
#
# --- General build definitions ------------------------------------------------
@@ -159,121 +142,32 @@ CFLAGS += -I$(TEST_SRC_PATH)
# --- Targets/rules ------------------------------------------------------------
#
# Complete list of possible targets when defining 'all':
#
# blis openblas atlas mkl mac essl
#
#all: blis openblas atlas mkl
# Operations for which a test driver executable is built.
TEST_OPS := dotv axpyv \
            gemv ger hemv her her2 trmv trsv \
            gemm hemm herk her2k trmm trsm

# gemmt is opt-in (BUILD_GEMMT=yes) because some libraries might not
# implement it.
ifeq ($(BUILD_GEMMT),yes)
TEST_OPS += gemmt
endif

# test-bins: given an implementation name as $(1), expand to the list
# test_<op>_<impl>.x with one entry per operation in TEST_OPS.
test-bins = $(addsuffix _$(1).x,$(addprefix test_,$(TEST_OPS)))

# Executable lists, one per implementation.
TEST_BINS_BLIS     := $(call test-bins,blis)
TEST_BINS_OPENBLAS := $(call test-bins,openblas)
TEST_BINS_MKL      := $(call test-bins,mkl)
all: blis openblas mkl
blis: check-env \
test_dotv_blis.x \
test_axpyv_blis.x \
test_gemv_blis.x \
test_ger_blis.x \
test_hemv_blis.x \
test_her_blis.x \
test_her2_blis.x \
test_trmv_blis.x \
test_trsv_blis.x \
\
test_gemm_blis.x \
test_hemm_blis.x \
test_herk_blis.x \
test_her2k_blis.x \
test_trmm_blis.x \
test_trsm_blis.x
blis: check-env $(TEST_BINS_BLIS)
openblas: check-env \
test_dotv_openblas.x \
test_axpyv_openblas.x \
test_gemv_openblas.x \
test_ger_openblas.x \
test_hemv_openblas.x \
test_her_openblas.x \
test_her2_openblas.x \
test_trmv_openblas.x \
test_trsv_openblas.x \
\
test_gemm_openblas.x \
test_hemm_openblas.x \
test_herk_openblas.x \
test_her2k_openblas.x \
test_trmm_openblas.x \
test_trsm_openblas.x
atlas: check-env \
test_dotv_atlas.x \
test_axpyv_atlas.x \
test_gemv_atlas.x \
test_ger_atlas.x \
test_hemv_atlas.x \
test_her_atlas.x \
test_her2_atlas.x \
test_trmv_atlas.x \
test_trsv_atlas.x \
\
test_gemm_atlas.x \
test_hemm_atlas.x \
test_herk_atlas.x \
test_her2k_atlas.x \
test_trmm_atlas.x \
test_trsm_atlas.x
mkl: check-env \
test_dotv_mkl.x \
test_axpyv_mkl.x \
test_gemv_mkl.x \
test_ger_mkl.x \
test_hemv_mkl.x \
test_her_mkl.x \
test_her2_mkl.x \
test_trmv_mkl.x \
test_trsv_mkl.x \
\
test_gemm_mkl.x \
test_hemm_mkl.x \
test_herk_mkl.x \
test_her2k_mkl.x \
test_trmm_mkl.x \
test_trsm_mkl.x
essl: check-env \
test_dotv_essl.x \
test_axpyv_essl.x \
test_gemv_essl.x \
test_ger_essl.x \
test_hemv_essl.x \
test_her_essl.x \
test_her2_essl.x \
test_trmv_essl.x \
test_trsv_essl.x \
\
test_gemm_essl.x \
test_hemm_essl.x \
test_herk_essl.x \
test_her2k_essl.x \
test_trmm_essl.x \
test_trsm_essl.x
mac: check-env \
test_dotv_mac.x \
test_axpyv_mac.x \
test_gemv_mac.x \
test_ger_mac.x \
test_hemv_mac.x \
test_her_mac.x \
test_her2_mac.x \
test_trmv_mac.x \
test_trsv_mac.x \
\
test_gemm_mac.x \
test_hemm_mac.x \
test_herk_mac.x \
test_her2k_mac.x \
test_trmm_mac.x \
test_trsm_mac.x
openblas: check-env $(TEST_BINS_OPENBLAS)
mkl: check-env $(TEST_BINS_MKL)
# --Object file rules --
@@ -281,21 +175,13 @@ mac: check-env \
$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
$(CC) $(CFLAGS) -c $< -o $@
test_%_openblas.o: test_%.c
$(CC) $(CFLAGS) -DBLAS=\"openblas\" -c $< -o $@
test_%_atlas.o: test_%.c
$(CC) $(CFLAGS) -DBLAS=\"atlas\" -c $< -o $@
test_%_mkl.o: test_%.c
$(CC) $(CFLAGS) -DBLAS=\"mkl\" -c $< -o $@
test_%_essl.o: test_%.c
$(CC) $(CFLAGS) -DBLAS=\"essl\" -c $< -o $@
test_%_mac.o: test_%.c
$(CC) $(CFLAGS) -DBLAS=\"mac\" -c $< -o $@
test_%_blis.o: test_%.c
$(CC) $(CFLAGS) -DBLIS -c $< -o $@
@@ -310,18 +196,9 @@ test_%_blis.o: test_%.c
test_%_openblas.x: test_%_openblas.o $(LIBBLIS_LINK)
$(LINKER) $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_atlas.x: test_%_atlas.o $(LIBBLIS_LINK)
$(LINKER) $< $(ATLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_mkl.x: test_%_mkl.o $(LIBBLIS_LINK)
$(LINKER) $< $(MKL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_essl.x: test_%_essl.o $(LIBBLIS_LINK)
$(LINKER) $< $(ESSL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_mac.x: test_%_mac.o $(LIBBLIS_LINK)
$(LINKER) $< $(MAC_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK)
$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@

218
test/other/test_copyv.c Normal file
View File

@@ -0,0 +1,218 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
//#define BLIS_ACCURACY_TEST
#ifdef BLIS_ACCURACY_TEST
// Compare two strided single-precision vectors for exact equality.
//
//   n     - number of elements to compare
//   x, y  - first elements of the two vectors
//   incx  - stride (in elements) between consecutive entries of x
//   incy  - stride (in elements) between consecutive entries of y
//
// Returns TRUE when all n pairs are equal. On the first mismatch the two
// values and their index are printed and FALSE is returned.
bool_t scompare_result(int n, float *x, int incx, float *y, int incy) {
    const float *xp = x;
    const float *yp = y;
    int idx = 0;

    while (idx < n) {
        const float xv = *xp;
        const float yv = *yp;

        if (xv != yv) {
            printf("%4f != %4f at location %d\n", xv, yv, idx);
            return FALSE;
        }

        xp += incx;
        yp += incy;
        ++idx;
    }

    return TRUE;
}
// Compare two strided double-precision vectors for exact equality.
//
//   n     - number of elements to compare
//   x, y  - first elements of the two vectors
//   incx  - stride (in elements) between consecutive entries of x
//   incy  - stride (in elements) between consecutive entries of y
//
// Returns TRUE when all n pairs are equal. On the first mismatch the two
// values and their index are printed and FALSE is returned.
bool_t dcompare_result(int n, double *x, int incx, double *y, int incy) {
    const double *xp = x;
    const double *yp = y;
    int idx;

    for (idx = 0; idx < n; ++idx, xp += incx, yp += incy) {
        if (*xp == *yp)
            continue;

        printf("%4f != %4f at location %d\n", *xp, *yp, idx);
        return FALSE;
    }

    return TRUE;
}
#endif
int main(int argc, char** argv)
{
obj_t x, y;
dim_t n;
dim_t p;
dim_t p_begin, p_end, p_inc;
int n_input, sizeof_dt;
int r, n_repeats;
num_t dt;
double dtime;
double dtime_save;
double Gbps;
//bli_init();
n_repeats = 100000;
#ifndef PRINT
p_begin = 200;
p_end = 100000;
p_inc = 200;
n_input = -1;
#else
p_begin = 16;
p_end = 16;
p_inc = 1;
n_input = 16;
#endif
#if 1
// dt = BLIS_FLOAT;
dt = BLIS_DOUBLE;
#else
//dt = BLIS_SCOMPLEX;
dt = BLIS_DCOMPLEX;
#endif
if (dt == BLIS_DOUBLE)
sizeof_dt = sizeof(double);
else if (dt == BLIS_FLOAT)
sizeof_dt = sizeof(float);
printf("executable\t n\t GBs per sec\n");
for (p = p_begin; p <= p_end; p += p_inc)
{
if (n_input < 0) n = p * (dim_t)abs(n_input);
else n = (dim_t)n_input;
bli_obj_create(dt, n, 1, 0, 0, &x);
bli_obj_create(dt, n, 1, 0, 0, &y);
bli_randm(&x);
dtime_save = DBL_MAX;
for (r = 0; r < n_repeats; ++r)
{
dtime = bli_clock();
#ifdef BLIS
bli_copyv(&x,
&y
);
#else
if (bli_is_float(dt))
{
f77_int nn = bli_obj_length(&x);
f77_int incx = bli_obj_vector_inc(&x);
float* xp = bli_obj_buffer(&x);
f77_int incy = bli_obj_vector_inc(&y);
float* yp = bli_obj_buffer(&y);
scopy_(&nn,
xp, &incx,
yp, &incy);
}
else if (bli_is_double(dt))
{
f77_int nn = bli_obj_length(&x);
f77_int incx = bli_obj_vector_inc(&x);
double* xp = bli_obj_buffer(&x);
f77_int incy = bli_obj_vector_inc(&y);
double* yp = bli_obj_buffer(&y);
dcopy_(&nn,
xp, &incx,
yp, &incy
);
}
#endif
dtime_save = bli_clock_min_diff(dtime_save, dtime);
#ifdef BLIS_ACCURACY_TEST
if (dt == BLIS_FLOAT) {
int nn = bli_obj_length(&x);
int incx = bli_obj_vector_inc(&x);
float* xp = bli_obj_buffer(&x);
int incy = bli_obj_vector_inc(&y);
float* yp = bli_obj_buffer(&y);
if (scompare_result(nn, xp, incx, yp, incy))
printf("Copy Successful\n");
else
printf("ALERT!!! Copy Failed\n");
}
if (dt == BLIS_DOUBLE) {
int nn = bli_obj_length(&x);
int incx = bli_obj_vector_inc(&x);
double* xp = bli_obj_buffer(&x);
int incy = bli_obj_vector_inc(&y);
double* yp = bli_obj_buffer(&y);
if (dcompare_result(nn, xp, incx, yp, incy))
printf("Copy Successful\n");
else
printf("ALERT!!! Copy Failed\n");
}
#endif
}
// Size of the vectors are incrementd by 1000, to test wide range of inputs.
if (p >= 1000)
p_inc = 1000;
if (p >= 10000)
p_inc = 10000;
Gbps = (n * sizeof_dt) / (dtime_save * 1.0e9);
#ifdef BLIS
printf("data_copyv_blis\t");
#else
printf("data_copyv_%s\t", BLAS);
#endif
printf("%4lu\t %7.2f\n",
(unsigned long)n, Gbps);
bli_obj_free(&x);
bli_obj_free(&y);
}
// bli_finalize();
return 0;
}

392
test/other/test_gemm.c Normal file
View File

@@ -0,0 +1,392 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
//#define FILE_IN_OUT
//#define PRINT
//#define MATRIX_INITIALISATION
// Benchmark driver for gemm.
//
// Times n_repeats runs of C := beta*C + alpha*A*B for a sequence of problem
// sizes and prints the rate derived from the fastest run. The multiplication
// is done with bli_gemm() when compiled with BLIS defined, otherwise with the
// Fortran-77 BLAS routine (sgemm_/dgemm_/cgemm_/zgemm_) matching dt.
//
// Compile-time switches:
//   FILE_IN_OUT           - read "m k n cs_a cs_b cs_c" rows from the file
//                           named by argv[1] and also write results to the
//                           file named by argv[2]; without it, problem sizes
//                           come from the p_begin/p_end/p_inc sweep.
//   PRINT                 - use one small fixed problem, print the operands,
//                           and exit after the first multiplication.
//   MATRIX_INITIALISATION - (FILE_IN_OUT only) randomize a, b and c.
//
// Returns 0 on normal completion; calls exit(1) on usage/file errors and in
// PRINT mode.
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, n, k;
dim_t p;
dim_t p_begin, p_end, p_inc;
int m_input, n_input, k_input;
num_t dt;
int r, n_repeats;
trans_t transa;
trans_t transb;
f77_char f77_transa;
f77_char f77_transb;
double dtime;
double dtime_save;
double gflops;
#ifdef FILE_IN_OUT
FILE* fin = NULL;
FILE* fout = NULL;
// Tag printed in the last output column: 'S' or 'N' (set below).
char gemm = 's';
#endif
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
n_repeats = 3;
// A negative ?_input means "scale the sweep variable p by |?_input|";
// a non-negative value fixes that dimension.
#ifndef PRINT
p_begin = 200;
p_end = 2000;
p_inc = 200;
m_input = -1;
n_input = -1;
k_input = -1;
#else
p_begin = 16;
p_end = 16;
p_inc = 1;
m_input = 5;
k_input = 6;
n_input = 4;
#endif
#if 1
//dt = BLIS_FLOAT;
dt = BLIS_DOUBLE;
#else
//dt = BLIS_SCOMPLEX;
dt = BLIS_DCOMPLEX;
#endif
transa = BLIS_NO_TRANSPOSE;
transb = BLIS_NO_TRANSPOSE;
// Obtain the transposition flags in the form passed to the ?gemm_ calls below.
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
#ifdef FILE_IN_OUT
if (argc < 3)
{
printf("Usage: ./test_gemm_XX.x input.csv output.csv\n");
exit(1);
}
fin = fopen(argv[1], "r");
if (fin == NULL)
{
printf("Error opening the file %s\n", argv[1]);
exit(1);
}
fout = fopen(argv[2], "w");
if (fout == NULL)
{
printf("Error opening output file %s\n", argv[2]);
exit(1);
}
fprintf(fout, "m\t k\t n\t cs_a\t cs_b\t cs_c\t gflops\t GEMM_Algo\n");
printf("~~~~~~~~~~_BLAS\t m\t k\t n\t cs_a\t cs_b\t cs_c \t gflops\t GEMM_Algo\n");
inc_t cs_a;
inc_t cs_b;
inc_t cs_c;
// This while loop and the for loop in the #else branch below share a single
// closing brace (just before the commented-out bli_finalize() call).
// NOTE(review): "%lld" assumes dim_t and inc_t have the width of long long --
// confirm against their definitions.
while (fscanf(fin, "%lld %lld %lld %lld %lld %lld\n", &m, &k, &n, &cs_a, &cs_b, &cs_c) == 6)
{
if ((m > cs_a) || (k > cs_b) || (m > cs_c)) continue; // leading dimension should be greater than number of rows
bli_obj_create( dt, 1, 1, 0, 0, &alpha);
bli_obj_create( dt, 1, 1, 0, 0, &beta );
// Matrices are created with row stride 1 and the column strides read from
// the input file.
bli_obj_create( dt, m, k, 1, cs_a, &a );
bli_obj_create( dt, k, n, 1, cs_b, &b );
bli_obj_create( dt, m, n, 1, cs_c, &c );
bli_obj_create( dt, m, n, 1, cs_c, &c_save );
#ifdef MATRIX_INITIALISATION
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
#endif
bli_obj_set_conjtrans( transa, &a);
bli_obj_set_conjtrans( transb, &b);
//bli_setsc( 0.0, -1, &alpha );
//bli_setsc( 0.0, 1, &beta );
bli_setsc( -1, 0.0, &alpha );
bli_setsc( 1, 0.0, &beta );
#else
for ( p = p_begin; p <= p_end; p += p_inc )
{
if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
else n = ( dim_t ) n_input;
if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
bli_obj_create( dt, m, k, 0, 0, &a );
bli_obj_create( dt, k, n, 0, 0, &b );
bli_obj_create( dt, m, n, 0, 0, &c );
bli_obj_create( dt, m, n, 0, 0, &c_save );
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
bli_obj_set_conjtrans( transa, &a );
bli_obj_set_conjtrans( transb, &b );
bli_setsc( (0.9/1.0), 0.2, &alpha );
bli_setsc( -(1.1/1.0), 0.3, &beta );
#endif
// Keep a pristine copy of c so that every repetition starts from the same
// input.
bli_copym( &c, &c_save );
// Track the fastest of the n_repeats timings.
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
// Note: in PRINT mode the operand printing below falls inside the timed
// region.
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
#ifdef BLIS
bli_gemm( &alpha,
&a,
&b,
&beta,
&c );
#else
// Reference BLAS path: dimensions, column strides and buffers are extracted
// from the objects and passed to the ?gemm_ routine matching dt.
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
sgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
dgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
zgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
// In PRINT mode the program terminates here, after the first multiplication,
// so no timing result is reported.
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
// 2*m*k*n operations per multiplication, scaled by 4 for complex datatypes.
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_gemm_blis" );
#else
printf( "data_gemm_%s", BLAS );
#endif
#ifdef FILE_IN_OUT
// Tag the row 'S' or 'N' by comparing the problem size against the
// BLIS_SMALL_* threshold macros; this is computed here from m, n and k only,
// independently of what the gemm call above actually did.
if ( bli_is_double( dt ) ) {
if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES/4)) || ((m < (BLIS_SMALL_M_RECT_MATRIX_THRES/2) ) && (k < (BLIS_SMALL_K_RECT_MATRIX_THRES/2) )))
gemm = 'S'; // small gemm
else gemm = 'N'; // Normal blis gemm
}
else if (bli_is_float( dt )) {
if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES)) || ((m < BLIS_SMALL_M_RECT_MATRIX_THRES) && (k < BLIS_SMALL_K_RECT_MATRIX_THRES)))
gemm = 'S'; // small gemm
else gemm = 'N'; // normal blis gemm
}
printf("%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
( unsigned long )m,
( unsigned long )k,
( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops, gemm );
fprintf(fout, "%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
( unsigned long )m,
( unsigned long )k,
( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops, gemm );
// Flush after every row so that partial results are written out as the run
// progresses.
fflush(fout);
#else
// Sweep mode: emit one row of the form "( idx, 1:4 ) = [ m k n gflops ];".
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )m,
( unsigned long )k,
( unsigned long )n, gflops );
#endif
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
#ifdef FILE_IN_OUT
fclose(fin);
fclose(fout);
#endif
return 0;
}

154
test/other/test_scalv.c Normal file
View File

@@ -0,0 +1,154 @@
/*
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
//#define PRINT
// Benchmark driver for vector scaling.
//
// For a sequence of vector lengths n, the driver times n_repeats scalings of
// the vector a by 2.0 -- with bli_scalm() when compiled with BLIS defined,
// otherwise with the Fortran-77 BLAS routines sscal_/dscal_ -- and prints,
// for each n, the rate derived from the fastest repetition (counting one
// operation per element).
//
// argc and argv are not used. Always returns 0.
int main(int argc, char** argv)
{
    obj_t  a, alpha;
    dim_t  n, p;
    dim_t  p_begin, p_end, p_inc;
    int    n_input;
    num_t  dt;
    int    r, n_repeats;
    double dtime;
    double dtime_save;
    double gflops;

    //bli_init();
    //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );

    n_repeats = 100000;

#ifndef PRINT
    p_begin = 200;
    p_end   = 100000;
    p_inc   = 200;
    n_input = -1;
#else
    p_begin = 16;
    p_end   = 16;
    p_inc   = 1;
    n_input = 4;
#endif

#if 1
    dt = BLIS_FLOAT;
    //dt = BLIS_DOUBLE;
#else
    //dt = BLIS_SCOMPLEX;
    dt = BLIS_DCOMPLEX;
#endif

#ifdef BLIS
    printf( "data_scalv_blis\t n\t gflops\n" );
#else
    printf( "data_scalv_%s\t n\t gflops\n", BLAS );
#endif

    for (p = p_begin; p <= p_end; p += p_inc)
    {
        // A negative n_input means "scale the problem size p by |n_input|";
        // a non-negative n_input fixes the vector length.
        if (n_input < 0) n = p * (dim_t)abs(n_input);
        else             n = (dim_t)n_input;

        bli_obj_create(dt, 1, 1, 0, 0, &alpha);

        // Create the vector as n x 1 so that bli_obj_length(), which the BLAS
        // path below passes as the vector length, is n. With a 1 x n object
        // the length is 1, so only a single element would be scaled while
        // gflops is still computed from n.
        bli_obj_create(dt, n, 1, 0, 0, &a);

        bli_randm(&a);
        bli_setsc((2.0), 0.0, &alpha);

        // Track the fastest of the n_repeats timings.
        dtime_save = DBL_MAX;

        for (r = 0; r < n_repeats; ++r)
        {
            dtime = bli_clock();

#ifdef BLIS
            bli_scalm(&BLIS_TWO, &a);
#else
            // Only the real datatypes are dispatched to a BLAS routine here.
            if ( bli_is_float( dt ) )
            {
                f77_int nn     = bli_obj_length( &a );
                f77_int inca   = bli_obj_vector_inc( &a );
                float*  scalar = bli_obj_buffer( &alpha );
                float*  ap     = bli_obj_buffer( &a );

                sscal_( &nn, scalar,
                        ap, &inca );
            }
            else if ( bli_is_double( dt ) )
            {
                f77_int nn     = bli_obj_length( &a );
                f77_int inca   = bli_obj_vector_inc( &a );
                double* scalar = bli_obj_buffer( &alpha );
                double* ap     = bli_obj_buffer( &a );

                dscal_( &nn, scalar,
                        ap, &inca );
            }
#endif

            dtime_save = bli_clock_min_diff(dtime_save, dtime);
        }

        // The step between vector sizes grows to 1000 when p reaches 1000 and
        // to 10000 when p reaches 10000, to cover a wide range of inputs.
        if (p == 10000)
            p_inc = 10000;
        if (p == 1000)
            p_inc = 1000;

        // One operation per element.
        gflops = n / (dtime_save * 1.0e9);

#ifdef BLIS
        printf( "data_scalv_blis\t" );
#else
        printf( "data_scalv_%s\t", BLAS );
#endif
        printf(" %4lu\t %7.2f \n",
               (unsigned long)n, gflops);

        bli_obj_free(&alpha);
        bli_obj_free(&a);
    }

    return 0;
}

185
test/other/test_swapv.c Normal file
View File

@@ -0,0 +1,185 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// n x incx y incy
//void dswap_( int*, double*, int*, double*, int* );
//#define PRINT
/*
 * Benchmark driver for the swapv operation.
 *
 * For each problem size p in [p_begin, p_end] (visited from largest to
 * smallest in steps of p_inc) the driver creates two random vectors of
 * length n, swaps them n_repeats times, keeps the fastest run, and prints
 * one result row of the form
 *
 *     data_swapv_<impl>( <row>, 1:2 ) = [ <n> <rate> ];
 *
 * where <rate> is n / (best_time * 1e9).
 *
 * Compile-time switches:
 *   BLIS  - time bli_swapv(); otherwise time the Fortran-77 sswap_/dswap_
 *           routines (BLAS must then be defined to a string naming the
 *           library -- presumably supplied by the build system).
 *   PRINT - run a single tiny problem, print the operands before and after
 *           the swap, and exit.
 *
 * argc/argv are unused. Returns 0 on normal completion.
 */
int main( int argc, char** argv )
{
    obj_t  x, y;
    dim_t  n;
    dim_t  p;
    dim_t  p_begin, p_end, p_inc;
    int    n_input;
    int    r, n_repeats;
    num_t  dt;
    double dtime;
    double dtime_save;
    double gflops;

    bli_init();

    n_repeats = 3;

#ifndef PRINT
    p_begin = 40;
    p_end   = 8000;
    p_inc   = 40;
    n_input = -1;
#else
    p_begin = 16;
    p_end   = 16;
    p_inc   = 1;
    n_input = -1;
#endif

#if 1
    dt = BLIS_FLOAT;
    //dt = BLIS_DOUBLE;
#else
    //dt = BLIS_SCOMPLEX;
    dt = BLIS_DCOMPLEX;
#endif

    // Begin with initializing the last entry to zero so that
    // matlab allocates space for the entire array once up-front.
    // The empty loop only advances p to the largest size that will be run.
    for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
    printf( "data_swapv_blis" );
#else
    printf( "data_swapv_%s", BLAS );
#endif
    printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
            ( unsigned long )(p - p_begin)/p_inc + 1,
            ( unsigned long )0, 0.0 );

    //for ( p = p_begin; p <= p_end; p += p_inc )
    for ( p = p_end; p_begin <= p; p -= p_inc )
    {
        // A negative n_input is a multiplier applied to p; a non-negative
        // value fixes the vector length outright.
        if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
        else               n =     ( dim_t )    n_input;

        bli_obj_create( dt, n, 1, 0, 0, &x );
        bli_obj_create( dt, n, 1, 0, 0, &y );

        bli_randm( &x );
        bli_randm( &y );

        // Seed the running minimum with the largest representable time so
        // that the first valid measurement always replaces it. This matches
        // the other drivers, which use DBL_MAX with the same includes
        // (presumably made visible through blis.h).
        dtime_save = DBL_MAX;

        for ( r = 0; r < n_repeats; ++r )
        {
            dtime = bli_clock();

#ifdef PRINT
            bli_printm( "x", &x, "%4.1f", "" );
            bli_printm( "y", &y, "%4.1f", "" );
#endif

#ifdef BLIS
            bli_swapv( &x,
                       &y
                     );
#else
            {
                // Arguments common to both precisions of the BLAS call.
                f77_int nn   = bli_obj_length( &x );
                f77_int incx = bli_obj_vector_inc( &x );
                f77_int incy = bli_obj_vector_inc( &y );

                // Only the real datatypes are dispatched; for any other dt
                // nothing is executed in this branch.
                if ( bli_is_float( dt ) )
                {
                    float* xp = bli_obj_buffer( &x );
                    float* yp = bli_obj_buffer( &y );

                    sswap_( &nn,
                            xp, &incx,
                            yp, &incy );
                }
                else if ( bli_is_double( dt ) )
                {
                    double* xp = bli_obj_buffer( &x );
                    double* yp = bli_obj_buffer( &y );

                    dswap_( &nn,
                            xp, &incx,
                            yp, &incy );
                }
            }
#endif

#ifdef PRINT
            bli_printm( "X after", &x, "%4.1f", "" );
            bli_printm( "Y after", &y, "%4.1f", "" );
            exit(1);
#endif

            // Keep the fastest of the repeats.
            dtime_save = bli_clock_min_diff( dtime_save, dtime );
        }

        // Rate reported as n elements per best time, scaled by 1e9.
        gflops = ( n ) / ( dtime_save * 1.0e9 );

#ifdef BLIS
        printf( "data_swapv_blis" );
#else
        printf( "data_swapv_%s", BLAS );
#endif
        printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
                ( unsigned long )(p - p_begin)/p_inc + 1,
                ( unsigned long )n, gflops );

        bli_obj_free( &x );
        bli_obj_free( &y );
    }

    bli_finalize();

    return 0;
}

443
test/other/test_trsm.c Normal file
View File

@@ -0,0 +1,443 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
//#define FILE_IN_OUT
#ifdef FILE_IN_OUT
//#define READ_ALL_PARAMS_FROM_FILE
#endif
//#define PRINT
/*
 * Benchmark driver for the trsm operation.
 *
 * Two modes are selected at compile time:
 *
 *   default      - sweep the problem size p from p_end down to p_begin in
 *                  steps of p_inc and print one row per size of the form
 *                  data_trsm_<impl>( <row>, 1:3 ) = [ <m> <n> <gflops> ];
 *
 *   FILE_IN_OUT  - read problems from the file named by argv[1] and write
 *                  results to the file named by argv[2]. Each input line
 *                  holds "m n cs_a cs_b", optionally preceded by
 *                  "side uplo trans diag" characters when
 *                  READ_ALL_PARAMS_FROM_FILE is also defined. cs_a and cs_b
 *                  are the column strides (leading dimensions) used for A
 *                  and for the right-hand-side matrix C.
 *
 * Other switches:
 *   BLIS  - time bli_trsm(); otherwise time the Fortran-77 ?trsm_ routine
 *           matching dt (BLAS must then be defined to a string naming the
 *           library -- presumably supplied by the build system).
 *   PRINT - print the operands before and after the first solve and exit.
 *
 * Each problem is solved n_repeats times and the fastest run is reported.
 * Returns 0 on normal completion; exits with status 1 on a usage or file
 * error in FILE_IN_OUT mode.
 */
int main( int argc, char** argv )
{
    obj_t    a, c;
    obj_t    c_save;
    obj_t    alpha;
    dim_t    m, n;
    num_t    dt;
    int      r, n_repeats;
    side_t   side;
    uplo_t   uploa;
    trans_t  transa;
    diag_t   diaga;
    f77_char f77_side;
    f77_char f77_uploa;
    f77_char f77_transa;
    f77_char f77_diaga;
    double   dtime;
    double   dtime_save;
    double   gflops;
#ifdef FILE_IN_OUT
    FILE*    fin  = NULL;
    FILE*    fout = NULL;
#else
    dim_t    p;
    dim_t    p_begin, p_end, p_inc;
    int      m_input, n_input;

    //bli_init();
    //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );

#ifndef PRINT
    p_begin = 200;
    p_end   = 2000;
    p_inc   = 200;
    m_input = -1;
    n_input = -1;
#else
    p_begin = 16;
    p_end   = 16;
    p_inc   = 1;
    m_input = 4;
    n_input = 4;
#endif
#endif

    n_repeats = 3;

#if 1
    //dt = BLIS_FLOAT;
    dt = BLIS_DOUBLE;
#else
    //dt = BLIS_SCOMPLEX;
    dt = BLIS_DCOMPLEX;
#endif

#ifdef FILE_IN_OUT
    if(argc < 3)
    {
        printf("Usage: ./test_trsm_XX.x input.csv output.csv\n");
        exit(1);
    }
    fin = fopen(argv[1], "r");
    if(fin == NULL)
    {
        printf("Error opening the file %s\n", argv[1]);
        exit(1);
    }
    fout = fopen(argv[2], "w");
    if(fout == NULL)
    {
        printf("Error opening the file %s\n", argv[2]);
        exit(1);
    }

    inc_t cs_a;
    inc_t cs_b;

#ifdef READ_ALL_PARAMS_FROM_FILE
    char side_c, uploa_c, transa_c, diaga_c;

    fprintf(fout, "side, uploa, transa, diaga, m\t n\t cs_a\t cs_b\t gflops\n");
    printf("~~~~~~~_BLAS\t side, uploa, transa, diaga, m\t n\t cs_a\t cs_b\t gflops\n");

    while(fscanf(fin, "%c %c %c %c %ld %ld %ld %ld\n", &side_c, &uploa_c, &transa_c, &diaga_c, &m, &n, &cs_a, &cs_b) == 8)
    {
        // Translate each parameter character; a line with an unrecognized
        // character is reported and skipped.
        if( 'l' == side_c|| 'L' == side_c)
            side = BLIS_LEFT;
        else if('r' == side_c || 'R' == side_c)
            side = BLIS_RIGHT;
        else
        {
            printf("Invalid entry for the argument 'side':%c\n",side_c);
            continue;
        }

        if('l' == uploa_c || 'L' == uploa_c)
            uploa = BLIS_LOWER;
        else if('u' == uploa_c || 'U' == uploa_c)
            uploa = BLIS_UPPER;
        else
        {
            printf("Invalid entry for the argument 'uplo':%c\n",uploa_c);
            continue;
        }

        if('t' == transa_c || 'T' == transa_c)
            transa = BLIS_TRANSPOSE;
        else if('n' == transa_c || 'N' == transa_c)
            transa = BLIS_NO_TRANSPOSE;
        else
        {
            printf("Invalid entry for the argument 'transa':%c\n",transa_c);
            continue;
        }

        if('u' == diaga_c || 'U' == diaga_c)
            diaga = BLIS_UNIT_DIAG;
        else if('n' == diaga_c || 'N' == diaga_c)
            diaga = BLIS_NONUNIT_DIAG;
        else
        {
            printf("Invalid entry for the argument 'diaga':%c\n", diaga_c);
            continue;
        }
#else
    fprintf(fout, "m\t n\t cs_a\t cs_b\t gflops\n");
    printf("~~~~~~~_BLAS\t m\t n\t cs_a\t cs_b\t gflops\n");

    while(fscanf(fin, "%ld %ld %ld %ld\n", &m, &n, &cs_a, &cs_b) == 4)
    {
        side = BLIS_LEFT;
        //side = BLIS_RIGHT;
        uploa = BLIS_LOWER;
        //uploa = BLIS_UPPER;
        transa = BLIS_NO_TRANSPOSE;
        diaga = BLIS_NONUNIT_DIAG;
#endif

        bli_param_map_blis_to_netlib_side( side, &f77_side );
        bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
        bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
        bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );

        // Skip inputs whose leading dimensions are too small: cs_a must be
        // at least the order of A (m for side=left, n for side=right) and
        // cs_b must be at least m, the number of rows of C.
        if(bli_is_left(side) && ((m > cs_a) || (m > cs_b))) continue;
        if(bli_is_right(side) && ((n > cs_a) || (m > cs_b))) continue;

        // Honor the leading dimensions read from the input file, so that the
        // strides validated above and echoed in the results are the ones
        // actually benchmarked.
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 1, cs_a, &a );
        else
            bli_obj_create( dt, n, n, 1, cs_a, &a );

        bli_obj_create( dt, m, n, 1, cs_b, &c );
        bli_obj_create( dt, m, n, 1, cs_b, &c_save );
#else
    for ( p = p_end; p >= p_begin; p -= p_inc )
    {
        // A negative *_input is a multiplier applied to p; a non-negative
        // value fixes the dimension outright.
        if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
        else               m =     ( dim_t )    m_input;
        if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
        else               n =     ( dim_t )    n_input;

        side = BLIS_LEFT;
        //side = BLIS_RIGHT;
        uploa = BLIS_LOWER;
        //uploa = BLIS_UPPER;
        transa = BLIS_NO_TRANSPOSE;
        diaga = BLIS_NONUNIT_DIAG;

        bli_param_map_blis_to_netlib_side( side, &f77_side );
        bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
        bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
        bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );

        // A is square, with order m when applied from the left and n when
        // applied from the right.
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 0, 0, &a );
        else
            bli_obj_create( dt, n, n, 0, 0, &a );

        bli_obj_create( dt, m, n, 0, 0, &c );
        bli_obj_create( dt, m, n, 0, 0, &c_save );
#endif

        bli_randm( &a );
        bli_randm( &c );

        bli_obj_set_struc( BLIS_TRIANGULAR, &a );
        bli_obj_set_uplo( uploa, &a );
        bli_obj_set_conjtrans( transa, &a );
        bli_obj_set_diag( diaga, &a );

        // Randomize A and zero the unstored triangle to ensure the
        // implementation reads only from the stored region.
        bli_randm( &a );
        bli_mktrim( &a );

        // Load the diagonal of A to make it more likely to be invertible.
        bli_shiftd( &BLIS_TWO, &a );

        bli_obj_create( dt, 1, 1, 0, 0, &alpha );
        bli_setsc( (2.0/1.0), 1.0, &alpha );

        // Keep a pristine copy of C; trsm overwrites C, so each repeat
        // restores it before timing.
        bli_copym( &c, &c_save );

        dtime_save = DBL_MAX;

        for ( r = 0; r < n_repeats; ++r )
        {
            bli_copym( &c_save, &c );

            dtime = bli_clock();

#ifdef PRINT
            bli_invertd( &a );
            bli_printm( "a", &a, "%4.1f", "" );
            bli_invertd( &a );
            bli_printm( "c", &c, "%4.1f", "" );
#endif

#ifdef BLIS
            bli_trsm( side,
                      &alpha,
                      &a,
                      &c );
#else
            {
                // Dimensions and leading dimensions are the same for every
                // datatype; only the buffer pointer types differ below.
                f77_int mm  = bli_obj_length( &c );
                f77_int nn  = bli_obj_width( &c );
                f77_int lda = bli_obj_col_stride( &a );
                f77_int ldc = bli_obj_col_stride( &c );

                if ( bli_is_float( dt ) )
                {
                    float* alphap = bli_obj_buffer( &alpha );
                    float* ap     = bli_obj_buffer( &a );
                    float* cp     = bli_obj_buffer( &c );

                    strsm_( &f77_side,
                            &f77_uploa,
                            &f77_transa,
                            &f77_diaga,
                            &mm,
                            &nn,
                            alphap,
                            ap, &lda,
                            cp, &ldc );
                }
                else if ( bli_is_double( dt ) )
                {
                    double* alphap = bli_obj_buffer( &alpha );
                    double* ap     = bli_obj_buffer( &a );
                    double* cp     = bli_obj_buffer( &c );

                    dtrsm_( &f77_side,
                            &f77_uploa,
                            &f77_transa,
                            &f77_diaga,
                            &mm,
                            &nn,
                            alphap,
                            ap, &lda,
                            cp, &ldc );
                }
                else if ( bli_is_scomplex( dt ) )
                {
                    scomplex* alphap = bli_obj_buffer( &alpha );
                    scomplex* ap     = bli_obj_buffer( &a );
                    scomplex* cp     = bli_obj_buffer( &c );

                    ctrsm_( &f77_side,
                            &f77_uploa,
                            &f77_transa,
                            &f77_diaga,
                            &mm,
                            &nn,
                            alphap,
                            ap, &lda,
                            cp, &ldc );
                }
                else if ( bli_is_dcomplex( dt ) )
                {
                    dcomplex* alphap = bli_obj_buffer( &alpha );
                    dcomplex* ap     = bli_obj_buffer( &a );
                    dcomplex* cp     = bli_obj_buffer( &c );

                    ztrsm_( &f77_side,
                            &f77_uploa,
                            &f77_transa,
                            &f77_diaga,
                            &mm,
                            &nn,
                            alphap,
                            ap, &lda,
                            cp, &ldc );
                }
            }
#endif

#ifdef PRINT
            bli_printm( "c after", &c, "%9.5f", "" );
            exit(1);
#endif

            // Keep the fastest of the repeats.
            dtime_save = bli_clock_min_diff( dtime_save, dtime );
        }

        // Flop count used: m*m*n for side=left, m*n*n for side=right,
        // multiplied by 4 for complex datatypes.
        if ( bli_is_left( side ) )
            gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
        else
            gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );

        if ( bli_is_complex( dt ) ) gflops *= 4.0;

#ifdef BLIS
        printf( "data_trsm_blis" );
#else
        printf( "data_trsm_%s", BLAS );
#endif

#ifdef FILE_IN_OUT
#ifdef READ_ALL_PARAMS_FROM_FILE
        printf("%c\t %c\t %c\t %c\t %4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n",side_c, uploa_c, transa_c, diaga_c,
               (unsigned long )m, (unsigned long ) n,
               (unsigned long )cs_a, (unsigned long )cs_b,
               gflops);
        fprintf(fout,"%c\t %c\t %c\t %c\t %4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", side_c, uploa_c, transa_c, diaga_c,
                (unsigned long )m, (unsigned long ) n,
                (unsigned long )cs_a, (unsigned long )cs_b,
                gflops);
#else
        printf("%4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", (unsigned long )m, (unsigned long ) n,
               (unsigned long )cs_a, (unsigned long )cs_b,
               gflops);
        fprintf(fout,"%4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", (unsigned long )m, (unsigned long ) n,
                (unsigned long )cs_a, (unsigned long )cs_b,
                gflops);
#endif
        // Flush after every row so partial results survive an aborted run.
        fflush(fout);
#else
        printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
                ( unsigned long )(p - p_begin)/p_inc + 1,
                ( unsigned long )m,
                ( unsigned long )n, gflops );
#endif

        bli_obj_free( &alpha );
        bli_obj_free( &a );
        bli_obj_free( &c );
        bli_obj_free( &c_save );
    }

#ifdef FILE_IN_OUT
    fclose(fin);
    fclose(fout);
#endif

    //bli_finalize();

    return 0;
}

View File

@@ -33,7 +33,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// n alpha x incx y incy

View File

@@ -33,7 +33,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// res n x incx y incy

View File

@@ -32,7 +32,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"

483
test/test_gemmt.c Normal file
View File

@@ -0,0 +1,483 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
//#define CBLAS
//#define C_STOR_R
//#define PRINT
int main( int argc, char** argv )
{
obj_t a, b, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, k;
dim_t p;
dim_t p_begin, p_end, p_inc;
int m_input, k_input;
num_t dt;
int r, n_repeats;
uplo_t uploc;
trans_t transa;
trans_t transb;
f77_char f77_uploc;
f77_char f77_transa;
f77_char f77_transb;
double dtime;
double dtime_save;
double gflops;
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
n_repeats = 3;
#ifndef PRINT
p_begin = 200;
p_end = 2000;
p_inc = 200;
m_input = -1;
k_input = -1;
#else
p_begin = 16;
p_end = 16;
p_inc = 1;
m_input = 5;
k_input = 4;
#endif
#if 1
//dt = BLIS_FLOAT;
dt = BLIS_DOUBLE;
#else
//dt = BLIS_SCOMPLEX;
dt = BLIS_DCOMPLEX;
#endif
uploc = BLIS_LOWER;
//uploc = BLIS_UPPER;
transa = BLIS_NO_TRANSPOSE;
transb = BLIS_NO_TRANSPOSE;
bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
char uplocl = tolower( f77_uploc );
char transal = tolower( f77_transa );
char transbl = tolower( f77_transb );
f77_int cbla_uploc = ( uplocl == 'l' ? CblasLower : CblasUpper );
f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans );
f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans );
( void )cbla_uploc;
( void )cbla_transa;
( void )cbla_transb;
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_gemmt_blis" );
#else
printf( "data_gemmt_%s", BLAS );
#endif
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )0,
( unsigned long )0, 0.0 );
//for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_end; p_begin <= p; p -= p_inc )
{
if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
#ifndef C_STOR_R
if ( bli_does_trans( transa ) )
bli_obj_create( dt, k, m, 0, 0, &a );
else
bli_obj_create( dt, m, k, 0, 0, &a );
if ( bli_does_trans( transb ) )
bli_obj_create( dt, m, k, 0, 0, &b );
else
bli_obj_create( dt, k, m, 0, 0, &b );
bli_obj_create( dt, m, m, 0, 0, &c );
bli_obj_create( dt, m, m, 0, 0, &c_save );
#else
if ( bli_does_trans( transa ) )
bli_obj_create( dt, k, m, -1, -1, &a );
else
bli_obj_create( dt, m, k, -1, -1, &a );
if ( bli_does_trans( transb ) )
bli_obj_create( dt, m, k, -1, -1, &b );
else
bli_obj_create( dt, k, m, -1, -1, &b );
bli_obj_create( dt, m, m, -1, -1, &c );
bli_obj_create( dt, m, m, -1, -1, &c_save );
#endif
bli_randm( &a );
bli_randm( &b );
bli_randm( &c );
bli_obj_set_uplo( uploc, &c );
bli_obj_set_conjtrans( transa, &a );
bli_obj_set_conjtrans( transb, &b );
bli_setsc( (0.9/1.0), 0.2, &alpha );
bli_setsc( -(1.1/1.0), 0.3, &beta );
bli_copym( &c, &c_save );
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
#ifdef BLIS
bli_gemmt( &alpha,
&a,
&b,
&beta,
&c );
#else
#ifndef CBLAS
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
sgemmt_( &f77_uploc,
&f77_transa,
&f77_transb,
&mm,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
dgemmt_( &f77_uploc,
&f77_transa,
&f77_transb,
&mm,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cgemmt_( &f77_uploc,
&f77_transa,
&f77_transb,
&mm,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
zgemmt_( &f77_uploc,
&f77_transa,
&f77_transb,
&mm,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#else // #ifdef CBLAS
f77_int cbla_storage = ( bli_obj_is_row_stored( &c ) ? CblasRowMajor
: CblasColMajor );
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
cblas_sgemmt( cbla_storage,
cbla_uploc,
cbla_transa,
cbla_transb,
mm,
kk,
*alphap,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
cblas_dgemmt( cbla_storage,
cbla_uploc,
cbla_transa,
cbla_transb,
mm,
kk,
*alphap,
ap, lda,
bp, ldb,
*betap,
cp, ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cblas_cgemmt( cbla_storage,
cbla_uploc,
cbla_transa,
cbla_transb,
mm,
kk,
alphap,
ap, lda,
bp, ldb,
betap,
cp, ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
#ifdef C_STOR_R
f77_int lda = bli_obj_row_stride( &a );
f77_int ldb = bli_obj_row_stride( &b );
f77_int ldc = bli_obj_row_stride( &c );
#else
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
#endif
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
cblas_zgemmt( cbla_storage,
cbla_uploc,
cbla_transa,
cbla_transb,
mm,
kk,
alphap,
ap, lda,
bp, ldb,
betap,
cp, ldc );
}
#endif
#endif
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_gemmt_blis" );
#else
printf( "data_gemmt_%s", BLAS );
#endif
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin)/p_inc + 1,
( unsigned long )m,
( unsigned long )k, gflops );
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &b );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}

View File

@@ -32,7 +32,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// transa m n alpha a lda x incx beta y incy

View File

@@ -32,7 +32,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// m n alpha x incx y incy a lda

View File

@@ -32,7 +32,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"

View File

@@ -32,7 +32,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// uploa m alpha a lda x incx beta y incy

View File

@@ -32,7 +32,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// uplo m alpha x incx a lda

View File

@@ -32,7 +32,11 @@
*/
#ifdef WIN32
#include <io.h>
#else
#include <unistd.h>
#endif
#include "blis.h"
// uplo m alpha x incx y incy a lda

Some files were not shown because too many files have changed in this diff Show More