mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Squash-merge 'pr' into 'squash'. (#457)
Merged contributions from AMD's AOCL BLIS (#448). Details: - Added support for level-3 operation gemmt, which performs a gemm on only the lower or upper triangle of a square matrix C. For now, only the conventional/large code path will be supported (in vanilla BLIS). This was accomplished by leveraging the existing variant logic for herk. However, some of the infrastructure to support a gemmtsup is included in this commit, including - A bli_gemmtsup() front-end, similar to bli_gemmsup(). - A bli_gemmtsup_ref() reference handler function. - A bli_gemmtsup_int() variant chooser function (with variant calls commented out). - Added support for inducing complex domain gemmt via the 1m method. - Added gemmt APIs to the BLAS and CBLAS compatibility layers. - Added gemmt test module to testsuite. - Added standalone gemmt test driver to 'test' directory. - Documented gemmt APIs in BLISObjectAPI.md and BLISTypedAPI.md. - Added a C++ template header (blis.hh) containing a BLAS-inspired wrapper to a set of polymorphic CBLAS-like function wrappers defined in another header (cblas.hh). These two headers are installed when running the 'install' target with INSTALL_HH set to 'yes'. (Also added a set of unit tests that exercise blis.hh, although they are disabled for now because they aren't compatible with out-of-tree builds.) These files now live in the 'vendor' top-level directory. - Various updates to 'zen' and 'zen2' subconfigurations, particularly within the context initialization functions. - Added s and d copyv, setv, and swapv kernels to kernels/zen/1, and various minor updates to dotv and scalv kernels. Also added various sup kernels contributed by AMD to kernels/zen/3. However, these kernels are (for now) not yet used, in part because they caused AppVeyor clang failures, and also because I have not found time to review and vet them. - Output the python found during configure into the definition of PYTHON in build/config.mk (via build/config.mk.in).
- Added early-return checks (A, B, or C with zero dimension; alpha = 0) to bli_gemm_front.c. - Implemented explicit beta = 0 handling for the sgemm ukernel in bli_gemm_armv7a_int_d4x4.c, which was previously missing. This latent bug surfaced because the gemmt module verifies its computation using gemm with its beta parameter set to zero, which, on a cortexa15 system, caused the gemm kernel code to unconditionally multiply the uninitialized C data by beta. The C matrix likely contained non-numeric values such as NaN, which then would have resulted in a false failure. - Fixed a bug whereby the implementation for bli_herk_determine_kc(), in bli_l3_blocksize.c, was inadvertently being defined in terms of helper functions meant for trmm. This bug was probably harmless since the trmm code should have also done the right thing for herk. - Used cpp macros to neutralize the various AOCL_DTL_TRACE_ macros in kernels/zen/3/bli_gemm_small.c since those macros are not used in vanilla BLIS. - Added cpp guard to definition of bli_mem_clear() in bli_mem.h to accommodate C++'s stricter type checking. - Added cpp guard to test/*.c drivers that facilitates compilation on Windows systems. - Various whitespace changes.
This commit is contained in:
7
.gitignore
vendored
7
.gitignore
vendored
@@ -43,7 +43,12 @@ include/*/*.h
|
||||
# -- misc. --
|
||||
|
||||
# BLIS testsuite output file
|
||||
output.testsuite
|
||||
output.testsuite.*
|
||||
|
||||
# BLAS test output files
|
||||
out.*
|
||||
|
||||
# GTAGS database
|
||||
GPATH
|
||||
GRTAGS
|
||||
GTAGS
|
||||
|
||||
21
Makefile
21
Makefile
@@ -249,6 +249,12 @@ ifeq ($(MK_ENABLE_CBLAS),yes)
|
||||
HEADERS_TO_INSTALL += $(CBLAS_H_FLAT)
|
||||
endif
|
||||
|
||||
# If requested, include AMD's C++ template header files in the list of headers
|
||||
# to install.
|
||||
ifeq ($(INSTALL_HH),yes)
|
||||
HEADERS_TO_INSTALL += $(wildcard $(VEND_CPP_PATH)/*.hh)
|
||||
endif
|
||||
|
||||
|
||||
|
||||
#
|
||||
@@ -892,6 +898,19 @@ else
|
||||
@- $(TESTSUITE_CHECK_PATH) $(TESTSUITE_OUT_FILE)
|
||||
endif
|
||||
|
||||
|
||||
# --- AMD's C++ template header test rules ---
|
||||
|
||||
# NOTE: The targets below won't work as intended for an out-of-tree build,
|
||||
# and so it's disabled for now.
|
||||
|
||||
#testcpp: testvendcpp
|
||||
|
||||
# Recursively run the test for AMD's C++ template header.
|
||||
#testvendcpp:
|
||||
# $(MAKE) -C $(VEND_TESTCPP_PATH)
|
||||
|
||||
|
||||
# --- Install header rules ---
|
||||
|
||||
install-headers: check-env $(MK_INCL_DIR_INST)
|
||||
@@ -1167,11 +1186,13 @@ ifeq ($(IS_CONFIGURED),yes)
|
||||
ifeq ($(ENABLE_VERBOSE),yes)
|
||||
- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
|
||||
- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
|
||||
# - $(MAKE) -C $(VEND_TESTCPP_DIR) clean
|
||||
else
|
||||
@echo "Removing object files from $(TESTSUITE_DIR)/$(OBJ_DIR)"
|
||||
@- $(FIND) $(TESTSUITE_DIR)/$(OBJ_DIR) -name "*.o" | $(XARGS) $(RM_F)
|
||||
@echo "Removing binary $(TESTSUITE_DIR)/$(TESTSUITE_BIN)"
|
||||
@- $(RM_F) $(TESTSUITE_DIR)/$(TESTSUITE_BIN)
|
||||
# @$(MAKE) -C $(VEND_TESTCPP_DIR) clean
|
||||
endif # ENABLE_VERBOSE
|
||||
endif # IS_CONFIGURED
|
||||
|
||||
|
||||
@@ -103,6 +103,9 @@ RANLIB := @RANLIB@
|
||||
# Archiver.
|
||||
AR := @AR@
|
||||
|
||||
# Python Interpreter
|
||||
PYTHON := @PYTHON@
|
||||
|
||||
# Preset (required) CFLAGS and LDFLAGS. These variables capture the value
|
||||
# of the CFLAGS and LDFLAGS environment variables at configure-time (and/or
|
||||
# the value of CFLAGS/LDFLAGS if either was specified on the command line).
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2019, The University of Texas at Austin
|
||||
# Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -299,6 +299,10 @@ INCLUDE_DIR := include
|
||||
BLASTEST_DIR := blastest
|
||||
TESTSUITE_DIR := testsuite
|
||||
|
||||
VEND_DIR := vendor
|
||||
VEND_CPP_DIR := $(VEND_DIR)/cpp
|
||||
VEND_TESTCPP_DIR := $(VEND_DIR)/testcpp
|
||||
|
||||
# The filename suffix for reference kernels.
|
||||
REFNM := ref
|
||||
|
||||
@@ -358,6 +362,10 @@ REFKERN_PATH := $(DIST_PATH)/$(REFKERN_DIR)
|
||||
KERNELS_PATH := $(DIST_PATH)/$(KERNELS_DIR)
|
||||
SANDBOX_PATH := $(DIST_PATH)/$(SANDBOX_DIR)
|
||||
|
||||
# Construct paths to some optional C++ template headers contributed by AMD.
|
||||
VEND_CPP_PATH := $(DIST_PATH)/$(VEND_CPP_DIR)
|
||||
VEND_TESTCPP_PATH := $(DIST_PATH)/$(VEND_TESTCPP_DIR)
|
||||
|
||||
# Construct paths to the makefile fragments for the four primary directories
|
||||
# of source code: the config directory, general framework code, reference
|
||||
# kernel code, and optimized kernel code.
|
||||
|
||||
@@ -55,11 +55,19 @@ void bli_cntx_init_cortexa15( cntx_t* cntx )
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 336, 176, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 528, 368, 0, 0 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 );
|
||||
#if 1
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 336, 176, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 528, 368, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 );
|
||||
#else
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 4, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 4, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 176, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 368, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4096, -1, -1 );
|
||||
#endif
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
|
||||
@@ -67,6 +67,7 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
||||
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
||||
@@ -90,11 +91,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
#if 1
|
||||
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
@@ -106,9 +107,11 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
#if 0
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
|
||||
@@ -60,10 +60,8 @@ ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
#CKVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
|
||||
# When compiling with AOCC, add these flags to the default flags set above.
|
||||
ifeq ($(strip $(shell clang -v |& head -1 | grep -c 'AOCC.LLVM.2.0.0')),1)
|
||||
ifeq ($(strip $(shell clang -v |& head -1 | grep -c 'AOCC.LLVM')),1)
|
||||
CKVECFLAGS += -mllvm -disable-licm-vrp
|
||||
endif
|
||||
else
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -52,27 +52,43 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
8,
|
||||
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
||||
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
||||
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Update the context with optimized level-1m (packm) kernels.
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
2,
|
||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
|
||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
@@ -83,11 +99,11 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
#if 1
|
||||
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
@@ -96,12 +112,21 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// copyv
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||
#endif
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
|
||||
// scalv
|
||||
#if 0
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
@@ -110,6 +135,16 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// setv
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
||||
|
||||
// swapv
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -125,29 +160,22 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
a) If BLIS is run in a multi-instance mode with
|
||||
CPU freq 2.6/2.2 Ghz
|
||||
DDR4 clock frequency 2400Mhz
|
||||
mc = 240, kc = 512, and nc = 2040
|
||||
mc = 240, kc = 512, and nc = 2040
|
||||
has better performance on EPYC server, over the default block sizes.
|
||||
|
||||
b) If BLIS is run in Single Instance mode
|
||||
mc = 510, kc = 1024 and nc = 4080
|
||||
mc = 510, kc = 1024 and nc = 4080
|
||||
*/
|
||||
|
||||
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
|
||||
// Zen optmized level 3 cache block sizes
|
||||
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
#else
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
|
||||
#endif
|
||||
#else
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
|
||||
#endif
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
@@ -171,10 +199,10 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 256, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 256, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 220, 220, -1, -1 );
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
@@ -186,15 +214,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Initialize the context with the sup handlers.
|
||||
bli_cntx_set_l3_sup_handlers
|
||||
(
|
||||
1,
|
||||
BLIS_GEMM, bli_gemmsup_ref,
|
||||
//BLIS_GEMMT, bli_gemmtsup_ref,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
@@ -218,6 +245,33 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
||||
#if 0
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// NOTE: This set of kernels is likely broken and therefore disabled.
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
|
||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -227,9 +281,17 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
|
||||
9, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
|
||||
#if 0
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
|
||||
9, 9, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
|
||||
#endif
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
|
||||
@@ -65,6 +65,17 @@
|
||||
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
|
||||
|
||||
#if 0
|
||||
// Allow the sup implementation to combine some small edge case iterations in
|
||||
// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
|
||||
// block-panel algorithm (NR) with the last full iteration that precedes it.
|
||||
// NOTE: These cpp macros need to be explicitly set to an integer since they
|
||||
// are used at compile-time to create unconditional branches or dead code
|
||||
// regions.
|
||||
#define BLIS_ENABLE_SUP_MR_EXT 1
|
||||
#define BLIS_ENABLE_SUP_NR_EXT 0
|
||||
#endif
|
||||
|
||||
|
||||
//#endif
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -64,13 +64,24 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Update the context with optimized level-1m (packm) kernels.
|
||||
bli_cntx_set_packm_kers
|
||||
(
|
||||
2,
|
||||
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
|
||||
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
@@ -80,28 +91,39 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
#if 1
|
||||
16,
|
||||
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
|
||||
// axpyv
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
|
||||
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
|
||||
// scalv
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
|
||||
//swap
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
|
||||
//copy
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||
|
||||
//set
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -119,7 +141,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
#endif
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
@@ -195,6 +217,33 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
|
||||
#if 0
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// NOTE: This set of kernels is likely broken and therefore disabled.
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
|
||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
|
||||
@@ -60,11 +60,27 @@
|
||||
#define BLIS_ENABLE_SMALL_MATRIX_ROME
|
||||
#define BLIS_SMALL_MATRIX_THRES_ROME 400
|
||||
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ROME 120
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 60
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10
|
||||
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130
|
||||
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100
|
||||
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30
|
||||
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
|
||||
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50
|
||||
|
||||
// When running HPL with pure MPI without DGEMM threading (Single-threaded
|
||||
// BLIS), defining this macro as 1 yields better performance.
|
||||
|
||||
8
configure
vendored
8
configure
vendored
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
# Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -1363,6 +1363,9 @@ get_compiler_version()
|
||||
if [ "${cc_vendor}" = "icc" -o \
|
||||
"${cc_vendor}" = "gcc" ]; then
|
||||
cc_version=$(${cc} -dumpversion)
|
||||
# If compiler is AOCC, first grep for clang and then the version number.
|
||||
elif [ "${cc_vendor}" = "clang" ]; then
|
||||
cc_version=$(echo "${vendor_string}" | egrep -o 'clang version [0-9]+\.[0-9]+\.?[0-9]*' | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')
|
||||
elif [ "${cc_vendor}" = "oneAPI" ]; then
|
||||
# Treat Intel oneAPI's clang as clang, not icc.
|
||||
cc_vendor="clang"
|
||||
@@ -3107,6 +3110,7 @@ main()
|
||||
dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g')
|
||||
cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g')
|
||||
cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g')
|
||||
python_esc=$(echo "${found_python}" | sed 's/\//\\\//g')
|
||||
#sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g')
|
||||
|
||||
# For RANLIB, if the variable is not set, we use a default value of
|
||||
@@ -3211,6 +3215,7 @@ main()
|
||||
| sed -e "s/@CXX@/${cxx_esc}/g" \
|
||||
| sed -e "s/@RANLIB@/${ranlib_esc}/g" \
|
||||
| sed -e "s/@AR@/${ar_esc}/g" \
|
||||
| sed -e "s/@PYTHON@/${python_esc}/g" \
|
||||
| sed -e "s/@libpthread@/${libpthread_esc}/g" \
|
||||
| sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \
|
||||
| sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \
|
||||
@@ -3311,7 +3316,6 @@ main()
|
||||
echo "${script_name}: creating ${obj_frame_dirpath}"
|
||||
mkdir -p ${obj_frame_dirpath}
|
||||
|
||||
|
||||
if [ -n "${sandbox_flag}" ]; then
|
||||
|
||||
obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}"
|
||||
|
||||
@@ -1681,6 +1681,27 @@ Observed object properties: `trans?(A)`, `trans?(B)`.
|
||||
|
||||
---
|
||||
|
||||
#### gemmt
|
||||
```c
|
||||
void bli_gemmt
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c
|
||||
);
|
||||
```
|
||||
Perform
|
||||
```
|
||||
C := beta * C + alpha * trans?(A) * trans?(B)
|
||||
```
|
||||
where `C` is an _m x m_ matrix, `trans?(A)` is an _m x k_ matrix, and `trans?(B)` is a _k x m_ matrix. This operation is similar to `bli_gemm()` except that it only updates the lower or upper triangle of `C` as specified by `uplo(C)`.
|
||||
|
||||
Observed object properties: `trans?(A)`, `trans?(B)`, `uplo(C)`.
|
||||
|
||||
---
|
||||
|
||||
#### hemm
|
||||
```c
|
||||
void bli_hemm
|
||||
|
||||
@@ -1213,6 +1213,30 @@ where C is an _m x n_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)`
|
||||
|
||||
---
|
||||
|
||||
#### gemmt
|
||||
```c
|
||||
void bli_?gemmt
|
||||
(
|
||||
uplo_t uploc,
|
||||
trans_t transa,
|
||||
trans_t transb,
|
||||
dim_t m,
|
||||
dim_t k,
|
||||
ctype* alpha,
|
||||
ctype* a, inc_t rsa, inc_t csa,
|
||||
ctype* b, inc_t rsb, inc_t csb,
|
||||
ctype* beta,
|
||||
ctype* c, inc_t rsc, inc_t csc
|
||||
);
|
||||
```
|
||||
Perform
|
||||
```
|
||||
C := beta * C + alpha * transa(A) * transb(B)
|
||||
```
|
||||
where C is an _m x m_ matrix, `transa(A)` is an _m x k_ matrix, and `transb(B)` is a _k x m_ matrix. This operation is similar to `bli_?gemm()` except that it only updates the lower or upper triangle of `C` as specified by `uploc`.
|
||||
|
||||
---
|
||||
|
||||
#### hemm
|
||||
```c
|
||||
void bli_?hemm
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -97,4 +97,4 @@
|
||||
#include "bli_trmm.h"
|
||||
#include "bli_trmm3.h"
|
||||
#include "bli_trsm.h"
|
||||
|
||||
#include "bli_gemmt.h"
|
||||
|
||||
@@ -91,7 +91,7 @@ dim_t PASTEMAC0(opname) \
|
||||
}
|
||||
|
||||
GENFRONT( gemm_determine_kc, gemm )
|
||||
GENFRONT( herk_determine_kc, trmm )
|
||||
GENFRONT( herk_determine_kc, herk )
|
||||
GENFRONT( trmm_determine_kc, trmm )
|
||||
GENFRONT( trsm_determine_kc, trsm )
|
||||
|
||||
|
||||
@@ -63,6 +63,28 @@ void bli_gemm_check
|
||||
//bli_check_error_code( e_val );
|
||||
}
|
||||
|
||||
void bli_gemmt_check
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
err_t e_val;
|
||||
|
||||
// Check basic properties of the operation.
|
||||
|
||||
bli_gemmt_basic_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// Check matrix squareness.
|
||||
|
||||
e_val = bli_check_square_object( c );
|
||||
bli_check_error_code( e_val );
|
||||
}
|
||||
|
||||
void bli_hemm_check
|
||||
(
|
||||
side_t side,
|
||||
@@ -324,6 +346,28 @@ void bli_gemm_basic_check
|
||||
#endif
|
||||
}
|
||||
|
||||
void bli_gemmt_basic_check
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
err_t e_val;
|
||||
|
||||
// Perform standard checks.
|
||||
|
||||
bli_l3_basic_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// Check object dimensions.
|
||||
|
||||
e_val = bli_check_level3_dims( a, b, c );
|
||||
bli_check_error_code( e_val );
|
||||
}
|
||||
|
||||
void bli_hemm_basic_check
|
||||
(
|
||||
side_t side,
|
||||
|
||||
@@ -51,6 +51,7 @@ void PASTEMAC(opname,_check) \
|
||||
);
|
||||
|
||||
GENPROT( gemm )
|
||||
GENPROT( gemmt )
|
||||
GENPROT( her2k )
|
||||
GENPROT( syr2k )
|
||||
|
||||
@@ -103,6 +104,16 @@ void bli_gemm_basic_check
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
void bli_gemmt_basic_check
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
void bli_hemm_basic_check
|
||||
(
|
||||
side_t side,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -71,7 +71,10 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
the function returns with BLIS_FAILURE, which causes execution to
|
||||
proceed towards the conventional implementation. */ \
|
||||
err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
if ( result == BLIS_SUCCESS ) return; \
|
||||
if ( result == BLIS_SUCCESS ) \
|
||||
{ \
|
||||
return; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Only proceed with an induced method if each of the operands have a
|
||||
@@ -101,6 +104,75 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
GENFRONT( gemm )
|
||||
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,EX_SUF) \
|
||||
( \
|
||||
obj_t* alpha, \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* beta, \
|
||||
obj_t* c \
|
||||
BLIS_OAPI_EX_PARAMS \
|
||||
) \
|
||||
{ \
|
||||
bli_init_once(); \
|
||||
\
|
||||
BLIS_OAPI_EX_DECLS \
|
||||
\
|
||||
/* If the rntm is non-NULL, it may indicate that we should forgo sup
|
||||
handling altogether. */ \
|
||||
/*
|
||||
bool enable_sup = TRUE; \
|
||||
if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
|
||||
*/ \
|
||||
\
|
||||
/* NOTE: The sup handling for gemmt is disabled here because gemmtsup
|
||||
is not yet fully implemented. */ \
|
||||
/*
|
||||
if ( enable_sup ) \
|
||||
{ \
|
||||
*/ \
|
||||
/* Execute the small/unpacked oapi handler. If it finds that the problem
|
||||
does not fall within the thresholds that define "small", or for some
|
||||
other reason decides not to use the small/unpacked implementation,
|
||||
the function returns with BLIS_FAILURE, which causes execution to
|
||||
proceed towards the conventional implementation. */ \
|
||||
/*
|
||||
err_t result = PASTEMAC(opname,sup)( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
if ( result == BLIS_SUCCESS ) \
|
||||
{ \
|
||||
return; \
|
||||
} \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
/* Only proceed with an induced method if each of the operands have a
|
||||
complex storage datatype. NOTE: Allowing precisions to vary while
|
||||
using 1m, which is what we do here, is unique to gemm; other level-3
|
||||
operations use 1m only if all storage datatypes are equal (and they
|
||||
ignore the computation precision). If any operands are real, skip the
|
||||
induced method chooser function and proceed directly with native
|
||||
execution. */ \
|
||||
if ( bli_obj_is_complex( c ) && \
|
||||
bli_obj_is_complex( a ) && \
|
||||
bli_obj_is_complex( b ) ) \
|
||||
{ \
|
||||
/* FIXME: BLIS does not yet support induced methods for gemmt. Thus,
|
||||
we call the native implementation code path for now. */ \
|
||||
/*PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm );*/ \
|
||||
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
} \
|
||||
}
|
||||
|
||||
GENFRONT( gemmt )
|
||||
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -51,6 +52,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
|
||||
);
|
||||
|
||||
GENPROT( gemm )
|
||||
GENPROT( gemmt )
|
||||
GENPROT( her2k )
|
||||
GENPROT( syr2k )
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -57,6 +58,7 @@ typedef void (*PASTECH(opname,_oft)) \
|
||||
);
|
||||
|
||||
GENTDEF( gemm )
|
||||
GENTDEF( gemmt )
|
||||
GENTDEF( her2k )
|
||||
GENTDEF( syr2k )
|
||||
|
||||
|
||||
@@ -132,3 +132,72 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
|
||||
}
|
||||
|
||||
|
||||
err_t bli_gemmtsup
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// Return early if small matrix handling is disabled at configure-time.
|
||||
#ifdef BLIS_DISABLE_SUP_HANDLING
|
||||
return BLIS_FAILURE;
|
||||
#endif
|
||||
|
||||
// Return early if this is a mixed-datatype computation.
|
||||
if ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
|
||||
bli_obj_dt( c ) != bli_obj_dt( b ) ||
|
||||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) return BLIS_FAILURE;
|
||||
|
||||
// Obtain a valid (native) context from the gks if necessary.
|
||||
// NOTE: This must be done before calling the _check() function, since
|
||||
// that function assumes the context pointer is valid.
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
// Return early if the problem dimensions exceed their sup thresholds.
|
||||
// Notice that we do not bother to check whether the microkernel
|
||||
// prefers or dislikes the storage of C, since the same check is called
|
||||
// for either way.
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, m, k, cntx ) )
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
// We've now ruled out the possibility that the sup thresholds are
|
||||
// unsatisfied.
|
||||
// This implies that the sup thresholds (at least one of them) are met.
|
||||
// and the small/unpacked handler should be called.
|
||||
// NOTE: The sup handler is free to enforce a stricter threshold regime
|
||||
// if it so chooses, in which case it can/should return BLIS_FAILURE.
|
||||
|
||||
// Query the small/unpacked handler from the context and invoke it.
|
||||
gemmtsup_oft gemmtsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx );
|
||||
|
||||
return
|
||||
gemmtsup_fp
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -43,3 +43,14 @@ err_t bli_gemmsup
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
err_t bli_gemmtsup
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -80,7 +80,10 @@ err_t bli_gemmsup_int
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
// uses general stride.
|
||||
if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
|
||||
if ( stor_id == BLIS_XXX )
|
||||
{
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
@@ -240,3 +243,192 @@ err_t bli_gemmsup_int
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
err_t bli_gemmtsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
// uses general stride.
|
||||
if ( stor_id == BLIS_XXX )
|
||||
{
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
|
||||
const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||
: is_rcc_crc_ccr_ccc );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = m;
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
|
||||
const bool auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
bool use_bp = TRUE;
|
||||
dim_t jc_new;
|
||||
dim_t ic_new;
|
||||
|
||||
|
||||
if ( is_primary )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
|
||||
const dim_t mu = m / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
|
||||
#if 0
|
||||
bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
#endif
|
||||
}
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
|
||||
#if 0
|
||||
bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
#endif
|
||||
// *requires nudging of nc up to be a multiple of mr.
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
|
||||
const dim_t mu = n / MR; // the n becomes m after a transposition
|
||||
const dim_t nu = m / NR; // the m becomes n after a transposition
|
||||
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m non-primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
|
||||
#if 0
|
||||
bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
#endif
|
||||
}
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n non-primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
|
||||
#if 0
|
||||
bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
#endif
|
||||
// *requires nudging of mc up to be a multiple of nr.
|
||||
}
|
||||
}
|
||||
|
||||
// Return success so that the caller knows that we computed the solution.
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -43,3 +43,15 @@ err_t bli_gemmsup_int
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
err_t bli_gemmtsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019-20, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -57,6 +57,6 @@ typedef err_t (*PASTECH(opname,_oft)) \
|
||||
);
|
||||
|
||||
GENTDEF( gemmsup )
|
||||
|
||||
GENTDEF( gemmtsup )
|
||||
#endif
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -106,3 +106,69 @@ err_t bli_gemmsup_ref
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
err_t bli_gemmtsup_ref
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// This function implements the default gemmtsup handler. If you are a
|
||||
// BLIS developer and wish to use a different gemmtsup handler, please
|
||||
// register a different function pointer in the context in your
|
||||
// sub-configuration's bli_cntx_init_*() function.
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemmt_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
#if 0
|
||||
// NOTE: This special case handling is done within the variants.
|
||||
|
||||
// If alpha is zero, scale by beta and return.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop.
|
||||
bli_rntm_set_ways_from_rntm_sup
|
||||
(
|
||||
bli_obj_length( c ),
|
||||
bli_obj_width( c ),
|
||||
bli_obj_width( a ),
|
||||
rntm
|
||||
);
|
||||
|
||||
return
|
||||
bli_l3_sup_thread_decorator
|
||||
(
|
||||
bli_gemmtsup_int,
|
||||
BLIS_GEMMT, // operation family id
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -43,3 +43,14 @@ err_t bli_gemmsup_ref
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
err_t bli_gemmtsup_ref
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -100,7 +100,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemm )
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemmt )
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, struca ) \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -56,7 +57,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemm )
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemmt )
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -53,6 +53,26 @@ void bli_gemm_front
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
|
||||
// and return early.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
|
||||
bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
#if 0
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
// Only handle small problems separately for homogeneous datatypes.
|
||||
@@ -60,23 +80,12 @@ void bli_gemm_front
|
||||
bli_obj_dt( a ) == bli_obj_dt( c ) &&
|
||||
bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
|
||||
{
|
||||
gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
|
||||
err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
|
||||
if ( status == BLIS_SUCCESS ) return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If alpha is zero, scale by beta and return.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
|
||||
@@ -58,15 +58,18 @@ void bli_gemm_int
|
||||
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) ) return;
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -78,9 +81,9 @@ void bli_gemm_int
|
||||
// This should never execute.
|
||||
bli_abort();
|
||||
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -93,14 +96,14 @@ void bli_gemm_int
|
||||
// to B.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar attached
|
||||
// to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Create the next node in the thrinfo_t structure.
|
||||
@@ -129,7 +132,7 @@ void bli_gemm_int
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
36
frame/3/gemmt/bli_gemmt.h
Normal file
36
frame/3/gemmt/bli_gemmt.h
Normal file
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_gemmt_front.h"
|
||||
|
||||
142
frame/3/gemmt/bli_gemmt_front.c
Normal file
142
frame/3/gemmt/bli_gemmt_front.c
Normal file
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemmt_front
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemmt_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
|
||||
// and return early.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
|
||||
bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
}
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
bli_rntm_set_ways_for_op
|
||||
(
|
||||
BLIS_GEMM,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm/gemmt
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
// in bli_packm_init().
|
||||
if ( bli_cntx_method( cntx ) == BLIS_NAT )
|
||||
{
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_HERK, // operation family id (gemmt uses 'herk' family)
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl
|
||||
);
|
||||
}
|
||||
|
||||
46
frame/3/gemmt/bli_gemmt_front.h
Normal file
46
frame/3/gemmt/bli_gemmt_front.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_gemmt_front
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
);
|
||||
@@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level );
|
||||
|
||||
BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void );
|
||||
|
||||
void bli_print_msg( char* str, char* file, guint_t line );
|
||||
BLIS_EXPORT_BLIS void bli_abort( void );
|
||||
void bli_print_msg( char* str, char* file, guint_t line );
|
||||
BLIS_EXPORT_BLIS void bli_abort( void );
|
||||
|
||||
char* bli_error_string_for_code( gint_t code );
|
||||
char* bli_error_string_for_code( gint_t code );
|
||||
|
||||
|
||||
@@ -147,7 +147,14 @@ BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
|
||||
// Reset every field of a mem_t: NULL buffer, cleared buffer type,
// NULL pool, and zero size.
BLIS_INLINE void bli_mem_clear( mem_t* mem )
{
#ifdef __cplusplus
	// When using C++, which is strongly typed, we avoid use of -1 as a
	// packbuf_t value since it will result in a compile-time error.
	const packbuf_t cleared_buf_type = BLIS_BUFFER_FOR_GEN_USE;
#else
	// In C, -1 cast to packbuf_t is used as the cleared value.
	const packbuf_t cleared_buf_type = ( packbuf_t )-1;
#endif

	bli_mem_set_buffer( NULL, mem );
	bli_mem_set_buf_type( cleared_buf_type, mem );
	bli_mem_set_pool( NULL, mem );
	bli_mem_set_size( 0, mem );
}
|
||||
|
||||
234
frame/compat/bla_gemmt.c
Normal file
234
frame/compat/bla_gemmt.c
Normal file
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
|
||||
#ifdef BLIS_BLAS3_CALLS_TAPI
|
||||
|
||||
// Generates the BLAS-compatible ?gemmt entry point for one datatype when
// BLIS_BLAS3_CALLS_TAPI is defined. The generated function checks the BLAS
// arguments, converts them to BLIS enums/integers, and calls the function
// named by PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) with column-major
// strides (row stride 1, column stride equal to the leading dimension).
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     ) \
{ \
	uplo_t  blis_uploc; \
	trans_t blis_transa; \
	trans_t blis_transb; \
	dim_t   m0, k0; \
\
	/* Initialize BLIS. */ \
	bli_init_auto(); \
\
	/* Perform BLAS parameter checking. */ \
	PASTEBLACHK(blasname) \
	( \
	  MKSTR(ch), \
	  MKSTR(blasname), \
	  uploc, \
	  transa, \
	  transb, \
	  m, \
	  k, \
	  lda, \
	  ldb, \
	  ldc  \
	); \
\
	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
	bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
	bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
	/* Typecast BLAS integers to BLIS integers. */ \
	bli_convert_blas_dim1( *m, m0 ); \
	bli_convert_blas_dim1( *k, k0 ); \
\
	/* Row and column strides of the matrix operands: unit row stride,
	   leading dimension as column stride. */ \
	const inc_t rs_a = 1; \
	const inc_t cs_a = *lda; \
	const inc_t rs_b = 1; \
	const inc_t cs_b = *ldb; \
	const inc_t rs_c = 1; \
	const inc_t cs_c = *ldc; \
\
	/* Call BLIS interface; the two trailing arguments are passed as NULL. */ \
	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  blis_uploc, \
	  blis_transa, \
	  blis_transb, \
	  m0, \
	  k0, \
	  (ftype*)alpha, \
	  (ftype*)a, rs_a, cs_a, \
	  (ftype*)b, rs_b, cs_b, \
	  (ftype*)beta, \
	  (ftype*)c, rs_c, cs_c, \
	  NULL, \
	  NULL  \
	); \
\
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
}
|
||||
|
||||
#else
|
||||
|
||||
// Generates the BLAS-compatible ?gemmt entry point for one datatype when
// BLIS_BLAS3_CALLS_TAPI is not defined. The generated function checks the
// BLAS arguments, wraps the scalars and matrices in obj_t objects, and
// calls the function named by PASTEMAC(blisname,BLIS_OAPI_EX_SUF).
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     ) \
{ \
	uplo_t  blis_uploc; \
	trans_t blis_transa; \
	trans_t blis_transb; \
	dim_t   m0, k0; \
\
	/* Initialize BLIS. */ \
	bli_init_auto(); \
\
	/* Perform BLAS parameter checking. */ \
	PASTEBLACHK(blasname) \
	( \
	  MKSTR(ch), \
	  MKSTR(blasname), \
	  uploc, \
	  transa, \
	  transb, \
	  m, \
	  k, \
	  lda, \
	  ldb, \
	  ldc  \
	); \
\
	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
	bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
	bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
	/* Typecast BLAS integers to BLIS integers. */ \
	bli_convert_blas_dim1( *m, m0 ); \
	bli_convert_blas_dim1( *k, k0 ); \
\
	/* Row and column strides of the matrix operands: unit row stride,
	   leading dimension as column stride. */ \
	const inc_t rs_a = 1; \
	const inc_t cs_a = *lda; \
	const inc_t rs_b = 1; \
	const inc_t cs_b = *ldb; \
	const inc_t rs_c = 1; \
	const inc_t cs_c = *ldc; \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Dimensions of A and B as stored, derived from the m-by-k and
	   k-by-m shapes and the respective trans parameters. */ \
	dim_t m0_a, n0_a; \
	dim_t m0_b, n0_b; \
\
	bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
	bli_set_dims_with_trans( blis_transb, k0, m0, &m0_b, &n0_b ); \
\
	/* Scalars alpha and beta as 1x1 objects. */ \
	obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t betao  = BLIS_OBJECT_INITIALIZER_1X1; \
\
	bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
	bli_obj_init_finish_1x1( dt, (ftype*)beta,  &betao  ); \
\
	/* Matrix A, carrying the caller's transa. */ \
	obj_t ao = BLIS_OBJECT_INITIALIZER; \
\
	bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
	bli_obj_set_conjtrans( blis_transa, &ao ); \
\
	/* Matrix B, carrying the caller's transb. */ \
	obj_t bo = BLIS_OBJECT_INITIALIZER; \
\
	bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
	bli_obj_set_conjtrans( blis_transb, &bo ); \
\
	/* Matrix C: m0-by-m0, with the caller's uplo and its structure
	   field set to BLIS_SYMMETRIC. */ \
	obj_t co = BLIS_OBJECT_INITIALIZER; \
\
	bli_obj_init_finish( dt, m0, m0, (ftype*)c, rs_c, cs_c, &co ); \
	bli_obj_set_uplo( blis_uploc, &co ); \
	bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
\
	/* Call BLIS interface; the two trailing arguments are passed as NULL. */ \
	PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
	( \
	  &alphao, \
	  &ao, \
	  &bo, \
	  &betao, \
	  &co, \
	  NULL, \
	  NULL  \
	); \
\
	/* Finalize BLIS. */ \
	bli_finalize_auto(); \
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
INSERT_GENTFUNC_BLAS( gemmt, gemmt )
|
||||
#endif
|
||||
|
||||
60
frame/compat/bla_gemmt.h
Normal file
60
frame/compat/bla_gemmt.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-to-BLIS interfaces.
|
||||
//
|
||||
// Generates the exported prototype of the BLAS-compatible ?gemmt routine
// for one datatype. All arguments are passed by pointer; only c is
// non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc  \
     );
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
INSERT_GENTPROT_BLAS( gemmt )
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -185,6 +186,7 @@
|
||||
#include "bla_syr2k.h"
|
||||
#include "bla_trmm.h"
|
||||
#include "bla_trsm.h"
|
||||
#include "bla_gemmt.h"
|
||||
|
||||
#include "bla_gemm_check.h"
|
||||
#include "bla_hemm_check.h"
|
||||
@@ -195,6 +197,7 @@
|
||||
#include "bla_syr2k_check.h"
|
||||
#include "bla_trmm_check.h"
|
||||
#include "bla_trsm_check.h"
|
||||
#include "bla_gemmt_check.h"
|
||||
|
||||
// -- Fortran-compatible APIs to BLIS functions --
|
||||
|
||||
|
||||
@@ -448,6 +448,11 @@ void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
float alpha, const float *A, f77_int lda,
|
||||
float *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, float alpha, const float *A,
|
||||
f77_int lda, const float *B, f77_int ldb,
|
||||
float beta, float *C, f77_int ldc);
|
||||
|
||||
void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
|
||||
@@ -478,6 +483,11 @@ void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
double alpha, const double *A, f77_int lda,
|
||||
double *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, double alpha, const double *A,
|
||||
f77_int lda, const double *B, f77_int ldb,
|
||||
double beta, double *C, f77_int ldc);
|
||||
|
||||
void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
|
||||
@@ -508,6 +518,11 @@ void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
const void *alpha, const void *A, f77_int lda,
|
||||
void *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, const void *alpha, const void *A,
|
||||
f77_int lda, const void *B, f77_int ldb,
|
||||
const void *beta, void *C, f77_int ldc);
|
||||
|
||||
void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
|
||||
@@ -538,6 +553,11 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
const void *alpha, const void *A, f77_int lda,
|
||||
void *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, const void *alpha, const void *A,
|
||||
f77_int lda, const void *B, f77_int ldb,
|
||||
const void *beta, void *C, f77_int ldc);
|
||||
|
||||
|
||||
/*
|
||||
|
||||
166
frame/compat/cblas/src/cblas_cgemmt.c
Normal file
166
frame/compat/cblas/src/cblas_cgemmt.c
Normal file
@@ -0,0 +1,166 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
cblas_cgemmt.c
|
||||
Based off of cblas_cgemm.c.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
 * CBLAS front-end for the single-precision complex gemmt routine.
 *
 * Order, Uplo, TransA and TransB are validated in that order; the first
 * illegal value is reported through cblas_xerbla() with argument position
 * 1, 2, 3 or 4 respectively, and the function returns without calling the
 * Fortran-style routine.
 *
 * Column-major requests are forwarded to F77_cgemmt as given. Row-major
 * requests are forwarded as the transposed problem: the Uplo character is
 * flipped, and A and B are exchanged together with their leading
 * dimensions and transposition characters.
 *
 * CBLAS_CallFromC is 1 for the duration of the call, RowMajorStrg is 1
 * while a row-major request is processed, and both are 0 on every return
 * path.
 */
void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc)
{
   char UL, TA, TB;
   char op_a, op_b;   /* validated characters for TransA and TransB */
   int  row_major;
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;
   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   if (Order != CblasColMajor && Order != CblasRowMajor)
   {
      cblas_xerbla(1, "cblas_cgemmt", "Illegal Order setting, %d\n", Order);
      goto finish;
   }

   /* Raise the row-major flag before the argument checks so that it is
      already set whenever cblas_xerbla() runs for a row-major request. */
   row_major = (Order == CblasRowMajor);
   if (row_major) RowMajorStrg = 1;

   /* Uplo: passed through for column-major, flipped for row-major. */
   switch (Uplo)
   {
      case CblasUpper: UL = row_major ? 'L' : 'U'; break;
      case CblasLower: UL = row_major ? 'U' : 'L'; break;
      default:
         cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto finish;
   }

   switch (TransA)
   {
      case CblasTrans:     op_a = 'T'; break;
      case CblasConjTrans: op_a = 'C'; break;
      case CblasNoTrans:   op_a = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
         goto finish;
   }

   switch (TransB)
   {
      case CblasTrans:     op_b = 'T'; break;
      case CblasConjTrans: op_b = 'C'; break;
      case CblasNoTrans:   op_b = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
         goto finish;
   }

   /* For row-major the operands are exchanged, so their transposition
      characters are exchanged as well. */
   if (row_major)
   {
      TA = op_b;
      TB = op_a;
   }
   else
   {
      TA = op_a;
      TB = op_b;
   }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if (row_major)
   {
      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B,
                 &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc);
   }
   else
   {
      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)A,
                 &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc);
   }

finish:
   /* Common exit: clear both global flags. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
|
||||
#endif
|
||||
166
frame/compat/cblas/src/cblas_dgemmt.c
Normal file
166
frame/compat/cblas/src/cblas_dgemmt.c
Normal file
@@ -0,0 +1,166 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
cblas_dgemmt.c
|
||||
Based off of cblas_dgemm.c.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
 * CBLAS front-end for the double-precision real gemmt routine.
 *
 * Order, Uplo, TransA and TransB are validated in that order; the first
 * illegal value is reported through cblas_xerbla() with argument position
 * 1, 2, 3 or 4 respectively, and the function returns without calling the
 * Fortran-style routine.
 *
 * Column-major requests are forwarded to F77_dgemmt as given. Row-major
 * requests are forwarded as the transposed problem: the Uplo character is
 * flipped, and A and B are exchanged together with their leading
 * dimensions and transposition characters.
 *
 * CBLAS_CallFromC is 1 for the duration of the call, RowMajorStrg is 1
 * while a row-major request is processed, and both are 0 on every return
 * path.
 */
void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  double alpha, const double *A,
                  f77_int lda, const double *B, f77_int ldb,
                  double beta, double *C, f77_int ldc)
{
   char UL, TA, TB;
   char op_a, op_b;   /* validated characters for TransA and TransB */
   int  row_major;
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;
   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   if (Order != CblasColMajor && Order != CblasRowMajor)
   {
      cblas_xerbla(1, "cblas_dgemmt", "Illegal Order setting, %d\n", Order);
      goto finish;
   }

   /* Raise the row-major flag before the argument checks so that it is
      already set whenever cblas_xerbla() runs for a row-major request. */
   row_major = (Order == CblasRowMajor);
   if (row_major) RowMajorStrg = 1;

   /* Uplo: passed through for column-major, flipped for row-major. */
   switch (Uplo)
   {
      case CblasUpper: UL = row_major ? 'L' : 'U'; break;
      case CblasLower: UL = row_major ? 'U' : 'L'; break;
      default:
         cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto finish;
   }

   switch (TransA)
   {
      case CblasTrans:     op_a = 'T'; break;
      case CblasConjTrans: op_a = 'C'; break;
      case CblasNoTrans:   op_a = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
         goto finish;
   }

   switch (TransB)
   {
      case CblasTrans:     op_b = 'T'; break;
      case CblasConjTrans: op_b = 'C'; break;
      case CblasNoTrans:   op_b = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
         goto finish;
   }

   /* For row-major the operands are exchanged, so their transposition
      characters are exchanged as well. */
   if (row_major)
   {
      TA = op_b;
      TB = op_a;
   }
   else
   {
      TA = op_a;
      TB = op_b;
   }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if (row_major)
   {
      F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B,
                 &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
   }
   else
   {
      F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, A,
                 &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc);
   }

finish:
   /* Common exit: clear both global flags. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
|
||||
#endif
|
||||
@@ -1,12 +1,46 @@
|
||||
/*
|
||||
* cblas_f77.h
|
||||
* Written by Keita Teranishi
|
||||
*
|
||||
* Updated by Jeff Horner
|
||||
* Merged cblas_f77.h and cblas_fortran_header.h
|
||||
*
|
||||
* (Heavily hacked down from the original)
|
||||
*/
|
||||
cblas_f77.h
|
||||
Written by Keita Teranishi
|
||||
|
||||
Updated by Jeff Horner
|
||||
Merged cblas_f77.h and cblas_fortran_header.h
|
||||
|
||||
(Heavily hacked down from the original)
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef CBLAS_F77_H
|
||||
#define CBLAS_F77_H
|
||||
@@ -163,5 +197,12 @@
|
||||
#define F77_zsyr2k zsyr2k_
|
||||
#define F77_ztrmm ztrmm_
|
||||
#define F77_ztrsm ztrsm_
|
||||
/*
|
||||
* BLAS extensions
|
||||
*/
|
||||
#define F77_sgemmt sgemmt_
|
||||
#define F77_dgemmt dgemmt_
|
||||
#define F77_cgemmt cgemmt_
|
||||
#define F77_zgemmt zgemmt_
|
||||
|
||||
#endif /* CBLAS_F77_H */
|
||||
|
||||
166
frame/compat/cblas/src/cblas_sgemmt.c
Normal file
166
frame/compat/cblas/src/cblas_sgemmt.c
Normal file
@@ -0,0 +1,166 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
cblas_sgemmt.c
|
||||
Based off of cblas_sgemm.c.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
   cblas_sgemmt

   C-callable front-end for the single-precision real gemmt operation.
   The CBLAS enum arguments are validated, converted to one-character
   flags, and the request is forwarded to F77_sgemmt().

   Order   storage order of the operands (CblasColMajor or CblasRowMajor).
   Uplo    which triangle of C is referenced (CblasUpper or CblasLower).
   TransA  CblasNoTrans, CblasTrans or CblasConjTrans, applied to A.
   TransB  CblasNoTrans, CblasTrans or CblasConjTrans, applied to B.
   M, K    problem dimensions, forwarded by address.
   alpha   scalar forwarded by address.
   A, lda  first operand and its leading dimension.
   B, ldb  second operand and its leading dimension.
   beta    scalar forwarded by address.
   C, ldc  output matrix and its leading dimension.

   For CblasRowMajor the request is re-expressed for F77_sgemmt() by
   exchanging A and B (together with their transposition flags and leading
   dimensions) and by flipping Uplo.

   An invalid Order, Uplo, TransA or TransB is reported through
   cblas_xerbla() with argument positions 1, 2, 3 and 4 respectively, after
   which the function returns without calling F77_sgemmt().  On every exit
   path CBLAS_CallFromC and RowMajorStrg are reset to 0.
*/
void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  float alpha, const float *A,
                  f77_int lda, const float *B, f77_int ldb,
                  float beta, float *C, f77_int ldc)
{
   char UL, TA, TB;
   char op_a, op_b;   /* flags derived from TransA / TransB before any swap */
   int  row_major;
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;

   RowMajorStrg    = 0;
   CBLAS_CallFromC = 1;

   row_major = ( Order == CblasRowMajor );

   /* Reject an unknown storage order up front (RowMajorStrg is still 0). */
   if ( Order != CblasColMajor && !row_major )
   {
      cblas_xerbla(1, "cblas_sgemmt", "Illegal Order setting, %d\n", Order);
      CBLAS_CallFromC = 0;
      RowMajorStrg = 0;
      return;
   }

   /* The row-major flag is raised before the remaining arguments are
      validated, so it is already set when cblas_xerbla() is called. */
   if ( row_major ) RowMajorStrg = 1;

   /* Uplo: passed through for column-major, flipped for row-major. */
   switch ( Uplo )
   {
      case CblasUpper:
         UL = ( row_major ? 'L' : 'U' );
         break;
      case CblasLower:
         UL = ( row_major ? 'U' : 'L' );
         break;
      default:
         cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
         CBLAS_CallFromC = 0;
         RowMajorStrg = 0;
         return;
   }

   switch ( TransA )
   {
      case CblasTrans:     op_a = 'T'; break;
      case CblasConjTrans: op_a = 'C'; break;
      case CblasNoTrans:   op_a = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
         CBLAS_CallFromC = 0;
         RowMajorStrg = 0;
         return;
   }

   switch ( TransB )
   {
      case CblasTrans:     op_b = 'T'; break;
      case CblasConjTrans: op_b = 'C'; break;
      case CblasNoTrans:   op_b = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
         CBLAS_CallFromC = 0;
         RowMajorStrg = 0;
         return;
   }

   /* In the row-major case the operands trade places, so the flag that
      accompanies the first operand handed to F77_sgemmt() is TransB's. */
   if ( row_major )
   {
      TA = op_b;
      TB = op_a;
   }
   else
   {
      TA = op_a;
      TB = op_b;
   }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if ( row_major )
   {
      F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B,
                 &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
   }
   else
   {
      F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, A,
                 &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc);
   }

   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
|
||||
#endif
|
||||
166
frame/compat/cblas/src/cblas_zgemmt.c
Normal file
166
frame/compat/cblas/src/cblas_zgemmt.c
Normal file
@@ -0,0 +1,166 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
cblas_zgemmt.c
|
||||
Based off of cblas_zgemm.c.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
   cblas_zgemmt

   C-callable front-end for the double-precision complex gemmt operation.
   The CBLAS enum arguments are validated, converted to one-character
   flags, and the request is forwarded to F77_zgemmt().

   Order   storage order of the operands (CblasColMajor or CblasRowMajor).
   Uplo    which triangle of C is referenced (CblasUpper or CblasLower).
   TransA  CblasNoTrans, CblasTrans or CblasConjTrans, applied to A.
   TransB  CblasNoTrans, CblasTrans or CblasConjTrans, applied to B.
   M, K    problem dimensions, forwarded by address.
   alpha   pointer to the scalar; cast to dcomplex* when forwarded.
   A, lda  first operand (cast to dcomplex*) and its leading dimension.
   B, ldb  second operand (cast to dcomplex*) and its leading dimension.
   beta    pointer to the scalar; cast to dcomplex* when forwarded.
   C, ldc  output matrix (cast to dcomplex*) and its leading dimension.

   For CblasRowMajor the request is re-expressed for F77_zgemmt() by
   exchanging A and B (together with their transposition flags and leading
   dimensions) and by flipping Uplo.

   An invalid Order, Uplo, TransA or TransB is reported through
   cblas_xerbla() with argument positions 1, 2, 3 and 4 respectively, after
   which the function returns without calling F77_zgemmt().  On every exit
   path CBLAS_CallFromC and RowMajorStrg are reset to 0.
*/
void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K,
                  const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc)
{
   char UL, TA, TB;
   char op_a, op_b;   /* flags derived from TransA / TransB before any swap */
   int  row_major;
#ifdef F77_CHAR
   F77_CHAR F77_UL, F77_TA, F77_TB;
#else
   #define F77_UL &UL
   #define F77_TA &TA
   #define F77_TB &TB
#endif

#ifdef F77_INT
   F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_M M
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;

   RowMajorStrg    = 0;
   CBLAS_CallFromC = 1;

   row_major = ( Order == CblasRowMajor );

   /* Reject an unknown storage order up front (RowMajorStrg is still 0). */
   if ( Order != CblasColMajor && !row_major )
   {
      cblas_xerbla(1, "cblas_zgemmt", "Illegal Order setting, %d\n", Order);
      CBLAS_CallFromC = 0;
      RowMajorStrg = 0;
      return;
   }

   /* The row-major flag is raised before the remaining arguments are
      validated, so it is already set when cblas_xerbla() is called. */
   if ( row_major ) RowMajorStrg = 1;

   /* Uplo: passed through for column-major, flipped for row-major. */
   switch ( Uplo )
   {
      case CblasUpper:
         UL = ( row_major ? 'L' : 'U' );
         break;
      case CblasLower:
         UL = ( row_major ? 'U' : 'L' );
         break;
      default:
         cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
         CBLAS_CallFromC = 0;
         RowMajorStrg = 0;
         return;
   }

   switch ( TransA )
   {
      case CblasTrans:     op_a = 'T'; break;
      case CblasConjTrans: op_a = 'C'; break;
      case CblasNoTrans:   op_a = 'N'; break;
      default:
         cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
         CBLAS_CallFromC = 0;
         RowMajorStrg = 0;
         return;
   }

   switch ( TransB )
   {
      case CblasTrans:     op_b = 'T'; break;
      case CblasConjTrans: op_b = 'C'; break;
      case CblasNoTrans:   op_b = 'N'; break;
      default:
         cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
         CBLAS_CallFromC = 0;
         RowMajorStrg = 0;
         return;
   }

   /* In the row-major case the operands trade places, so the flag that
      accompanies the first operand handed to F77_zgemmt() is TransB's. */
   if ( row_major )
   {
      TA = op_b;
      TB = op_a;
   }
   else
   {
      TA = op_a;
      TB = op_b;
   }

#ifdef F77_CHAR
   F77_UL = C2F_CHAR(&UL);
   F77_TA = C2F_CHAR(&TA);
   F77_TB = C2F_CHAR(&TB);
#endif

   if ( row_major )
   {
      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B,
                 &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
   }
   else
   {
      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)A,
                 &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
   }

   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
|
||||
#endif
|
||||
92
frame/compat/check/bla_gemmt_check.h
Normal file
92
frame/compat/check/bla_gemmt_check.h
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
/*
   bla_gemmt_check

   Argument check for the BLAS-compatibility gemmt entry points.  The
   macro expands to a block that inspects the character flags and the
   dimensions (all passed as pointers) and, on the first violation found,
   builds the upper-cased routine name from dt_str and op_str, calls
   xerbla with the offending argument position, and returns from the
   enclosing function.

   Positions reported: 1 (uploc), 2 (transa), 3 (transb), 4 (m), 5 (k),
   8 (lda), 10 (ldb), 13 (ldc).
*/
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, notb; \
	f77_int conja, conjb; \
	f77_int ta, tb; \
	f77_int lower, upper; \
	f77_int nrowa, nrowb; \
\
	/* Classify the transposition flags. */ \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	/* Classify the triangle selector. */ \
	lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
	/* Row counts of A and B as stored; these bound lda and ldb below. */ \
	nrowa = ( nota ? *m : *k ); \
	nrowb = ( notb ? *k : *m ); \
\
	/* Only the first failing argument is reported. */ \
	if      ( !( lower || upper ) )        { info = 1;  } \
	else if ( !( nota || conja || ta ) )   { info = 2;  } \
	else if ( !( notb || conjb || tb ) )   { info = 3;  } \
	else if ( *m < 0 )                     { info = 4;  } \
	else if ( *k < 0 )                     { info = 5;  } \
	else if ( *lda < bli_max( 1, nrowa ) ) { info = 8;  } \
	else if ( *ldb < bli_max( 1, nrowb ) ) { info = 10; } \
	else if ( *ldc < bli_max( 1, *m ) )    { info = 13; } \
\
	if ( info ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		/* Routine name: datatype prefix followed by the operation name,
		   left-justified in a field of width five, then upper-cased. */ \
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}
|
||||
|
||||
#endif
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -250,9 +250,9 @@ CNTX_INIT_PROTS( generic )
|
||||
|
||||
// -- AMD64 architectures --
|
||||
|
||||
//#ifdef BLIS_KERNELS_ZEN2
|
||||
//#include "bli_kernels_zen2.h"
|
||||
//#endif
|
||||
#ifdef BLIS_KERNELS_ZEN2
|
||||
#include "bli_kernels_zen2.h"
|
||||
#endif
|
||||
#ifdef BLIS_KERNELS_ZEN
|
||||
#include "bli_kernels_zen.h"
|
||||
#endif
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -918,6 +918,7 @@ typedef enum
|
||||
// bli_l3_ind.c to index into arrays.
|
||||
//
|
||||
BLIS_GEMM = 0,
|
||||
BLIS_GEMMT,
|
||||
BLIS_HEMM,
|
||||
BLIS_HERK,
|
||||
BLIS_HER2K,
|
||||
@@ -931,7 +932,7 @@ typedef enum
|
||||
BLIS_NOID
|
||||
} opid_t;
|
||||
|
||||
#define BLIS_NUM_LEVEL3_OPS 10
|
||||
#define BLIS_NUM_LEVEL3_OPS 11
|
||||
|
||||
|
||||
// -- Blocksize ID type --
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -37,48 +37,49 @@
|
||||
|
||||
static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
|
||||
{
|
||||
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */
|
||||
/* 3mh */ { bli_gemm3mh, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh,
|
||||
/* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */
|
||||
/* 3mh */ { bli_gemm3mh, NULL, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh,
|
||||
bli_syrk3mh, bli_syr2k3mh, bli_trmm33mh, NULL, NULL },
|
||||
/* 3m1 */ { bli_gemm3m1, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1,
|
||||
/* 3m1 */ { bli_gemm3m1, NULL, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1,
|
||||
bli_syrk3m1, bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 },
|
||||
/* 4mh */ { bli_gemm4mh, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh,
|
||||
/* 4mh */ { bli_gemm4mh, NULL, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh,
|
||||
bli_syrk4mh, bli_syr2k4mh, bli_trmm34mh, NULL, NULL },
|
||||
/* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL,
|
||||
/* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL, NULL,
|
||||
NULL, NULL, NULL, NULL, NULL },
|
||||
/* 4m1 */ { bli_gemm4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1,
|
||||
/* 4m1 */ { bli_gemm4m1, NULL, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1,
|
||||
bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 },
|
||||
/* 1m */ { bli_gemm1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m,
|
||||
/* 1m */ { bli_gemm1m, NULL, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m,
|
||||
bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m },
|
||||
/* nat */ { bli_gemmnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat,
|
||||
/* nat */ { bli_gemmnat, bli_gemmtnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat,
|
||||
bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat },
|
||||
};
|
||||
|
||||
//
|
||||
// NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2.
|
||||
//
|
||||
// BLIS provides APIs to modify this state during runtime. So, one application thread
|
||||
// can modify the state, before another starts the corresponding BLIS operation.
|
||||
// This is solved by making the induced method status array local to threads.
|
||||
// BLIS provides APIs to modify this state during runtime. So, it's possible for one
|
||||
// application thread to modify the state before another starts the corresponding
|
||||
// BLIS operation. This is solved by making the induced method status array local to
|
||||
// threads.
|
||||
|
||||
static BLIS_THREAD_LOCAL
|
||||
bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
|
||||
{
|
||||
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */
|
||||
/* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */
|
||||
/* c z */
|
||||
/* 3mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
/* 3mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
|
||||
/* 3m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
/* 3m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
|
||||
/* 4mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
/* 4mh */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
|
||||
/* 4mb */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
/* 4mb */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
|
||||
/* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
/* 4m1 */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
|
||||
/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
|
||||
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
|
||||
/* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE},
|
||||
/* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE},
|
||||
{TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} },
|
||||
};
|
||||
|
||||
@@ -99,6 +100,7 @@ bool PASTEMAC(opname,ind_has_avail)( num_t dt )
|
||||
*/
|
||||
|
||||
GENFUNC( gemm, BLIS_GEMM )
|
||||
GENFUNC( gemmt, BLIS_GEMMT )
|
||||
GENFUNC( hemm, BLIS_HEMM )
|
||||
GENFUNC( herk, BLIS_HERK )
|
||||
GENFUNC( her2k, BLIS_HER2K )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -44,6 +45,7 @@ void_fp PASTEMAC(opname,ind_get_avail)( num_t dt );
|
||||
/*bool PASTEMAC(opname,ind_has_avail)( num_t dt ); */
|
||||
|
||||
GENPROT( gemm )
|
||||
GENPROT( gemmt )
|
||||
GENPROT( hemm )
|
||||
GENPROT( herk )
|
||||
GENPROT( her2k )
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -67,6 +67,7 @@ void PASTEMAC(opname,imeth) \
|
||||
}
|
||||
|
||||
GENFRONT( gemm, ind )
|
||||
GENFRONT( gemmt, ind )
|
||||
GENFRONT( her2k, ind )
|
||||
GENFRONT( syr2k, ind )
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -41,6 +42,7 @@
|
||||
#define GENPROT( imeth ) \
|
||||
\
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(gemm,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(gemmt,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(hemm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(herk,imeth) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(her2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -41,7 +41,7 @@
|
||||
// of executing one iteration of a for loop, plus the overhead of calling a
|
||||
// function that does nothing (ie: the _cntx_init_stage() function).
|
||||
|
||||
// -- gemm/her2k/syr2k ---------------------------------------------------------
|
||||
// -- gemm/her2k/syr2k/gemmt ---------------------------------------------------
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, cname, imeth ) \
|
||||
@@ -80,6 +80,7 @@ void PASTEMAC(opname,imeth) \
|
||||
#ifndef BLIS_ENABLE_SANDBOX
|
||||
GENFRONT( gemm, gemm, nat )
|
||||
#endif
|
||||
GENFRONT( gemmt, gemm, nat )
|
||||
GENFRONT( her2k, gemm, nat )
|
||||
GENFRONT( syr2k, gemm, nat )
|
||||
|
||||
|
||||
@@ -80,46 +80,56 @@ void bli_sgemm_armv7a_int_4x4
|
||||
// Vector for column 3
|
||||
float32x4_t cv3;
|
||||
|
||||
if( rs_c == 1 )
|
||||
if ( *beta != 0.0F )
|
||||
{
|
||||
// Load column 0
|
||||
cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c );
|
||||
|
||||
// Load column 1
|
||||
cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c );
|
||||
|
||||
// Load column 2
|
||||
cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c );
|
||||
|
||||
// Load column 3
|
||||
cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c );
|
||||
}
|
||||
if ( rs_c == 1 )
|
||||
{
|
||||
// Load column 0
|
||||
cv0 = vld1q_f32( c + 0*rs_c + 0*cs_c );
|
||||
|
||||
// Load column 1
|
||||
cv1 = vld1q_f32( c + 0*rs_c + 1*cs_c );
|
||||
|
||||
// Load column 2
|
||||
cv2 = vld1q_f32( c + 0*rs_c + 2*cs_c );
|
||||
|
||||
// Load column 3
|
||||
cv3 = vld1q_f32( c + 0*rs_c + 3*cs_c );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Load column 0
|
||||
cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
|
||||
cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
|
||||
cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
|
||||
cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
|
||||
|
||||
// Load column 1
|
||||
cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
|
||||
cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
|
||||
cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
|
||||
cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
|
||||
|
||||
// Load column 2
|
||||
cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
|
||||
cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
|
||||
cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
|
||||
cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
|
||||
|
||||
// Load column 3
|
||||
cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
|
||||
cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
|
||||
cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
|
||||
cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
|
||||
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Load column 0
|
||||
cv0 = vld1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
|
||||
cv0 = vld1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
|
||||
cv0 = vld1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
|
||||
cv0 = vld1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
|
||||
|
||||
// Load column 1
|
||||
cv1 = vld1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
|
||||
cv1 = vld1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
|
||||
cv1 = vld1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
|
||||
cv1 = vld1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
|
||||
|
||||
// Load column 2
|
||||
cv2 = vld1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
|
||||
cv2 = vld1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
|
||||
cv2 = vld1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
|
||||
cv2 = vld1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
|
||||
|
||||
// Load column 3
|
||||
cv3 = vld1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
|
||||
cv3 = vld1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
|
||||
cv3 = vld1q_lane_f32( c + 2*rs_c + 3*cs_c, cv3, 2);
|
||||
cv3 = vld1q_lane_f32( c + 3*rs_c + 3*cs_c, cv3, 3);
|
||||
|
||||
cv0 = vmovq_n_f32( 0.0 );
|
||||
cv1 = vmovq_n_f32( 0.0 );
|
||||
cv2 = vmovq_n_f32( 0.0 );
|
||||
cv3 = vmovq_n_f32( 0.0 );
|
||||
}
|
||||
|
||||
// Vector for accumulating column 0
|
||||
@@ -142,15 +152,15 @@ void bli_sgemm_armv7a_int_4x4
|
||||
// Initialize vector to 0.0
|
||||
abv3 = vmovq_n_f32( 0.0 );
|
||||
|
||||
for ( i = 0; i < k_iter; ++i )
|
||||
{
|
||||
for ( i = 0; i < k_iter; ++i )
|
||||
{
|
||||
// Begin iter 0
|
||||
av1 = vld1q_f32( a );
|
||||
av1 = vld1q_f32( a );
|
||||
|
||||
__builtin_prefetch( a + 224 );
|
||||
__builtin_prefetch( b + 224 );
|
||||
|
||||
bv1 = vld1q_f32( b );
|
||||
|
||||
bv1 = vld1q_f32( b );
|
||||
|
||||
abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
|
||||
abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
|
||||
@@ -158,24 +168,24 @@ void bli_sgemm_armv7a_int_4x4
|
||||
abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );
|
||||
|
||||
|
||||
av2 = vld1q_f32( a+4 );
|
||||
av2 = vld1q_f32( a+4 );
|
||||
|
||||
//__builtin_prefetch( a + 116 );
|
||||
//__builtin_prefetch( b + 116 );
|
||||
|
||||
bv2 = vld1q_f32( b+4 );
|
||||
|
||||
bv2 = vld1q_f32( b+4 );
|
||||
|
||||
abv0 = vmlaq_lane_f32( abv0, av2, vget_low_f32(bv2), 0 );
|
||||
abv1 = vmlaq_lane_f32( abv1, av2, vget_low_f32(bv2), 1 );
|
||||
abv2 = vmlaq_lane_f32( abv2, av2, vget_high_f32(bv2), 0 );
|
||||
abv3 = vmlaq_lane_f32( abv3, av2, vget_high_f32(bv2), 1 );
|
||||
|
||||
av3 = vld1q_f32( a+8 );
|
||||
av3 = vld1q_f32( a+8 );
|
||||
|
||||
//__builtin_prefetch( a + 120 );
|
||||
//__builtin_prefetch( b + 120 );
|
||||
|
||||
bv3 = vld1q_f32( b+8 );
|
||||
|
||||
bv3 = vld1q_f32( b+8 );
|
||||
|
||||
abv0 = vmlaq_lane_f32( abv0, av3, vget_low_f32(bv3), 0 );
|
||||
abv1 = vmlaq_lane_f32( abv1, av3, vget_low_f32(bv3), 1 );
|
||||
@@ -183,12 +193,12 @@ void bli_sgemm_armv7a_int_4x4
|
||||
abv3 = vmlaq_lane_f32( abv3, av3, vget_high_f32(bv3), 1 );
|
||||
|
||||
|
||||
av4 = vld1q_f32( a+12);
|
||||
av4 = vld1q_f32( a+12);
|
||||
|
||||
//__builtin_prefetch( a + 124 );
|
||||
//__builtin_prefetch( b + 124 );
|
||||
|
||||
bv4 = vld1q_f32( b+12);
|
||||
|
||||
bv4 = vld1q_f32( b+12);
|
||||
|
||||
abv0 = vmlaq_lane_f32( abv0, av4, vget_low_f32(bv4), 0 );
|
||||
abv1 = vmlaq_lane_f32( abv1, av4, vget_low_f32(bv4), 1 );
|
||||
@@ -197,71 +207,85 @@ void bli_sgemm_armv7a_int_4x4
|
||||
|
||||
|
||||
|
||||
a += 16;
|
||||
b += 16;
|
||||
}
|
||||
a += 16;
|
||||
b += 16;
|
||||
}
|
||||
|
||||
for ( i = 0; i < k_left; ++i )
|
||||
{
|
||||
av1 = vld1q_f32( a );
|
||||
for ( i = 0; i < k_left; ++i )
|
||||
{
|
||||
av1 = vld1q_f32( a );
|
||||
|
||||
__builtin_prefetch( a + 112 );
|
||||
__builtin_prefetch( b + 112 );
|
||||
|
||||
bv1 = vld1q_f32( b );
|
||||
|
||||
bv1 = vld1q_f32( b );
|
||||
|
||||
abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 );
|
||||
abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 );
|
||||
abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 );
|
||||
abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 );
|
||||
|
||||
a += 4;
|
||||
b += 4;
|
||||
a += 4;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
__builtin_prefetch( a_next );
|
||||
__builtin_prefetch( b_next );
|
||||
|
||||
cv0 = vmulq_n_f32( cv0, *beta );
|
||||
cv1 = vmulq_n_f32( cv1, *beta );
|
||||
cv2 = vmulq_n_f32( cv2, *beta );
|
||||
cv3 = vmulq_n_f32( cv3, *beta );
|
||||
if ( *beta != 0.0F )
|
||||
{
|
||||
// Multiply C by beta and then accumulate alpha * A * B.
|
||||
cv0 = vmulq_n_f32( cv0, *beta );
|
||||
cv1 = vmulq_n_f32( cv1, *beta );
|
||||
cv2 = vmulq_n_f32( cv2, *beta );
|
||||
cv3 = vmulq_n_f32( cv3, *beta );
|
||||
|
||||
cv0 = vmlaq_f32( cv0, abv0, alphav );
|
||||
cv1 = vmlaq_f32( cv1, abv1, alphav );
|
||||
cv2 = vmlaq_f32( cv2, abv2, alphav );
|
||||
cv3 = vmlaq_f32( cv3, abv3, alphav );
|
||||
cv0 = vmlaq_f32( cv0, abv0, alphav );
|
||||
cv1 = vmlaq_f32( cv1, abv1, alphav );
|
||||
cv2 = vmlaq_f32( cv2, abv2, alphav );
|
||||
cv3 = vmlaq_f32( cv3, abv3, alphav );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Since beta = 0, skip straight to accumulating alpha * A * B.
|
||||
// Note: C (cv?) was initialized to zero above.
|
||||
cv0 = vmlaq_f32( cv0, abv0, alphav );
|
||||
cv1 = vmlaq_f32( cv1, abv1, alphav );
|
||||
cv2 = vmlaq_f32( cv2, abv2, alphav );
|
||||
cv3 = vmlaq_f32( cv3, abv3, alphav );
|
||||
}
|
||||
|
||||
if( rs_c == 1 )
|
||||
if ( rs_c == 1 )
|
||||
{
|
||||
// Store column 0
|
||||
vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 );
|
||||
vst1q_f32( c + 0*rs_c + 0*cs_c, cv0 );
|
||||
// Store column 1
|
||||
vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 );
|
||||
vst1q_f32( c + 0*rs_c + 1*cs_c, cv1 );
|
||||
// Store column 2
|
||||
vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 );
|
||||
vst1q_f32( c + 0*rs_c + 2*cs_c, cv2 );
|
||||
// Store column 3
|
||||
vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 );
|
||||
vst1q_f32( c + 0*rs_c + 3*cs_c, cv3 );
|
||||
}
|
||||
else{
|
||||
else
|
||||
{
|
||||
// Store column 0
|
||||
vst1q_lane_f32( c + 0*rs_c + 0*cs_c, cv0, 0);
|
||||
vst1q_lane_f32( c + 1*rs_c + 0*cs_c, cv0, 1);
|
||||
vst1q_lane_f32( c + 2*rs_c + 0*cs_c, cv0, 2);
|
||||
vst1q_lane_f32( c + 3*rs_c + 0*cs_c, cv0, 3);
|
||||
|
||||
|
||||
// Store column 1
|
||||
vst1q_lane_f32( c + 0*rs_c + 1*cs_c, cv1, 0);
|
||||
vst1q_lane_f32( c + 1*rs_c + 1*cs_c, cv1, 1);
|
||||
vst1q_lane_f32( c + 2*rs_c + 1*cs_c, cv1, 2);
|
||||
vst1q_lane_f32( c + 3*rs_c + 1*cs_c, cv1, 3);
|
||||
|
||||
|
||||
// Store column 2
|
||||
vst1q_lane_f32( c + 0*rs_c + 2*cs_c, cv2, 0);
|
||||
vst1q_lane_f32( c + 1*rs_c + 2*cs_c, cv2, 1);
|
||||
vst1q_lane_f32( c + 2*rs_c + 2*cs_c, cv2, 2);
|
||||
vst1q_lane_f32( c + 3*rs_c + 2*cs_c, cv2, 3);
|
||||
|
||||
|
||||
// Store column 3
|
||||
vst1q_lane_f32( c + 0*rs_c + 3*cs_c, cv3, 0);
|
||||
vst1q_lane_f32( c + 1*rs_c + 3*cs_c, cv3, 1);
|
||||
|
||||
330
kernels/zen/1/bli_copyv_zen_int.c
Normal file
330
kernels/zen/1/bli_copyv_zen_int.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_scopyv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t num_elem_per_reg = 8;
|
||||
dim_t i = 0;
|
||||
__m256 xv[16];
|
||||
|
||||
// If the vector dimension is zero return early.
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
#if 0
|
||||
PRAGMA_SIMD
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
y[i] = x[i];
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
memcpy(y, x, n << 2);
|
||||
#endif
|
||||
#if 1
|
||||
|
||||
// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
|
||||
// for example if n = 255
|
||||
// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
|
||||
// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
|
||||
// n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on.
|
||||
for ( i = 0; i < (n & (~0x7F)); i += 128 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
|
||||
xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
|
||||
xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
|
||||
xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
|
||||
xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
|
||||
xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
|
||||
xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
|
||||
xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
|
||||
xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
|
||||
|
||||
y += 128;
|
||||
x += 128;
|
||||
}
|
||||
for ( ; i < (n & (~0x3F)); i += 64 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
|
||||
|
||||
y += 64;
|
||||
x += 64;
|
||||
}
|
||||
for ( ; i < (n & (~0x1F)); i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
|
||||
|
||||
y += 32;
|
||||
x += 32;
|
||||
}
|
||||
for ( ; i < (n & (~0x0F)); i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
|
||||
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
|
||||
|
||||
y += 16;
|
||||
x += 16;
|
||||
}
|
||||
for ( ; i < (n & (~0x07)); i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
|
||||
_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
|
||||
y += 8;
|
||||
x += 8;
|
||||
}
|
||||
for ( ; i < n; ++i )
|
||||
{
|
||||
*y++ = *x++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( dim_t i = 0; i < n; ++i )
|
||||
{
|
||||
*y = *x;
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_dcopyv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t num_elem_per_reg = 4;
|
||||
dim_t i = 0;
|
||||
__m256d xv[16];
|
||||
|
||||
// If the vector dimension is zero return early.
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
#if 0
|
||||
PRAGMA_SIMD
|
||||
for (i = 0; i < n; ++i)
|
||||
{
|
||||
y[i] = x[i];
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
memcpy(y, x, n << 3);
|
||||
#endif
|
||||
#if 1
|
||||
// n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64,
|
||||
// the copy operation will be done for the multiples of 64
|
||||
for ( i = 0; i < (n & (~0x3F)); i += 64 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
|
||||
xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
|
||||
xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
|
||||
xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
|
||||
xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
|
||||
xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
|
||||
xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
|
||||
xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
|
||||
xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
|
||||
y += num_elem_per_reg * 16;
|
||||
x += num_elem_per_reg * 16;
|
||||
}
|
||||
for ( ; i < (n & (~0x1F)); i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
|
||||
xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
|
||||
xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
|
||||
xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
|
||||
xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
|
||||
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
|
||||
|
||||
y += num_elem_per_reg * 8;
|
||||
x += num_elem_per_reg * 8;
|
||||
}
|
||||
for ( ; i < (n & (~0xF)); i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
|
||||
xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
|
||||
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
|
||||
|
||||
y += num_elem_per_reg * 4;
|
||||
x += num_elem_per_reg * 4;
|
||||
}
|
||||
for ( ; i < (n & (~0x07)); i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
|
||||
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
|
||||
|
||||
y += num_elem_per_reg * 2;
|
||||
x += num_elem_per_reg * 2;
|
||||
}
|
||||
for ( ; i < (n & (~0x03)); i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
|
||||
_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
|
||||
y += num_elem_per_reg;
|
||||
x += num_elem_per_reg;
|
||||
}
|
||||
for ( ; i < n; ++i )
|
||||
{
|
||||
*y++ = *x++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
*y = *x;
|
||||
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -73,11 +73,11 @@ void bli_sdotv_zen_int10
|
||||
float* restrict x0;
|
||||
float* restrict y0;
|
||||
|
||||
float rho0;
|
||||
float rho0 = 0.0;
|
||||
|
||||
__m256 xv[10];
|
||||
__m256 yv[10];
|
||||
v8sf_t rhov[2];
|
||||
v8sf_t rhov[10];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
@@ -96,8 +96,16 @@ void bli_sdotv_zen_int10
|
||||
{
|
||||
rhov[0].v = _mm256_setzero_ps();
|
||||
rhov[1].v = _mm256_setzero_ps();
|
||||
rhov[2].v = _mm256_setzero_ps();
|
||||
rhov[3].v = _mm256_setzero_ps();
|
||||
rhov[4].v = _mm256_setzero_ps();
|
||||
rhov[5].v = _mm256_setzero_ps();
|
||||
rhov[6].v = _mm256_setzero_ps();
|
||||
rhov[7].v = _mm256_setzero_ps();
|
||||
rhov[8].v = _mm256_setzero_ps();
|
||||
rhov[9].v = _mm256_setzero_ps();
|
||||
|
||||
for ( i = 0; (i + 79) < n; i += 80 )
|
||||
for ( i = 0 ; (i + 79) < n; i += 80 )
|
||||
{
|
||||
// 80 elements will be processed per loop; 10 FMAs will run per loop.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
@@ -124,19 +132,25 @@ void bli_sdotv_zen_int10
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[1].v );
|
||||
rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v );
|
||||
rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v );
|
||||
rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v );
|
||||
rhov[5].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[5].v );
|
||||
rhov[6].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[6].v );
|
||||
rhov[7].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[7].v );
|
||||
rhov[8].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[8].v );
|
||||
rhov[9].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[9].v );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
y0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
rhov[0].v += rhov[5].v;
|
||||
rhov[1].v += rhov[6].v;
|
||||
rhov[2].v += rhov[7].v;
|
||||
rhov[3].v += rhov[8].v;
|
||||
rhov[4].v += rhov[9].v;
|
||||
|
||||
for ( ; (i + 39) < n; i += 40 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
@@ -153,34 +167,17 @@ void bli_sdotv_zen_int10
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
|
||||
rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v );
|
||||
rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v );
|
||||
rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
y0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 31) < n; i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
rhov[0].v += rhov[2].v;
|
||||
rhov[1].v += rhov[3].v;
|
||||
rhov[0].v += rhov[4].v;
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
@@ -197,6 +194,8 @@ void bli_sdotv_zen_int10
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
rhov[0].v += rhov[1].v;
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
@@ -211,19 +210,15 @@ void bli_sdotv_zen_int10
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
rhov[0].f[0] += x0[i] * y0[i];
|
||||
rho0 += (*x0) * (*y0);
|
||||
x0 += 1;
|
||||
y0 += 1;
|
||||
}
|
||||
|
||||
v8sf_t onev;
|
||||
|
||||
onev.v = _mm256_set1_ps( 1.0f );
|
||||
|
||||
rhov[0].v = _mm256_dp_ps( rhov[0].v, onev.v, 0xf1 );
|
||||
rhov[1].v = _mm256_dp_ps( rhov[1].v, onev.v, 0xf1 );
|
||||
|
||||
// Manually add the results from above to finish the sum.
|
||||
rho0 += rhov[0].f[0] + rhov[0].f[4];
|
||||
rho0 += rhov[1].f[0] + rhov[1].f[4];
|
||||
rho0 += rhov[0].f[0] + rhov[0].f[1] +
|
||||
rhov[0].f[2] + rhov[0].f[3] +
|
||||
rhov[0].f[4] + rhov[0].f[5] +
|
||||
rhov[0].f[6] + rhov[0].f[7];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
@@ -269,11 +264,11 @@ void bli_ddotv_zen_int10
|
||||
double* restrict x0;
|
||||
double* restrict y0;
|
||||
|
||||
double rho0;
|
||||
double rho0 = 0.0;
|
||||
|
||||
__m256d xv[10];
|
||||
__m256d yv[10];
|
||||
v4df_t rhov[2];
|
||||
v4df_t rhov[10];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
@@ -292,6 +287,14 @@ void bli_ddotv_zen_int10
|
||||
{
|
||||
rhov[0].v = _mm256_setzero_pd();
|
||||
rhov[1].v = _mm256_setzero_pd();
|
||||
rhov[2].v = _mm256_setzero_pd();
|
||||
rhov[3].v = _mm256_setzero_pd();
|
||||
rhov[4].v = _mm256_setzero_pd();
|
||||
rhov[5].v = _mm256_setzero_pd();
|
||||
rhov[6].v = _mm256_setzero_pd();
|
||||
rhov[7].v = _mm256_setzero_pd();
|
||||
rhov[8].v = _mm256_setzero_pd();
|
||||
rhov[9].v = _mm256_setzero_pd();
|
||||
|
||||
for ( i = 0; (i + 39) < n; i += 40 )
|
||||
{
|
||||
@@ -320,19 +323,25 @@ void bli_ddotv_zen_int10
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[1].v );
|
||||
rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
|
||||
rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
|
||||
rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v );
|
||||
rhov[5].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[5].v );
|
||||
rhov[6].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[6].v );
|
||||
rhov[7].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[7].v );
|
||||
rhov[8].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[8].v );
|
||||
rhov[9].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[9].v );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
y0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
rhov[0].v += rhov[5].v;
|
||||
rhov[1].v += rhov[6].v;
|
||||
rhov[2].v += rhov[7].v;
|
||||
rhov[3].v += rhov[8].v;
|
||||
rhov[4].v += rhov[9].v;
|
||||
|
||||
for ( ; (i + 19) < n; i += 20 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
@@ -349,14 +358,16 @@ void bli_ddotv_zen_int10
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
|
||||
rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
|
||||
rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
|
||||
rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
y0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
rhov[0].v += rhov[4].v;
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
@@ -371,13 +382,16 @@ void bli_ddotv_zen_int10
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
|
||||
rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v );
|
||||
rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
rhov[0].v += rhov[2].v;
|
||||
rhov[1].v += rhov[3].v;
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
@@ -393,6 +407,8 @@ void bli_ddotv_zen_int10
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
rhov[0].v += rhov[1].v;
|
||||
|
||||
for ( ; (i + 3) < n; i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
@@ -407,12 +423,14 @@ void bli_ddotv_zen_int10
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
rhov[0].d[0] += x0[i] * y0[i];
|
||||
rho0 += (*x0) * (*y0);
|
||||
|
||||
x0 += 1;
|
||||
y0 += 1;
|
||||
}
|
||||
|
||||
// Manually add the results from above to finish the sum.
|
||||
rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
|
||||
rho0 += rhov[1].d[0] + rhov[1].d[1] + rhov[1].d[2] + rhov[1].d[3];
|
||||
rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
|
||||
@@ -80,9 +80,18 @@ void bli_sscalv_zen_int10
|
||||
// If alpha is zero, use setv.
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) )
|
||||
{
|
||||
float* zero = bli_s0;
|
||||
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
|
||||
|
||||
float* zero = bli_s0;
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
bli_ssetv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#else
|
||||
ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
@@ -91,6 +100,7 @@ void bli_sscalv_zen_int10
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -270,8 +280,18 @@ void bli_dscalv_zen_int10
|
||||
// If alpha is zero, use setv.
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) )
|
||||
{
|
||||
double* zero = bli_d0;
|
||||
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
|
||||
double* zero = bli_d0;
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
bli_dsetv_zen_int
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#else
|
||||
dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
(
|
||||
@@ -281,6 +301,7 @@ void bli_dscalv_zen_int10
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
228
kernels/zen/1/bli_setv_zen_int.c
Normal file
228
kernels/zen/1/bli_setv_zen_int.c
Normal file
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_ssetv_zen_int
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t num_elem_per_reg = 8;
|
||||
dim_t i = 0;
|
||||
__m256 alphav;
|
||||
|
||||
// If the vector dimension is zero return early.
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
if ( incx == 1 )
|
||||
{
|
||||
alphav = _mm256_broadcast_ss( alpha );
|
||||
|
||||
// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
|
||||
// for example if n = 255
|
||||
// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
|
||||
// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
|
||||
// n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on.
|
||||
for ( i = 0; i < (n & (~0x7F)); i += 128 )
|
||||
{
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 2, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 3, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 4, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 5, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 6, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 7, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 8, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 9, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 10, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 11, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 12, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 13, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 14, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 15, alphav);
|
||||
|
||||
x += 128;
|
||||
}
|
||||
for ( ; i < (n & (~0x3F)); i += 64 )
|
||||
{
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 2, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 3, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 4, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 5, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 6, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 7, alphav);
|
||||
|
||||
x += 64;
|
||||
}
|
||||
for ( ; i < (n & (~0x1F)); i += 32 )
|
||||
{
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 2, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 3, alphav);
|
||||
|
||||
x += 32;
|
||||
}
|
||||
for ( ; i < (n & (~0x0F)); i += 16 )
|
||||
{
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 1, alphav);
|
||||
|
||||
x += 16;
|
||||
}
|
||||
for ( ; i < (n & (~0x07)); i += 8 )
|
||||
{
|
||||
_mm256_storeu_ps(x + num_elem_per_reg * 0, alphav);
|
||||
x += 8;
|
||||
}
|
||||
for ( ; i < n; ++i )
|
||||
{
|
||||
*x++ = *alpha;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( dim_t i = 0; i < n; ++i )
|
||||
{
|
||||
*x = *alpha;
|
||||
x += incx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bli_dsetv_zen_int
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t num_elem_per_reg = 4;
|
||||
dim_t i = 0;
|
||||
__m256d alphav;
|
||||
|
||||
// If the vector dimension is zero return early.
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
if ( incx == 1 )
|
||||
{
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav = _mm256_broadcast_sd( alpha );
|
||||
|
||||
// n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64,
|
||||
// the copy operation will be done for the multiples of 64
|
||||
for ( i = 0; i < (n & (~0x3F)); i += 64 )
|
||||
{
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 2, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 3, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 4, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 5, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 6, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 7, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 8, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 9, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 10, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 11, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 12, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 13, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 14, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 15, alphav);
|
||||
|
||||
x += num_elem_per_reg * 16;
|
||||
}
|
||||
for ( ; i < (n & (~0x1F)); i += 32 )
|
||||
{
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 2, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 3, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 4, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 5, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 6, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 7, alphav);
|
||||
|
||||
x += num_elem_per_reg * 8;
|
||||
}
|
||||
for ( ; i < (n & (~0xF)); i += 16 )
|
||||
{
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 2, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 3, alphav);
|
||||
|
||||
x += num_elem_per_reg * 4;
|
||||
}
|
||||
for ( ; i < (n & (~0x07)); i += 8 )
|
||||
{
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 1, alphav);
|
||||
|
||||
x += num_elem_per_reg * 2;
|
||||
}
|
||||
for ( ; i < (n & (~0x03)); i += 4 )
|
||||
{
|
||||
_mm256_storeu_pd(x + num_elem_per_reg * 0, alphav);
|
||||
x += num_elem_per_reg;
|
||||
}
|
||||
for ( ; i < n; ++i )
|
||||
{
|
||||
*x++ = *alpha;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
*x = *alpha;
|
||||
|
||||
x += incx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
344
kernels/zen/1/bli_swapv_zen_int8.c
Normal file
344
kernels/zen/1/bli_swapv_zen_int8.c
Normal file
@@ -0,0 +1,344 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
/* Overlay giving element-wise access to an AVX register: one 256-bit
   register (v) holds 8 single-precision elements, which can also be read
   or written individually through f[0..7]. The array member is declared
   with 64-byte alignment. */
typedef union
{
	__m256 v;
	float  f[8] __attribute__((aligned(64)));
} v8sf_t;
|
||||
|
||||
/* Overlay giving element-wise access to an AVX register: one 256-bit
   register (v) holds 4 double-precision elements, which can also be read
   or written individually through d[0..3]. The array member is declared
   with 64-byte alignment. */
typedef union
{
	__m256d v;
	double  d[4] __attribute__((aligned(64)));
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sswapv_zen_int8
|
||||
(
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
dim_t i = 0;
|
||||
|
||||
float* restrict x0;
|
||||
float* restrict y0;
|
||||
|
||||
__m256 xv[8];
|
||||
__m256 yv[8];
|
||||
|
||||
// If the vector dimension is zero, return early.
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
for ( i = 0; ( i + 63 ) < n; i += 64 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
|
||||
yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
|
||||
yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
|
||||
yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
|
||||
_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]);
|
||||
_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]);
|
||||
_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), yv[4]);
|
||||
_mm256_storeu_ps( (x0 + 5*n_elem_per_reg), yv[5]);
|
||||
_mm256_storeu_ps( (x0 + 6*n_elem_per_reg), yv[6]);
|
||||
_mm256_storeu_ps( (x0 + 7*n_elem_per_reg), yv[7]);
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
|
||||
_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]);
|
||||
_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]);
|
||||
_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), xv[4]);
|
||||
_mm256_storeu_ps( (y0 + 5*n_elem_per_reg), xv[5]);
|
||||
_mm256_storeu_ps( (y0 + 6*n_elem_per_reg), xv[6]);
|
||||
_mm256_storeu_ps( (y0 + 7*n_elem_per_reg), xv[7]);
|
||||
|
||||
x0 += 8*n_elem_per_reg;
|
||||
y0 += 8*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 31 ) < n; i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
|
||||
_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]);
|
||||
_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]);
|
||||
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
|
||||
_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]);
|
||||
_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]);
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 15 ) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]);
|
||||
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]);
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 7 ) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
y0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
PASTEMAC(s,swaps)( x[i], y[i] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
PASTEMAC(s,swaps)( (*x0), (*y0) );
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
|
||||
void bli_dswapv_zen_int8
|
||||
(
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
dim_t i = 0;
|
||||
|
||||
double* restrict x0;
|
||||
double* restrict y0;
|
||||
|
||||
__m256d xv[8];
|
||||
__m256d yv[8];
|
||||
|
||||
// If the vector dimension is zero, return early.
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
for ( ; ( i + 31 ) < n; i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
|
||||
yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
|
||||
yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
|
||||
yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
|
||||
_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]);
|
||||
_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]);
|
||||
_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), yv[4]);
|
||||
_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), yv[5]);
|
||||
_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), yv[6]);
|
||||
_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), yv[7]);
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]);
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]);
|
||||
_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), xv[4]);
|
||||
_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), xv[5]);
|
||||
_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), xv[6]);
|
||||
_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), xv[7]);
|
||||
|
||||
x0 += 8*n_elem_per_reg;
|
||||
y0 += 8*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 15 ) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]);
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]);
|
||||
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
|
||||
_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]);
|
||||
_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]);
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 7 ) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]);
|
||||
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]);
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; ( i + 3 ) < n; i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]);
|
||||
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]);
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
y0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
PASTEMAC(d,swaps)( x[i], y[i] );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
PASTEMAC(d,swaps)( (*x0), (*y0) );
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
2303
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c
Normal file
2303
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c
Normal file
File diff suppressed because it is too large
Load Diff
1756
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
Normal file
1756
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
Normal file
File diff suppressed because it is too large
Load Diff
1581
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c
Normal file
1581
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c
Normal file
File diff suppressed because it is too large
Load Diff
1658
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c
Normal file
1658
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c
Normal file
File diff suppressed because it is too large
Load Diff
1229
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
Normal file
1229
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
Normal file
File diff suppressed because it is too large
Load Diff
1196
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c
Normal file
1196
kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c
Normal file
File diff suppressed because it is too large
Load Diff
2668
kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c
Normal file
2668
kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c
Normal file
File diff suppressed because it is too large
Load Diff
1965
kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c
Normal file
1965
kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c
Normal file
File diff suppressed because it is too large
Load Diff
1869
kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c
Normal file
1869
kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c
Normal file
File diff suppressed because it is too large
Load Diff
8745
kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c
Normal file
8745
kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c
Normal file
File diff suppressed because it is too large
Load Diff
2395
kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c
Normal file
2395
kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c
Normal file
File diff suppressed because it is too large
Load Diff
3887
kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c
Normal file
3887
kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -32,6 +33,13 @@
|
||||
|
||||
*/
|
||||
|
||||
// -- level-1m --
|
||||
PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
|
||||
PACKM_KER_PROT(double, d, packm_6xk_gen_zen)
|
||||
PACKM_KER_PROT(double, d, packm_8xk_nn_zen)
|
||||
PACKM_KER_PROT(double, d, packm_6xk_nn_zen)
|
||||
|
||||
|
||||
// -- level-1v --
|
||||
|
||||
// amaxv (intrinsics)
|
||||
@@ -42,17 +50,17 @@ AMAXV_KER_PROT( double, d, amaxv_zen_int )
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int )
|
||||
AXPYV_KER_PROT( double, d, axpyv_zen_int )
|
||||
|
||||
// axpyv (intrinsics unrolled x10)
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
|
||||
AXPYV_KER_PROT( double, d, axpyv_zen_int10 )
|
||||
// axpyv (intrinsics unrolled x10)
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
|
||||
AXPYV_KER_PROT( double, d, axpyv_zen_int10 )
|
||||
|
||||
// dotv (intrinsics)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int )
|
||||
|
||||
// dotv (intrinsics, unrolled x10)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int10 )
|
||||
// dotv (intrinsics, unrolled x10)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int10 )
|
||||
|
||||
// dotxv (intrinsics)
|
||||
DOTXV_KER_PROT( float, s, dotxv_zen_int )
|
||||
@@ -62,9 +70,21 @@ DOTXV_KER_PROT( double, d, dotxv_zen_int )
|
||||
SCALV_KER_PROT( float, s, scalv_zen_int )
|
||||
SCALV_KER_PROT( double, d, scalv_zen_int )
|
||||
|
||||
// scalv (intrinsics unrolled x10)
|
||||
SCALV_KER_PROT( float, s, scalv_zen_int10 )
|
||||
SCALV_KER_PROT( double, d, scalv_zen_int10 )
|
||||
// scalv (intrinsics unrolled x10)
|
||||
SCALV_KER_PROT( float, s, scalv_zen_int10 )
|
||||
SCALV_KER_PROT( double, d, scalv_zen_int10 )
|
||||
|
||||
// swapv (intrinsics)
|
||||
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
|
||||
SWAPV_KER_PROT(double, d, swapv_zen_int8 )
|
||||
|
||||
// copyv (intrinsics)
|
||||
COPYV_KER_PROT( float, s, copyv_zen_int )
|
||||
COPYV_KER_PROT( double, d, copyv_zen_int )
|
||||
|
||||
// setv (intrinsics)
|
||||
SETV_KER_PROT(float, s, setv_zen_int)
|
||||
SETV_KER_PROT(double, d, setv_zen_int)
|
||||
|
||||
// -- level-1f --
|
||||
|
||||
@@ -76,3 +96,106 @@ AXPYF_KER_PROT( double, d, axpyf_zen_int_8 )
|
||||
DOTXF_KER_PROT( float, s, dotxf_zen_int_8 )
|
||||
DOTXF_KER_PROT( double, d, dotxf_zen_int_8 )
|
||||
|
||||
// -- level-3 sup --------------------------------------------------------------
|
||||
|
||||
// gemmsup_rv
|
||||
|
||||
//GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 )
|
||||
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 )
|
||||
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 )
|
||||
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 )
|
||||
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 )
|
||||
|
||||
// gemmsup_rv (mkernel in m dim)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m )
|
||||
// gemmsup_rv (mkernel in n dim)
|
||||
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n )
|
||||
|
||||
// gemmsup_rd
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n)
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n)
|
||||
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 )
|
||||
|
||||
// gemmsup_rv (mkernel in n dim)
|
||||
|
||||
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 )
|
||||
|
||||
4
kernels/zen2/.gitignore
vendored
4
kernels/zen2/.gitignore
vendored
@@ -1,4 +0,0 @@
|
||||
# Ignore everything in this directory
|
||||
*
|
||||
# Except this file
|
||||
!.gitignore
|
||||
599
kernels/zen2/1f/bli_axpyf_zen_int_5.c
Normal file
599
kernels/zen2/1f/bli_axpyf_zen_int_5.c
Normal file
@@ -0,0 +1,599 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "immintrin.h"
|
||||
#include "blis.h"
|
||||
|
||||
/* Union that overlays one 256-bit AVX register (__m256) with an array of
|
||||
   eight single-precision floats, so the same storage can be used either as
   a whole vector (v) or element by element (f). The array member is
   declared with 64-byte alignment. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v; // whole-register view, used with the _mm256_*_ps intrinsics
|
||||
float f[8] __attribute__((aligned(64))); // per-element view of the same 8 floats
|
||||
} v8sf_t;
|
||||
|
||||
/* Union that overlays one 256-bit AVX register (__m256d) with an array of
|
||||
 * four double-precision values, so the same storage can be used either as
 * a whole vector (v) or element by element (d). The array member is
 * declared with 64-byte alignment. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v; // whole-register view, used with the _mm256_*_pd intrinsics
|
||||
double d[4] __attribute__((aligned(64))); // per-element view of the same 4 doubles
|
||||
} v4df_t;
|
||||
|
||||
|
||||
void bli_saxpyf_zen_int_5
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t fuse_fac = 5;
|
||||
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
const dim_t n_iter_unroll = 2;
|
||||
|
||||
dim_t i;
|
||||
|
||||
float* restrict a0;
|
||||
float* restrict a1;
|
||||
float* restrict a2;
|
||||
float* restrict a3;
|
||||
float* restrict a4;
|
||||
|
||||
float* restrict y0;
|
||||
|
||||
v8sf_t chi0v, chi1v, chi2v, chi3v;
|
||||
v8sf_t chi4v;
|
||||
|
||||
v8sf_t a00v, a01v, a02v, a03v;
|
||||
v8sf_t a04v;
|
||||
|
||||
v8sf_t a10v, a11v, a12v, a13v;
|
||||
v8sf_t a14v;
|
||||
|
||||
v8sf_t y0v, y1v;
|
||||
|
||||
float chi0, chi1, chi2, chi3;
|
||||
float chi4;
|
||||
|
||||
// If either dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;
|
||||
|
||||
// If b_n is not equal to the fusing factor, then perform the entire
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
float* a1 = a + (0 )*inca + (i )*lda;
|
||||
float* chi1 = x + (i )*incx;
|
||||
float* y1 = y + (0 )*incy;
|
||||
float alpha_chi1;
|
||||
|
||||
bli_scopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_sscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_saxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
float* a1 = a + (0 )*inca + (i )*lda;
|
||||
float* chi1 = x + (i )*incx;
|
||||
float* y1 = y + (0 )*incy;
|
||||
float alpha_chi1;
|
||||
|
||||
bli_scopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_sscals( *alpha, alpha_chi1 );
|
||||
|
||||
f
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we know that b_n is exactly equal to the fusing factor.
|
||||
|
||||
a0 = a + 0*lda;
|
||||
a1 = a + 1*lda;
|
||||
a2 = a + 2*lda;
|
||||
a3 = a + 3*lda;
|
||||
a4 = a + 4*lda;
|
||||
y0 = y;
|
||||
|
||||
chi0 = *( x + 0*incx );
|
||||
chi1 = *( x + 1*incx );
|
||||
chi2 = *( x + 2*incx );
|
||||
chi3 = *( x + 3*incx );
|
||||
chi4 = *( x + 4*incx );
|
||||
|
||||
|
||||
// Scale each chi scalar by alpha.
|
||||
bli_sscals( *alpha, chi0 );
|
||||
bli_sscals( *alpha, chi1 );
|
||||
bli_sscals( *alpha, chi2 );
|
||||
bli_sscals( *alpha, chi3 );
|
||||
bli_sscals( *alpha, chi4 );
|
||||
|
||||
// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
|
||||
chi0v.v = _mm256_broadcast_ss( &chi0 );
|
||||
chi1v.v = _mm256_broadcast_ss( &chi1 );
|
||||
chi2v.v = _mm256_broadcast_ss( &chi2 );
|
||||
chi3v.v = _mm256_broadcast_ss( &chi3 );
|
||||
chi4v.v = _mm256_broadcast_ss( &chi4 );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
if ( inca == 1 && incy == 1 )
|
||||
{
|
||||
for ( i = 0; (i + 15) < m; i += 16 )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
|
||||
a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
|
||||
a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg );
|
||||
|
||||
a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
|
||||
a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg );
|
||||
|
||||
a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
|
||||
a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg );
|
||||
|
||||
a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
|
||||
a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg );
|
||||
|
||||
a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
|
||||
a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v );
|
||||
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
|
||||
|
||||
y0 += n_iter_unroll * n_elem_per_reg;
|
||||
a0 += n_iter_unroll * n_elem_per_reg;
|
||||
a1 += n_iter_unroll * n_elem_per_reg;
|
||||
a2 += n_iter_unroll * n_elem_per_reg;
|
||||
a3 += n_iter_unroll * n_elem_per_reg;
|
||||
a4 += n_iter_unroll * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for( ; (i + 7) < m; i += 8 )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
|
||||
a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
|
||||
a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
|
||||
a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
|
||||
a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
|
||||
a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
|
||||
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
|
||||
y0 += n_elem_per_reg;
|
||||
a0 += n_elem_per_reg;
|
||||
a1 += n_elem_per_reg;
|
||||
a2 += n_elem_per_reg;
|
||||
a3 += n_elem_per_reg;
|
||||
a4 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( ; (i + 0) < m ; ++i )
|
||||
{
|
||||
double y0c = *y0;
|
||||
|
||||
const float a0c = *a0;
|
||||
const float a1c = *a1;
|
||||
const float a2c = *a2;
|
||||
const float a3c = *a3;
|
||||
const float a4c = *a4;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += 1;
|
||||
a1 += 1;
|
||||
a2 += 1;
|
||||
a3 += 1;
|
||||
a4 += 1;
|
||||
y0 += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; (i + 0) < m ; ++i )
|
||||
{
|
||||
double y0c = *y0;
|
||||
|
||||
const float a0c = *a0;
|
||||
const float a1c = *a1;
|
||||
const float a2c = *a2;
|
||||
const float a3c = *a3;
|
||||
const float a4c = *a4;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += inca;
|
||||
a1 += inca;
|
||||
a2 += inca;
|
||||
a3 += inca;
|
||||
a4 += inca;
|
||||
y0 += incy;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_daxpyf_zen_int_5
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
const dim_t fuse_fac = 5;
|
||||
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 2;
|
||||
|
||||
dim_t i;
|
||||
|
||||
double* restrict a0;
|
||||
double* restrict a1;
|
||||
double* restrict a2;
|
||||
double* restrict a3;
|
||||
double* restrict a4;
|
||||
|
||||
double* restrict y0;
|
||||
|
||||
v4df_t chi0v, chi1v, chi2v, chi3v;
|
||||
v4df_t chi4v;
|
||||
|
||||
v4df_t a00v, a01v, a02v, a03v;
|
||||
v4df_t a04v;
|
||||
|
||||
v4df_t a10v, a11v, a12v, a13v;
|
||||
v4df_t a14v;
|
||||
|
||||
v4df_t y0v, y1v;
|
||||
|
||||
double chi0, chi1, chi2, chi3;
|
||||
double chi4;
|
||||
|
||||
// If either dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return;
|
||||
|
||||
// If b_n is not equal to the fusing factor, then perform the entire
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
#ifdef BLIS_CONFIG_ZEN2
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
|
||||
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_dscals( *alpha, alpha_chi1 );
|
||||
|
||||
bli_daxpyv_zen_int10
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
|
||||
bli_dcopycjs( conjx, *chi1, alpha_chi1 );
|
||||
bli_dscals( *alpha, alpha_chi1 );
|
||||
|
||||
f
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we know that b_n is exactly equal to the fusing factor.
|
||||
|
||||
a0 = a + 0*lda;
|
||||
a1 = a + 1*lda;
|
||||
a2 = a + 2*lda;
|
||||
a3 = a + 3*lda;
|
||||
a4 = a + 4*lda;
|
||||
y0 = y;
|
||||
|
||||
chi0 = *( x + 0*incx );
|
||||
chi1 = *( x + 1*incx );
|
||||
chi2 = *( x + 2*incx );
|
||||
chi3 = *( x + 3*incx );
|
||||
chi4 = *( x + 4*incx );
|
||||
|
||||
|
||||
// Scale each chi scalar by alpha.
|
||||
bli_dscals( *alpha, chi0 );
|
||||
bli_dscals( *alpha, chi1 );
|
||||
bli_dscals( *alpha, chi2 );
|
||||
bli_dscals( *alpha, chi3 );
|
||||
bli_dscals( *alpha, chi4 );
|
||||
|
||||
// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
|
||||
chi0v.v = _mm256_broadcast_sd( &chi0 );
|
||||
chi1v.v = _mm256_broadcast_sd( &chi1 );
|
||||
chi2v.v = _mm256_broadcast_sd( &chi2 );
|
||||
chi3v.v = _mm256_broadcast_sd( &chi3 );
|
||||
chi4v.v = _mm256_broadcast_sd( &chi4 );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
if ( inca == 1 && incy == 1 )
|
||||
{
|
||||
for ( i = 0; (i + 7) < m; i += 8 )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
|
||||
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
|
||||
a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg );
|
||||
|
||||
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
|
||||
a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg );
|
||||
|
||||
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
|
||||
a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg );
|
||||
|
||||
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
|
||||
a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg );
|
||||
|
||||
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
|
||||
a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v );
|
||||
|
||||
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v );
|
||||
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
|
||||
|
||||
y0 += n_iter_unroll * n_elem_per_reg;
|
||||
a0 += n_iter_unroll * n_elem_per_reg;
|
||||
a1 += n_iter_unroll * n_elem_per_reg;
|
||||
a2 += n_iter_unroll * n_elem_per_reg;
|
||||
a3 += n_iter_unroll * n_elem_per_reg;
|
||||
a4 += n_iter_unroll * n_elem_per_reg;
|
||||
}
|
||||
|
||||
for( ; (i + 3) < m; i += 4 )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
|
||||
a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
|
||||
a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
|
||||
a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
|
||||
a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
|
||||
a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
|
||||
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
|
||||
y0 += n_elem_per_reg;
|
||||
a0 += n_elem_per_reg;
|
||||
a1 += n_elem_per_reg;
|
||||
a2 += n_elem_per_reg;
|
||||
a3 += n_elem_per_reg;
|
||||
a4 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( ; (i + 0) < m ; ++i )
|
||||
{
|
||||
double y0c = *y0;
|
||||
|
||||
const double a0c = *a0;
|
||||
const double a1c = *a1;
|
||||
const double a2c = *a2;
|
||||
const double a3c = *a3;
|
||||
const double a4c = *a4;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += 1;
|
||||
a1 += 1;
|
||||
a2 += 1;
|
||||
a3 += 1;
|
||||
a4 += 1;
|
||||
y0 += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; (i + 0) < m ; ++i )
|
||||
{
|
||||
double y0c = *y0;
|
||||
|
||||
const double a0c = *a0;
|
||||
const double a1c = *a1;
|
||||
const double a2c = *a2;
|
||||
const double a3c = *a3;
|
||||
const double a4c = *a4;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += inca;
|
||||
a1 += inca;
|
||||
a2 += inca;
|
||||
a3 += inca;
|
||||
a4 += inca;
|
||||
y0 += incy;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
40
kernels/zen2/bli_kernels_zen2.h
Normal file
40
kernels/zen2/bli_kernels_zen2.h
Normal file
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// -- level-1f --
|
||||
|
||||
// Prototypes for the single- (s) and double-precision (d) axpyf kernels
// named axpyf_zen_int_5. Presumably AXPYF_KER_PROT expands to declarations
// of bli_saxpyf_zen_int_5 and bli_daxpyf_zen_int_5 -- confirm against the
// macro definition.
AXPYF_KER_PROT( float, s, axpyf_zen_int_5 )
|
||||
AXPYF_KER_PROT( double, d, axpyf_zen_int_5 )
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -463,7 +463,8 @@ void GENBARNAME(cntx_init)
|
||||
// operation.
|
||||
|
||||
// Set the gemm slot to the default gemm sup handler.
|
||||
vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref;
|
||||
vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref;
|
||||
vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref;
|
||||
|
||||
|
||||
// -- Set level-3 small/unpacked micro-kernels and preferences -------------
|
||||
|
||||
173
test/Makefile
173
test/Makefile
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
|
||||
# Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -97,16 +97,11 @@ endif
|
||||
BLAS_LIB_PATH := $(HOME)/flame/lib
|
||||
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
|
||||
#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
|
||||
MKL_LIB_PATH := ${MKLROOT}/lib/intel64
|
||||
#ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
|
||||
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
|
||||
|
||||
# OpenBLAS
|
||||
OPENBLAS_LIB := $(BLAS_LIB_PATH)/libopenblas.a
|
||||
|
||||
# ATLAS
|
||||
ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \
|
||||
$(BLAS_LIB_PATH)/libatlas.a
|
||||
|
||||
# MKL
|
||||
MKL_LIB := -L$(MKL_LIB_PATH) \
|
||||
-lmkl_intel_lp64 \
|
||||
@@ -114,18 +109,6 @@ MKL_LIB := -L$(MKL_LIB_PATH) \
|
||||
-lmkl_sequential \
|
||||
-lpthread -lm -ldl
|
||||
|
||||
# ESSL
|
||||
# Note: ESSL is named differently for SMP and/or BG
|
||||
#ESSL_TYPE := # This is the 32b library on POWER
|
||||
#ESSL_TYPE := 6464 # This is the 64b library on POWER
|
||||
#ESSL_TYPE := bg # This is the 32b single-threaded library on Blue Gene
|
||||
#ESSL_TYPE := smpbg # This is the 32b multi-threaded library on Blue Gene
|
||||
#ESSL_LIB := $(ESSL_LIB_PATH)/libessl$(ESSL_TYPE).a
|
||||
|
||||
# Accelerate
|
||||
MAC_LIB := -framework Accelerate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- General build definitions ------------------------------------------------
|
||||
@@ -159,121 +142,32 @@ CFLAGS += -I$(TEST_SRC_PATH)
|
||||
# --- Targets/rules ------------------------------------------------------------
|
||||
#
|
||||
|
||||
# Complete list of possible targets when defining 'all':
|
||||
#
|
||||
# blis openblas atlas mkl mac essl
|
||||
#
|
||||
#all: blis openblas atlas mkl
|
||||
# Define the operations we will test.
|
||||
TEST_OPS := dotv axpyv \
|
||||
gemv ger hemv her her2 trmv trsv \
|
||||
gemm hemm herk her2k trmm trsm
|
||||
|
||||
# Optionally test gemmt, which some libraries might not implement.
|
||||
ifeq ($(BUILD_GEMMT),yes)
|
||||
TEST_OPS := $(TEST_OPS) gemmt
|
||||
endif
|
||||
|
||||
# Define a function to create the executable names.
|
||||
test-bins = $(foreach op, $(TEST_OPS), test_$(op)_$(1).x)
|
||||
|
||||
# Create the list of executables for each implementation.
|
||||
TEST_BINS_BLIS := $(call test-bins,blis)
|
||||
TEST_BINS_OPENBLAS := $(call test-bins,openblas)
|
||||
TEST_BINS_MKL := $(call test-bins,mkl)
|
||||
|
||||
|
||||
all: blis openblas mkl
|
||||
|
||||
blis: check-env \
|
||||
test_dotv_blis.x \
|
||||
test_axpyv_blis.x \
|
||||
test_gemv_blis.x \
|
||||
test_ger_blis.x \
|
||||
test_hemv_blis.x \
|
||||
test_her_blis.x \
|
||||
test_her2_blis.x \
|
||||
test_trmv_blis.x \
|
||||
test_trsv_blis.x \
|
||||
\
|
||||
test_gemm_blis.x \
|
||||
test_hemm_blis.x \
|
||||
test_herk_blis.x \
|
||||
test_her2k_blis.x \
|
||||
test_trmm_blis.x \
|
||||
test_trsm_blis.x
|
||||
blis: check-env $(TEST_BINS_BLIS)
|
||||
|
||||
openblas: check-env \
|
||||
test_dotv_openblas.x \
|
||||
test_axpyv_openblas.x \
|
||||
test_gemv_openblas.x \
|
||||
test_ger_openblas.x \
|
||||
test_hemv_openblas.x \
|
||||
test_her_openblas.x \
|
||||
test_her2_openblas.x \
|
||||
test_trmv_openblas.x \
|
||||
test_trsv_openblas.x \
|
||||
\
|
||||
test_gemm_openblas.x \
|
||||
test_hemm_openblas.x \
|
||||
test_herk_openblas.x \
|
||||
test_her2k_openblas.x \
|
||||
test_trmm_openblas.x \
|
||||
test_trsm_openblas.x
|
||||
|
||||
atlas: check-env \
|
||||
test_dotv_atlas.x \
|
||||
test_axpyv_atlas.x \
|
||||
test_gemv_atlas.x \
|
||||
test_ger_atlas.x \
|
||||
test_hemv_atlas.x \
|
||||
test_her_atlas.x \
|
||||
test_her2_atlas.x \
|
||||
test_trmv_atlas.x \
|
||||
test_trsv_atlas.x \
|
||||
\
|
||||
test_gemm_atlas.x \
|
||||
test_hemm_atlas.x \
|
||||
test_herk_atlas.x \
|
||||
test_her2k_atlas.x \
|
||||
test_trmm_atlas.x \
|
||||
test_trsm_atlas.x
|
||||
|
||||
mkl: check-env \
|
||||
test_dotv_mkl.x \
|
||||
test_axpyv_mkl.x \
|
||||
test_gemv_mkl.x \
|
||||
test_ger_mkl.x \
|
||||
test_hemv_mkl.x \
|
||||
test_her_mkl.x \
|
||||
test_her2_mkl.x \
|
||||
test_trmv_mkl.x \
|
||||
test_trsv_mkl.x \
|
||||
\
|
||||
test_gemm_mkl.x \
|
||||
test_hemm_mkl.x \
|
||||
test_herk_mkl.x \
|
||||
test_her2k_mkl.x \
|
||||
test_trmm_mkl.x \
|
||||
test_trsm_mkl.x
|
||||
|
||||
essl: check-env \
|
||||
test_dotv_essl.x \
|
||||
test_axpyv_essl.x \
|
||||
test_gemv_essl.x \
|
||||
test_ger_essl.x \
|
||||
test_hemv_essl.x \
|
||||
test_her_essl.x \
|
||||
test_her2_essl.x \
|
||||
test_trmv_essl.x \
|
||||
test_trsv_essl.x \
|
||||
\
|
||||
test_gemm_essl.x \
|
||||
test_hemm_essl.x \
|
||||
test_herk_essl.x \
|
||||
test_her2k_essl.x \
|
||||
test_trmm_essl.x \
|
||||
test_trsm_essl.x
|
||||
|
||||
mac: check-env \
|
||||
test_dotv_mac.x \
|
||||
test_axpyv_mac.x \
|
||||
test_gemv_mac.x \
|
||||
test_ger_mac.x \
|
||||
test_hemv_mac.x \
|
||||
test_her_mac.x \
|
||||
test_her2_mac.x \
|
||||
test_trmv_mac.x \
|
||||
test_trsv_mac.x \
|
||||
\
|
||||
test_gemm_mac.x \
|
||||
test_hemm_mac.x \
|
||||
test_herk_mac.x \
|
||||
test_her2k_mac.x \
|
||||
test_trmm_mac.x \
|
||||
test_trsm_mac.x
|
||||
openblas: check-env $(TEST_BINS_OPENBLAS)
|
||||
|
||||
mkl: check-env $(TEST_BINS_MKL)
|
||||
|
||||
|
||||
# --Object file rules --
|
||||
@@ -281,21 +175,13 @@ mac: check-env \
|
||||
$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
|
||||
test_%_openblas.o: test_%.c
|
||||
$(CC) $(CFLAGS) -DBLAS=\"openblas\" -c $< -o $@
|
||||
|
||||
test_%_atlas.o: test_%.c
|
||||
$(CC) $(CFLAGS) -DBLAS=\"atlas\" -c $< -o $@
|
||||
|
||||
test_%_mkl.o: test_%.c
|
||||
$(CC) $(CFLAGS) -DBLAS=\"mkl\" -c $< -o $@
|
||||
|
||||
test_%_essl.o: test_%.c
|
||||
$(CC) $(CFLAGS) -DBLAS=\"essl\" -c $< -o $@
|
||||
|
||||
test_%_mac.o: test_%.c
|
||||
$(CC) $(CFLAGS) -DBLAS=\"mac\" -c $< -o $@
|
||||
|
||||
test_%_blis.o: test_%.c
|
||||
$(CC) $(CFLAGS) -DBLIS -c $< -o $@
|
||||
|
||||
@@ -310,18 +196,9 @@ test_%_blis.o: test_%.c
|
||||
test_%_openblas.x: test_%_openblas.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_atlas.x: test_%_atlas.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(ATLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_mkl.x: test_%_mkl.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(MKL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_essl.x: test_%_essl.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(ESSL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_mac.x: test_%_mac.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(MAC_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_blis.x: test_%_blis.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
|
||||
218
test/other/test_copyv.c
Normal file
218
test/other/test_copyv.c
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
//#define BLIS_ACCURACY_TEST
|
||||
#ifdef BLIS_ACCURACY_TEST
|
||||
|
||||
bool_t scompare_result(int n, float *x, int incx, float *y, int incy) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
if ((*x) != (*y)) {
|
||||
printf("%4f != %4f at location %d\n", *x, *y, i);
|
||||
return FALSE;
|
||||
}
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t dcompare_result(int n, double *x, int incx, double *y, int incy) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
if ((*x) != (*y)) {
|
||||
printf("%4f != %4f at location %d\n", *x, *y, i);
|
||||
return FALSE;
|
||||
}
|
||||
x += incx;
|
||||
y += incy;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
obj_t x, y;
|
||||
dim_t n;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int n_input, sizeof_dt;
|
||||
int r, n_repeats;
|
||||
num_t dt;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double Gbps;
|
||||
|
||||
//bli_init();
|
||||
|
||||
n_repeats = 100000;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 200;
|
||||
p_end = 100000;
|
||||
p_inc = 200;
|
||||
|
||||
n_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
n_input = 16;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
// dt = BLIS_FLOAT;
|
||||
dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
if (dt == BLIS_DOUBLE)
|
||||
sizeof_dt = sizeof(double);
|
||||
else if (dt == BLIS_FLOAT)
|
||||
sizeof_dt = sizeof(float);
|
||||
|
||||
printf("executable\t n\t GBs per sec\n");
|
||||
for (p = p_begin; p <= p_end; p += p_inc)
|
||||
{
|
||||
|
||||
if (n_input < 0) n = p * (dim_t)abs(n_input);
|
||||
else n = (dim_t)n_input;
|
||||
|
||||
bli_obj_create(dt, n, 1, 0, 0, &x);
|
||||
bli_obj_create(dt, n, 1, 0, 0, &y);
|
||||
bli_randm(&x);
|
||||
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for (r = 0; r < n_repeats; ++r)
|
||||
{
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef BLIS
|
||||
bli_copyv(&x,
|
||||
&y
|
||||
);
|
||||
#else
|
||||
if (bli_is_float(dt))
|
||||
{
|
||||
f77_int nn = bli_obj_length(&x);
|
||||
f77_int incx = bli_obj_vector_inc(&x);
|
||||
float* xp = bli_obj_buffer(&x);
|
||||
f77_int incy = bli_obj_vector_inc(&y);
|
||||
float* yp = bli_obj_buffer(&y);
|
||||
|
||||
scopy_(&nn,
|
||||
xp, &incx,
|
||||
yp, &incy);
|
||||
|
||||
}
|
||||
else if (bli_is_double(dt))
|
||||
{
|
||||
|
||||
f77_int nn = bli_obj_length(&x);
|
||||
f77_int incx = bli_obj_vector_inc(&x);
|
||||
double* xp = bli_obj_buffer(&x);
|
||||
f77_int incy = bli_obj_vector_inc(&y);
|
||||
double* yp = bli_obj_buffer(&y);
|
||||
|
||||
dcopy_(&nn,
|
||||
xp, &incx,
|
||||
yp, &incy
|
||||
);
|
||||
}
|
||||
#endif
|
||||
dtime_save = bli_clock_min_diff(dtime_save, dtime);
|
||||
#ifdef BLIS_ACCURACY_TEST
|
||||
if (dt == BLIS_FLOAT) {
|
||||
int nn = bli_obj_length(&x);
|
||||
int incx = bli_obj_vector_inc(&x);
|
||||
float* xp = bli_obj_buffer(&x);
|
||||
int incy = bli_obj_vector_inc(&y);
|
||||
float* yp = bli_obj_buffer(&y);
|
||||
if (scompare_result(nn, xp, incx, yp, incy))
|
||||
printf("Copy Successful\n");
|
||||
else
|
||||
printf("ALERT!!! Copy Failed\n");
|
||||
}
|
||||
if (dt == BLIS_DOUBLE) {
|
||||
int nn = bli_obj_length(&x);
|
||||
int incx = bli_obj_vector_inc(&x);
|
||||
double* xp = bli_obj_buffer(&x);
|
||||
int incy = bli_obj_vector_inc(&y);
|
||||
double* yp = bli_obj_buffer(&y);
|
||||
if (dcompare_result(nn, xp, incx, yp, incy))
|
||||
printf("Copy Successful\n");
|
||||
else
|
||||
printf("ALERT!!! Copy Failed\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// Size of the vectors are incrementd by 1000, to test wide range of inputs.
|
||||
if (p >= 1000)
|
||||
p_inc = 1000;
|
||||
|
||||
if (p >= 10000)
|
||||
p_inc = 10000;
|
||||
Gbps = (n * sizeof_dt) / (dtime_save * 1.0e9);
|
||||
#ifdef BLIS
|
||||
printf("data_copyv_blis\t");
|
||||
#else
|
||||
printf("data_copyv_%s\t", BLAS);
|
||||
#endif
|
||||
printf("%4lu\t %7.2f\n",
|
||||
(unsigned long)n, Gbps);
|
||||
|
||||
bli_obj_free(&x);
|
||||
bli_obj_free(&y);
|
||||
}
|
||||
|
||||
// bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
392
test/other/test_gemm.c
Normal file
392
test/other/test_gemm.c
Normal file
@@ -0,0 +1,392 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
//#define FILE_IN_OUT
|
||||
//#define PRINT
|
||||
//#define MATRIX_INITIALISATION
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, n, k;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int m_input, n_input, k_input;
|
||||
num_t dt;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
#ifdef FILE_IN_OUT
|
||||
FILE* fin = NULL;
|
||||
FILE* fout = NULL;
|
||||
char gemm = 's';
|
||||
|
||||
#endif
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 200;
|
||||
p_end = 2000;
|
||||
p_inc = 200;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
m_input = 5;
|
||||
k_input = 6;
|
||||
n_input = 4;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
//dt = BLIS_FLOAT;
|
||||
dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
transb = BLIS_NO_TRANSPOSE;
|
||||
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
if (argc < 3)
|
||||
{
|
||||
printf("Usage: ./test_gemm_XX.x input.csv output.csv\n");
|
||||
exit(1);
|
||||
}
|
||||
fin = fopen(argv[1], "r");
|
||||
if (fin == NULL)
|
||||
{
|
||||
printf("Error opening the file %s\n", argv[1]);
|
||||
exit(1);
|
||||
}
|
||||
fout = fopen(argv[2], "w");
|
||||
if (fout == NULL)
|
||||
{
|
||||
printf("Error opening output file %s\n", argv[2]);
|
||||
exit(1);
|
||||
}
|
||||
fprintf(fout, "m\t k\t n\t cs_a\t cs_b\t cs_c\t gflops\t GEMM_Algo\n");
|
||||
|
||||
|
||||
printf("~~~~~~~~~~_BLAS\t m\t k\t n\t cs_a\t cs_b\t cs_c \t gflops\t GEMM_Algo\n");
|
||||
|
||||
inc_t cs_a;
|
||||
inc_t cs_b;
|
||||
inc_t cs_c;
|
||||
|
||||
while (fscanf(fin, "%lld %lld %lld %lld %lld %lld\n", &m, &k, &n, &cs_a, &cs_b, &cs_c) == 6)
|
||||
{
|
||||
if ((m > cs_a) || (k > cs_b) || (m > cs_c)) continue; // leading dimension should be greater than number of rows
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha);
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
bli_obj_create( dt, m, k, 1, cs_a, &a );
|
||||
bli_obj_create( dt, k, n, 1, cs_b, &b );
|
||||
bli_obj_create( dt, m, n, 1, cs_c, &c );
|
||||
bli_obj_create( dt, m, n, 1, cs_c, &c_save );
|
||||
#ifdef MATRIX_INITIALISATION
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
#endif
|
||||
bli_obj_set_conjtrans( transa, &a);
|
||||
bli_obj_set_conjtrans( transb, &b);
|
||||
|
||||
//bli_setsc( 0.0, -1, &alpha );
|
||||
//bli_setsc( 0.0, 1, &beta );
|
||||
|
||||
bli_setsc( -1, 0.0, &alpha );
|
||||
bli_setsc( 1, 0.0, &beta );
|
||||
|
||||
#else
|
||||
for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
{
|
||||
if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
bli_obj_create( dt, k, n, 0, 0, &b );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
bli_setsc( (0.9/1.0), 0.2, &alpha );
|
||||
bli_setsc( -(1.1/1.0), 0.3, &beta );
|
||||
|
||||
#endif
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "b", &b, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_gemm( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#else
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
sgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
dgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* bp = bli_obj_buffer( &b );
|
||||
scomplex* betap = bli_obj_buffer( &beta );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* bp = bli_obj_buffer( &b );
|
||||
dcomplex* betap = bli_obj_buffer( &beta );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
zgemm_( &f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * m * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_gemm_blis" );
|
||||
#else
|
||||
printf( "data_gemm_%s", BLAS );
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
|
||||
if ( bli_is_double( dt ) ) {
|
||||
|
||||
if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES/4)) || ((m < (BLIS_SMALL_M_RECT_MATRIX_THRES/2) ) && (k < (BLIS_SMALL_K_RECT_MATRIX_THRES/2) )))
|
||||
gemm = 'S'; // small gemm
|
||||
else gemm = 'N'; // Normal blis gemm
|
||||
|
||||
}
|
||||
else if (bli_is_float( dt )) {
|
||||
if (((m * n) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES)) || ((m < BLIS_SMALL_M_RECT_MATRIX_THRES) && (k < BLIS_SMALL_K_RECT_MATRIX_THRES)))
|
||||
gemm = 'S'; // small gemm
|
||||
else gemm = 'N'; // normal blis gemm
|
||||
}
|
||||
|
||||
|
||||
|
||||
printf("%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
|
||||
( unsigned long )m,
|
||||
( unsigned long )k,
|
||||
( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops, gemm );
|
||||
|
||||
|
||||
fprintf(fout, "%6lu \t %4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f \t %c\n", \
|
||||
( unsigned long )m,
|
||||
( unsigned long )k,
|
||||
( unsigned long )n, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops, gemm );
|
||||
fflush(fout);
|
||||
|
||||
#else
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )k,
|
||||
( unsigned long )n, gflops );
|
||||
#endif
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
#ifdef FILE_IN_OUT
|
||||
fclose(fin);
|
||||
fclose(fout);
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
154
test/other/test_scalv.c
Normal file
154
test/other/test_scalv.c
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
obj_t a, alpha;
|
||||
dim_t n, p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int n_input;
|
||||
num_t dt;
|
||||
int r, n_repeats;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 100000;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 200;
|
||||
p_end = 100000;
|
||||
p_inc = 200;
|
||||
|
||||
n_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
n_input = 4;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
dt = BLIS_FLOAT;
|
||||
//dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
#ifdef BLIS
|
||||
printf( "data_scalv_blis\t n\t gflops\n" );
|
||||
#else
|
||||
printf( "data_scalv_%s\t n\t gflops\n", BLAS );
|
||||
#endif
|
||||
|
||||
for (p = p_begin; p <= p_end; p += p_inc)
|
||||
{
|
||||
if (n_input < 0) n = p * (dim_t)abs(n_input);
|
||||
else n = (dim_t)n_input;
|
||||
|
||||
|
||||
bli_obj_create(dt, 1, 1, 0, 0, &alpha);
|
||||
bli_obj_create(dt, 1, n, 0, 0, &a);
|
||||
|
||||
bli_randm(&a);
|
||||
bli_setsc((2.0), 0.0, &alpha);
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for (r = 0; r < n_repeats; ++r)
|
||||
{
|
||||
dtime = bli_clock();
|
||||
#ifdef BLIS
|
||||
bli_scalm(&BLIS_TWO, &a);
|
||||
#else
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int nn = bli_obj_length( &a );
|
||||
f77_int inca = bli_obj_vector_inc( &a );
|
||||
float* scalar = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
|
||||
sscal_( &nn, scalar,
|
||||
ap, &inca );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int nn = bli_obj_length( &a );
|
||||
f77_int inca = bli_obj_vector_inc( &a );
|
||||
double* scalar = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
|
||||
dscal_( &nn, scalar,
|
||||
ap, &inca );
|
||||
}
|
||||
#endif
|
||||
dtime_save = bli_clock_min_diff(dtime_save, dtime);
|
||||
}
|
||||
// Size of the vectors are incrementd by 1000, to test wide range of inputs.
|
||||
if (p == 10000)
|
||||
p_inc = 10000;
|
||||
|
||||
if (p == 1000)
|
||||
p_inc = 1000;
|
||||
|
||||
gflops = n / (dtime_save * 1.0e9);
|
||||
#ifdef BLIS
|
||||
printf( "data_scalv_blis\t" );
|
||||
#else
|
||||
printf( "data_scalv_%s\t", BLAS );
|
||||
#endif
|
||||
printf(" %4lu\t %7.2f \n",
|
||||
(unsigned long)n, gflops);
|
||||
|
||||
bli_obj_free(&alpha);
|
||||
bli_obj_free(&a);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
185
test/other/test_swapv.c
Normal file
185
test/other/test_swapv.c
Normal file
@@ -0,0 +1,185 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// n x incx y incy
|
||||
//void dswap_( int*, double*, int*, double*, int* );
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t x, y;
|
||||
dim_t n;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int n_input;
|
||||
int r, n_repeats;
|
||||
num_t dt;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
bli_init();
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 40;
|
||||
p_end = 8000;
|
||||
p_inc = 40;
|
||||
|
||||
n_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
n_input = -1;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
dt = BLIS_FLOAT;
|
||||
//dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
|
||||
#ifdef BLIS
|
||||
printf( "data_swapv_blis" );
|
||||
#else
|
||||
printf( "data_swapv_%s", BLAS );
|
||||
#endif
|
||||
printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
//for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
for ( p = p_end; p_begin <= p; p -= p_inc )
|
||||
{
|
||||
|
||||
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
|
||||
bli_obj_create( dt, n, 1, 0, 0, &x );
|
||||
bli_obj_create( dt, n, 1, 0, 0, &y );
|
||||
|
||||
bli_randm( &x );
|
||||
bli_randm( &y );
|
||||
|
||||
dtime_save = 1.0e9;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "x", &x, "%4.1f", "" );
|
||||
bli_printm( "y", &y, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_swapv( &x,
|
||||
&y
|
||||
);
|
||||
#else
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int nn = bli_obj_length( &x );
|
||||
f77_int incx = bli_obj_vector_inc( &x );
|
||||
f77_int incy = bli_obj_vector_inc( &y );
|
||||
float* xp = bli_obj_buffer( &x );
|
||||
float* yp = bli_obj_buffer( &y );
|
||||
|
||||
sswap_( &nn,
|
||||
xp, &incx,
|
||||
yp, &incy );
|
||||
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
|
||||
f77_int nn = bli_obj_length( &x );
|
||||
f77_int incx = bli_obj_vector_inc( &x );
|
||||
f77_int incy = bli_obj_vector_inc( &y );
|
||||
double* xp = bli_obj_buffer( &x );
|
||||
double* yp = bli_obj_buffer( &y );
|
||||
|
||||
dswap_( &nn,
|
||||
xp, &incx,
|
||||
yp, &incy );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "X after", &x, "%4.1f", "" );
|
||||
bli_printm( "Y after", &y, "%4.1f", "" );
|
||||
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_swapv_blis" );
|
||||
#else
|
||||
printf( "data_swapv_%s", BLAS );
|
||||
#endif
|
||||
printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )n, gflops );
|
||||
|
||||
bli_obj_free( &x );
|
||||
bli_obj_free( &y );
|
||||
}
|
||||
|
||||
bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
443
test/other/test_trsm.c
Normal file
443
test/other/test_trsm.c
Normal file
@@ -0,0 +1,443 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
//#define FILE_IN_OUT
|
||||
#ifdef FILE_IN_OUT
|
||||
//#define READ_ALL_PARAMS_FROM_FILE
|
||||
#endif
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha;
|
||||
dim_t m, n;
|
||||
num_t dt;
|
||||
int r, n_repeats;
|
||||
side_t side;
|
||||
uplo_t uploa;
|
||||
trans_t transa;
|
||||
diag_t diaga;
|
||||
f77_char f77_side;
|
||||
f77_char f77_uploa;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_diaga;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
FILE* fin = NULL;
|
||||
FILE* fout = NULL;
|
||||
#else
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int m_input, n_input;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 200;
|
||||
p_end = 2000;
|
||||
p_inc = 200;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
m_input = 4;
|
||||
n_input = 4;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
#if 1
|
||||
//dt = BLIS_FLOAT;
|
||||
dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
if(argc < 3)
|
||||
{
|
||||
printf("Usage: ./test_trsm_XX.x input.csv output.csv\n");
|
||||
exit(1);
|
||||
}
|
||||
fin = fopen(argv[1], "r");
|
||||
if(fin == NULL)
|
||||
{
|
||||
printf("Error opening the file %s\n", argv[1]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fout = fopen(argv[2], "w");
|
||||
if(fout == NULL)
|
||||
{
|
||||
printf("Error opening the file %s\n", argv[2]);
|
||||
exit(1);
|
||||
}
|
||||
inc_t cs_a;
|
||||
inc_t cs_b;
|
||||
#ifdef READ_ALL_PARAMS_FROM_FILE
|
||||
char side_c, uploa_c, transa_c, diaga_c;
|
||||
|
||||
fprintf(fout, "side, uploa, transa, diaga, m\t n\t cs_a\t cs_b\t gflops\n");
|
||||
|
||||
printf("~~~~~~~_BLAS\t side, uploa, transa, diaga, m\t n\t cs_a\t cs_b\t gflops\n");
|
||||
|
||||
while(fscanf(fin, "%c %c %c %c %ld %ld %ld %ld\n", &side_c, &uploa_c, &transa_c, &diaga_c, &m, &n, &cs_a, &cs_b) == 8)
|
||||
{
|
||||
|
||||
if( 'l' == side_c|| 'L' == side_c)
|
||||
side = BLIS_LEFT;
|
||||
else if('r' == side_c || 'R' == side_c)
|
||||
side = BLIS_RIGHT;
|
||||
else
|
||||
{
|
||||
printf("Invalid entry for the argument 'side':%c\n",side_c);
|
||||
continue;
|
||||
}
|
||||
|
||||
if('l' == uploa_c || 'L' == uploa_c)
|
||||
uploa = BLIS_LOWER;
|
||||
else if('u' == uploa_c || 'U' == uploa_c)
|
||||
uploa = BLIS_UPPER;
|
||||
else
|
||||
{
|
||||
printf("Invalid entry for the argument 'uplo':%c\n",uploa_c);
|
||||
continue;
|
||||
}
|
||||
|
||||
if('t' == transa_c || 'T' == transa_c)
|
||||
transa = BLIS_TRANSPOSE;
|
||||
else if('n' == transa_c || 'N' == transa_c)
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
else
|
||||
{
|
||||
printf("Invalid entry for the argument 'transa':%c\n",transa_c);
|
||||
continue;
|
||||
}
|
||||
|
||||
if('u' == diaga_c || 'U' == diaga_c)
|
||||
diaga = BLIS_UNIT_DIAG;
|
||||
else if('n' == diaga_c || 'N' == diaga_c)
|
||||
diaga = BLIS_NONUNIT_DIAG;
|
||||
else
|
||||
{
|
||||
printf("Invalid entry for the argument 'diaga':%c\n", diaga_c);
|
||||
continue;
|
||||
}
|
||||
#else
|
||||
|
||||
fprintf(fout, "m\t n\t cs_a\t cs_b\t gflops\n");
|
||||
|
||||
printf("~~~~~~~_BLAS\t m\t n\t cs_a\t cs_b\t gflops\n");
|
||||
|
||||
while(fscanf(fin, "%ld %ld %ld %ld\n", &m, &n, &cs_a, &cs_b) == 4)
|
||||
{
|
||||
|
||||
side = BLIS_LEFT;
|
||||
//side = BLIS_RIGHT;
|
||||
|
||||
uploa = BLIS_LOWER;
|
||||
//uploa = BLIS_UPPER;
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
|
||||
diaga = BLIS_NONUNIT_DIAG;
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
bli_param_map_blis_to_netlib_side( side, &f77_side );
|
||||
bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );
|
||||
|
||||
if(bli_is_left(side) && ((m > cs_a) || (m > cs_b))) continue; //leading dimension should be greater than number of rows
|
||||
|
||||
if(bli_is_right(side) && ((n > cs_a) || (m > cs_b))) continue; //leading dimension should be greater than number of rows
|
||||
|
||||
if ( bli_is_left( side ) )
|
||||
bli_obj_create( dt, m, m, 1, m, &a );
|
||||
else
|
||||
bli_obj_create( dt, n, n, 1, n, &a );
|
||||
bli_obj_create( dt, m, n, 1, m, &c );
|
||||
bli_obj_create( dt, m, n, 1, m, &c_save );
|
||||
|
||||
#else
|
||||
|
||||
for ( p = p_end; p >= p_begin; p -= p_inc )
|
||||
{
|
||||
if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
|
||||
|
||||
side = BLIS_LEFT;
|
||||
//side = BLIS_RIGHT;
|
||||
|
||||
uploa = BLIS_LOWER;
|
||||
//uploa = BLIS_UPPER;
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
|
||||
diaga = BLIS_NONUNIT_DIAG;
|
||||
|
||||
bli_param_map_blis_to_netlib_side( side, &f77_side );
|
||||
bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );
|
||||
|
||||
if ( bli_is_left( side ) )
|
||||
bli_obj_create( dt, m, m, 0, 0, &a );
|
||||
else
|
||||
bli_obj_create( dt, n, n, 0, 0, &a );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
#endif
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, &a );
|
||||
bli_obj_set_uplo( uploa, &a );
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_diag( diaga, &a );
|
||||
|
||||
// Randomize A and zero the unstored triangle to ensure the
|
||||
// implementation reads only from the stored region.
|
||||
bli_randm( &a );
|
||||
bli_mktrim( &a );
|
||||
|
||||
// Load the diagonal of A to make it more likely to be invertible.
|
||||
bli_shiftd( &BLIS_TWO, &a );
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_setsc( (2.0/1.0), 1.0, &alpha );
|
||||
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef PRINT
|
||||
bli_invertd( &a );
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_invertd( &a );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_trsm( side,
|
||||
&alpha,
|
||||
&a,
|
||||
&c );
|
||||
#else
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
strsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&nn,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
dtrsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&nn,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
ctrsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&nn,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
ztrsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&nn,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%9.5f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
if ( bli_is_left( side ) )
|
||||
gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
|
||||
else
|
||||
gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_trsm_blis" );
|
||||
#else
|
||||
printf( "data_trsm_%s", BLAS );
|
||||
#endif
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
#ifdef READ_ALL_PARAMS_FROM_FILE
|
||||
|
||||
printf("%c\t %c\t %c\t %c\t %4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n",side_c, uploa_c, transa_c, diaga_c,
|
||||
(unsigned long )m, (unsigned long ) n,
|
||||
(unsigned long )cs_a, (unsigned long )cs_b,
|
||||
gflops);
|
||||
|
||||
fprintf(fout,"%c\t %c\t %c\t %c\t %4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", side_c, uploa_c, transa_c, diaga_c,
|
||||
(unsigned long )m, (unsigned long ) n,
|
||||
(unsigned long )cs_a, (unsigned long )cs_b,
|
||||
gflops);
|
||||
#else
|
||||
printf("%4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", (unsigned long )m, (unsigned long ) n,
|
||||
(unsigned long )cs_a, (unsigned long )cs_b,
|
||||
gflops);
|
||||
fprintf(fout,"%4lu\t %4lu\t %4lu\t %4lu\t %6.3f\n", (unsigned long )m, (unsigned long ) n,
|
||||
(unsigned long )cs_a, (unsigned long )cs_b,
|
||||
gflops);
|
||||
#endif
|
||||
fflush(fout);
|
||||
|
||||
#else
|
||||
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )n, gflops );
|
||||
#endif
|
||||
bli_obj_free( &alpha );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
fclose(fin);
|
||||
fclose(fout);
|
||||
#endif
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -33,7 +33,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// n alpha x incx y incy
|
||||
|
||||
@@ -33,7 +33,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// res n x incx y incy
|
||||
|
||||
@@ -32,7 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
483
test/test_gemmt.c
Normal file
483
test/test_gemmt.c
Normal file
@@ -0,0 +1,483 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
//#define CBLAS
|
||||
//#define C_STOR_R
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, k;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int m_input, k_input;
|
||||
num_t dt;
|
||||
int r, n_repeats;
|
||||
uplo_t uploc;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
f77_char f77_uploc;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 200;
|
||||
p_end = 2000;
|
||||
p_inc = 200;
|
||||
|
||||
m_input = -1;
|
||||
k_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
m_input = 5;
|
||||
k_input = 4;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
//dt = BLIS_FLOAT;
|
||||
dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
uploc = BLIS_LOWER;
|
||||
//uploc = BLIS_UPPER;
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
transb = BLIS_NO_TRANSPOSE;
|
||||
|
||||
bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
|
||||
char uplocl = tolower( f77_uploc );
|
||||
char transal = tolower( f77_transa );
|
||||
char transbl = tolower( f77_transb );
|
||||
|
||||
f77_int cbla_uploc = ( uplocl == 'l' ? CblasLower : CblasUpper );
|
||||
f77_int cbla_transa = ( transal == 'n' ? CblasNoTrans : CblasTrans );
|
||||
f77_int cbla_transb = ( transbl == 'n' ? CblasNoTrans : CblasTrans );
|
||||
|
||||
( void )cbla_uploc;
|
||||
( void )cbla_transa;
|
||||
( void )cbla_transb;
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
|
||||
#ifdef BLIS
|
||||
printf( "data_gemmt_blis" );
|
||||
#else
|
||||
printf( "data_gemmt_%s", BLAS );
|
||||
#endif
|
||||
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
//for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
for ( p = p_end; p_begin <= p; p -= p_inc )
|
||||
{
|
||||
if ( m_input < 0 ) m = p * ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
#ifndef C_STOR_R
|
||||
if ( bli_does_trans( transa ) )
|
||||
bli_obj_create( dt, k, m, 0, 0, &a );
|
||||
else
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
|
||||
if ( bli_does_trans( transb ) )
|
||||
bli_obj_create( dt, m, k, 0, 0, &b );
|
||||
else
|
||||
bli_obj_create( dt, k, m, 0, 0, &b );
|
||||
|
||||
bli_obj_create( dt, m, m, 0, 0, &c );
|
||||
bli_obj_create( dt, m, m, 0, 0, &c_save );
|
||||
#else
|
||||
if ( bli_does_trans( transa ) )
|
||||
bli_obj_create( dt, k, m, -1, -1, &a );
|
||||
else
|
||||
bli_obj_create( dt, m, k, -1, -1, &a );
|
||||
|
||||
if ( bli_does_trans( transb ) )
|
||||
bli_obj_create( dt, m, k, -1, -1, &b );
|
||||
else
|
||||
bli_obj_create( dt, k, m, -1, -1, &b );
|
||||
|
||||
bli_obj_create( dt, m, m, -1, -1, &c );
|
||||
bli_obj_create( dt, m, m, -1, -1, &c_save );
|
||||
#endif
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_uplo( uploc, &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
bli_setsc( (0.9/1.0), 0.2, &alpha );
|
||||
bli_setsc( -(1.1/1.0), 0.3, &beta );
|
||||
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "b", &b, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_gemmt( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#else
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
sgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
dgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* bp = bli_obj_buffer( &b );
|
||||
scomplex* betap = bli_obj_buffer( &beta );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* bp = bli_obj_buffer( &b );
|
||||
dcomplex* betap = bli_obj_buffer( &beta );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
zgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
|
||||
#else // #ifdef CBLAS
|
||||
|
||||
f77_int cbla_storage = ( bli_obj_is_row_stored( &c ) ? CblasRowMajor
|
||||
: CblasColMajor );
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_sgemmt( cbla_storage,
|
||||
cbla_uploc,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_dgemmt( cbla_storage,
|
||||
cbla_uploc,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* bp = bli_obj_buffer( &b );
|
||||
scomplex* betap = bli_obj_buffer( &beta );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_cgemmt( cbla_storage,
|
||||
cbla_uploc,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
kk,
|
||||
alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
betap,
|
||||
cp, ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
#ifdef C_STOR_R
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
#else
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
#endif
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* bp = bli_obj_buffer( &b );
|
||||
dcomplex* betap = bli_obj_buffer( &beta );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_zgemmt( cbla_storage,
|
||||
cbla_uploc,
|
||||
cbla_transa,
|
||||
cbla_transb,
|
||||
mm,
|
||||
kk,
|
||||
alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
betap,
|
||||
cp, ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_gemmt_blis" );
|
||||
#else
|
||||
printf( "data_gemmt_%s", BLAS );
|
||||
#endif
|
||||
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )k, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -32,7 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// transa m n alpha a lda x incx beta y incy
|
||||
|
||||
@@ -32,7 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// m n alpha x incx y incy a lda
|
||||
|
||||
@@ -32,7 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
|
||||
@@ -32,7 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// uploa m alpha a lda x incx beta y incy
|
||||
|
||||
@@ -32,7 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// uplo m alpha x incx a lda
|
||||
|
||||
@@ -32,7 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifdef WIN32
|
||||
#include <io.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include "blis.h"
|
||||
|
||||
// uplo m alpha x incx y incy a lda
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user