mirror of
https://github.com/amd/blis.git
synced 2026-06-08 14:58:42 +00:00
Merge branch 'amd' into rt
Details: - Merged contributions made by AMD via 'amd' branch (see summary below). Special thanks to AMD for their contributions to-date, especially with regard to intrinsic- and assembly-based kernels. - Added column storage output cases to microkernels in bli_gemm_zen_asm_d6x8.c and bli_gemmtrsm_l_zen_asm_d6x8.c. Even with the extra cost of transposing the microtile in registers, this is much faster than using the general storage case when the underlying matrix is column-stored. - Added s and d assembly-based zen gemmtrsm_u microkernel (including column storage optimization mentioned above). - Updated zen sub-configuration to reflect presence of new native kernels. - Temporarily reverted zen sub-configuration's level-3 cache blocksizes to smaller haswell values. - Temporarily disabled small matrix handling for zen configuration family in config/zen/bli_family_zen.h. - Updated zen CFLAGS according to changes in1e4365b. - Updated haswell microkernels such that: - only one vzeroupper instruction is called prior to returning - movapd/movupd are used in leiu of movaps/movups for double-real microkernels. (Note that single-real microkernels still use movaps/movups.) - Added kernel prototypes to kernels/zen/bli_kernels_zen.h, which is now included via frame/include/bli_arch_config.h. - Minor updates to bli_amaxv_ref.c (and to inlined "test" implementation in testsuite/src/test_amaxv.c). - Added early return for alpha == 0 in bli_dotxv_ref.c. - Integrated changes fromf07b176, including a fix for undefined behavior when executing the 1m method under certain conditions. - Updated config_registry; no longer need haswell kernels for zen sub-configuration. - Tweaked marginal and pass thresholds for dotxf. - Reformatted level-1v, -1f, and -3 amd kernels and inserted additional comments. - Updated LICENSE file to explicitly mention that parts are copyright UT-Austin and AMD. - Added AMD copyright to header templates in build/templates. Summary of previous changes from 'amd' branch. - Added s and d assembly-based zen gemm microkernels (d6x8 and d8x6) and s and d assembly-based zen gemmtrsm_l microkernels (d6x8). - Added s and d intrinsics-based zen kernels for amaxv, axpyv, dotv, dotxv, and scalv, with extra-unrolling variants for axpyv and scalv. - Added a small matrix handler to bli_gemm_front(), with the handler implemented in kernels/zen/3/bli_gemm_small_matrix.c. - Added additional logic to sumsqv that first attempts to compute the sum of the squares via dotv(). If there is a floating-point exception (FE_OVERFLOW), then the previous (numerically conservative) code is used; otherwise, the result of dotv() is square-rooted and stored as the result. This new implementation is only enabled when FE_OVERFLOW is #defined. If the macro is not #defined, then the previous implementation is used. - Added axpyv and dotv standalone test drivers to test directory. - Added zen support to old cpuid_x86.c driver in build/auto-detect/old. - Added thread-local and __attribute__-related macros to bli_macro_defs.h.
This commit is contained in:
21
LICENSE
21
LICENSE
@@ -1,3 +1,18 @@
|
||||
NOTE: Portions of this project's code are copyrighted by
|
||||
|
||||
The University of Texas at Austin
|
||||
|
||||
while other portions are copyrighted by
|
||||
|
||||
Advanced Micro Devices, Inc.
|
||||
|
||||
with some overlap. Please see file-level license headers for file-specific
|
||||
copyright info. All parties provide their portions of the code under the
|
||||
3-clause BSD license, found below.
|
||||
|
||||
---
|
||||
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -8,9 +23,9 @@ met:
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
- Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2015, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -233,6 +234,14 @@ int cpu_detect()
|
||||
else
|
||||
return CPUNAME_GENERIC; //OS don't support AVX.
|
||||
}
|
||||
case 8:
|
||||
switch (model){
|
||||
case 1:
|
||||
if(support_avx())
|
||||
return CPUNAME_ZEN;
|
||||
else
|
||||
return CPUNAME_REFERENCE; //OS don't support AVX.
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -14,7 +15,7 @@
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
- Neither the name of copyright holder(s) nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -14,7 +15,7 @@
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
- Neither the name of copyright holder(s) nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
#!/bin/sh
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -14,7 +15,7 @@
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas at Austin nor the names
|
||||
# - Neither the name of copyright holder(s) nor the names
|
||||
# of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
|
||||
@@ -46,12 +46,61 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
6,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_zen_asm_6x16, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_zen_asm_6x8, TRUE,
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_zen_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_zen_asm_6x8, TRUE,
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_zen_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_zen_asm_6x8, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
#if 0
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -59,20 +108,28 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
|
||||
//bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
BLIS_NAT, 7,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
// level-1f
|
||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Advanced Micro Devices, Inc
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -35,6 +36,19 @@
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
// By default, it is effective to parallelize the outer loops.
|
||||
// Setting these macros to 1 will force JR and IR inner loops
|
||||
// to be not paralleized.
|
||||
#define BLIS_DEFAULT_MR_THREAD_MAX 1
|
||||
#define BLIS_DEFAULT_NR_THREAD_MAX 1
|
||||
|
||||
|
||||
//#define BLIS_ENABLE_SMALL_MATRIX
|
||||
|
||||
// This will select the threshold below which small matrix code will be called.
|
||||
//#define BLIS_SMALL_MATRIX_THRES 700
|
||||
//#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
|
||||
//#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
|
||||
|
||||
|
||||
|
||||
|
||||
274
config/zen/bli_kernel.h
Normal file
274
config/zen/bli_kernel.h
Normal file
@@ -0,0 +1,274 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_KERNEL_H
|
||||
#define BLIS_KERNEL_H
|
||||
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
//
|
||||
|
||||
// threading related
|
||||
// By default it is effective to paralleize the
|
||||
// outerloops. Setting these macros to 1 will force
|
||||
// JR and NR inner loops to be not paralleized.
|
||||
#define BLIS_DEFAULT_MR_THREAD_MAX 1
|
||||
#define BLIS_DEFAULT_NR_THREAD_MAX 1
|
||||
|
||||
// sgemm micro-kernel
|
||||
|
||||
#if 0
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_24x4
|
||||
#define BLIS_DEFAULT_MC_S 264
|
||||
#define BLIS_DEFAULT_KC_S 128
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 24
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
|
||||
#define BLIS_DEFAULT_MC_S 144
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 16
|
||||
#define BLIS_DEFAULT_NR_S 6
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
|
||||
#define BLIS_DEFAULT_MC_S 144
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 4080
|
||||
#define BLIS_DEFAULT_MR_S 6
|
||||
#define BLIS_DEFAULT_NR_S 16
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
|
||||
#endif
|
||||
|
||||
// dgemm micro-kernel
|
||||
|
||||
#if 0
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_12x4
|
||||
#define BLIS_DEFAULT_MC_D 96
|
||||
#define BLIS_DEFAULT_KC_D 192
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 12
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
|
||||
#define BLIS_DEFAULT_MC_D 72
|
||||
#define BLIS_DEFAULT_KC_D 256
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 8
|
||||
#define BLIS_DEFAULT_NR_D 6
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
|
||||
#define BLIS_DEFAULT_MC_D 510 // 72 /* Improves performance for large Matrices */
|
||||
#define BLIS_DEFAULT_KC_D 1024 // 256
|
||||
#define BLIS_DEFAULT_NC_D 4080
|
||||
#define BLIS_DEFAULT_MR_D 6
|
||||
#define BLIS_DEFAULT_NR_D 8
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
// cgemm micro-kernel
|
||||
|
||||
#if 1
|
||||
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8
|
||||
#define BLIS_DEFAULT_MC_C 144
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4080
|
||||
#define BLIS_DEFAULT_MR_C 3
|
||||
#define BLIS_DEFAULT_NR_C 8
|
||||
|
||||
#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
// zgemm micro-kernel
|
||||
|
||||
#if 1
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
|
||||
#define BLIS_DEFAULT_MC_Z 72
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 4080
|
||||
#define BLIS_DEFAULT_MR_Z 3
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
|
||||
// zgemm micro-kernel
|
||||
|
||||
#if 1
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
|
||||
#define BLIS_DEFAULT_MC_Z 72
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 4080
|
||||
#define BLIS_DEFAULT_MR_Z 3
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
|
||||
#endif
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define BLIS_STRSM_L_UKERNEL bli_strsm_l_int_6x16
|
||||
#define BLIS_DTRSM_L_UKERNEL bli_dtrsm_l_int_6x8
|
||||
|
||||
// --gemmtrsm-related --
|
||||
#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_6x16
|
||||
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_6x8
|
||||
|
||||
#define BLIS_SMALL_MATRIX_ENABLE
|
||||
//This will select the threshold below which small matrix code will be called.
|
||||
#define BLIS_SMALL_MATRIX_THRES 700
|
||||
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
|
||||
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
|
||||
|
||||
gint_t bli_gemm_small_matrix
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
#define BLIS_DEFAULT_1F_S 8
|
||||
#define BLIS_DEFAULT_1F_D 4
|
||||
|
||||
|
||||
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- axpy2v --
|
||||
|
||||
// -- dotaxpyv --
|
||||
|
||||
// -- axpyf --
|
||||
#define BLIS_SAXPYF_KERNEL bli_saxpyf_int_var1
|
||||
#define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1
|
||||
|
||||
// -- dotxf --
|
||||
#define BLIS_SDOTXF_KERNEL bli_sdotxf_int_var1
|
||||
#define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1
|
||||
|
||||
// -- dotxaxpyf --
|
||||
|
||||
|
||||
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
|
||||
|
||||
// -- packm --
|
||||
|
||||
// -- unpackm --
|
||||
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
|
||||
// -- amax --
|
||||
#define BLIS_SAMAXV_KERNEL bli_samaxv_opt_var1
|
||||
#define BLIS_DAMAXV_KERNEL bli_damaxv_opt_var1
|
||||
// -- addv --
|
||||
|
||||
// -- axpyv --
|
||||
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var10
|
||||
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt_var10
|
||||
|
||||
|
||||
// -- copyv --
|
||||
|
||||
// -- dotv --
|
||||
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
|
||||
#define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1
|
||||
|
||||
// -- dotxv --
|
||||
#define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1
|
||||
#define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1
|
||||
|
||||
// -- invertv --
|
||||
|
||||
// -- scal2v --
|
||||
|
||||
// -- scalv --
|
||||
#define BLIS_SSCALV_KERNEL bli_sscalv_opt_var2
|
||||
#define BLIS_DSCALV_KERNEL bli_dscalv_opt_var2
|
||||
|
||||
// -- setv --
|
||||
|
||||
// -- subv --
|
||||
|
||||
// -- swapv --
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -62,16 +62,20 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
COPTFLAGS := -O3 -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4
|
||||
# gcc 6.3 or later:
|
||||
#CVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=znver1
|
||||
# gcc 5.4:
|
||||
# possibly add zen-specific instructions: -mclzero -madx -mrdseed -mmwaitx -msha -mxsavec -mxsaves -mclflushopt -mpopcnt
|
||||
CVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4
|
||||
CVECFLAGS := -mavx2 -mfpmath=sse -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
|
||||
else
|
||||
$(error gcc or clang are required for this configuration.)
|
||||
endif
|
||||
|
||||
@@ -21,7 +21,7 @@ knl: knl
|
||||
skx: skx
|
||||
|
||||
# AMD architectures.
|
||||
zen: zen/haswell
|
||||
zen: zen
|
||||
excavator: excavator/piledriver
|
||||
steamroller: steamroller/piledriver
|
||||
piledriver: piledriver
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -51,6 +52,11 @@ void bli_gemm_front
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX
|
||||
gint_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
|
||||
if ( status == BLIS_SUCCESS ) return;
|
||||
#endif
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
@@ -82,9 +88,9 @@ void bli_gemm_front
|
||||
|
||||
// Record the threading for each level within the context.
|
||||
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx,
|
||||
bli_obj_length( c_local ),
|
||||
bli_obj_width( c_local ),
|
||||
bli_obj_width( a_local ) );
|
||||
bli_obj_length( c_local ),
|
||||
bli_obj_width( c_local ),
|
||||
bli_obj_width( a_local ) );
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
|
||||
@@ -42,3 +42,14 @@ void bli_gemm_front
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
err_t bli_gemm_small
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -214,9 +214,9 @@ CNTX_INIT_PROTS( generic )
|
||||
|
||||
// -- AMD64 architectures --
|
||||
|
||||
//#ifdef BLIS_KERNELS_ZEN
|
||||
//#include "bli_kernels_zen.h"
|
||||
//#endif
|
||||
#ifdef BLIS_KERNELS_ZEN
|
||||
#include "bli_kernels_zen.h"
|
||||
#endif
|
||||
//#ifdef BLIS_KERNELS_EXCAVATOR
|
||||
//#include "bli_kernels_excavator.h"
|
||||
//#endif
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -64,6 +65,49 @@
|
||||
#endif
|
||||
|
||||
|
||||
// -- BLIS Thread Local Storage Keyword --
|
||||
|
||||
// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC.
|
||||
// There is a small risk here as __GNUC__ can also be defined by some other
|
||||
// compiler (other than ICC and CLANG which we know define it) that
|
||||
// doesn't support __thread, as __GNUC__ is not quite unique to GCC.
|
||||
// But the possibility of someone using such non-main-stream compiler
|
||||
// for building BLIS is low.
|
||||
#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__)
|
||||
#define BLIS_THREAD_LOCAL __thread
|
||||
#else
|
||||
#define BLIS_THREAD_LOCAL
|
||||
#endif
|
||||
|
||||
|
||||
// -- BLIS constructor/destructor function attribute --
|
||||
|
||||
// __attribute__((constructor/destructor)) is supported by GCC only.
|
||||
// There is a small risk here as __GNUC__ can also be defined by some other
|
||||
// compiler (other than ICC and CLANG which we know define it) that
|
||||
// doesn't support this, as __GNUC__ is not quite unique to GCC.
|
||||
// But the possibility of someone using such non-main-stream compiler
|
||||
// for building BLIS is low.
|
||||
|
||||
#if defined(__ICC) || defined(__INTEL_COMPILER)
|
||||
// ICC defines __GNUC__ but doesn't support this
|
||||
#define BLIS_ATTRIB_CTOR
|
||||
#define BLIS_ATTRIB_DTOR
|
||||
#elif defined(__clang__)
|
||||
// CLANG supports __attribute__, but its documentation doesn't
|
||||
// mention support for constructor/destructor. Compiling with
|
||||
// clang and testing shows that it does support.
|
||||
#define BLIS_ATTRIB_CTOR __attribute__((constructor))
|
||||
#define BLIS_ATTRIB_DTOR __attribute__((destructor))
|
||||
#elif defined(__GNUC__)
|
||||
#define BLIS_ATTRIB_CTOR __attribute__((constructor))
|
||||
#define BLIS_ATTRIB_DTOR __attribute__((destructor))
|
||||
#else
|
||||
#define BLIS_ATTRIB_CTOR
|
||||
#define BLIS_ATTRIB_DTOR
|
||||
#endif
|
||||
|
||||
|
||||
// -- Concatenation macros --
|
||||
|
||||
#define BLIS_FUNC_PREFIX_STR "bli"
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -53,7 +54,15 @@ static void* bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
|
||||
bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat },
|
||||
};
|
||||
|
||||
static bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
|
||||
//
|
||||
// NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2.
|
||||
//
|
||||
// BLIS provides APIs to modify this state during runtime. So, one application thread
|
||||
// can modify the state, before another starts the corresponding BLIS operation.
|
||||
// This is solved by making the induced method status array local to threads.
|
||||
|
||||
static BLIS_THREAD_LOCAL
|
||||
bool_t bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
|
||||
{
|
||||
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */
|
||||
/* c z */
|
||||
|
||||
@@ -82,9 +82,9 @@ void PASTEMAC(opname,imeth) \
|
||||
\
|
||||
/* Query a context for the current induced method. This context is
|
||||
managed and cached by the gks and should not be freed by the caller.
|
||||
Note that we pass in the datatype because it will be needed when
|
||||
bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
|
||||
family of functions. */ \
|
||||
Note that the datatype argument is needed because it will be passed
|
||||
in when bli_gks_query_ind_cntx() eventually calls the induced method's
|
||||
_cntx_init() function. */ \
|
||||
cntx = bli_gks_query_ind_cntx( ind, dt ); \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
@@ -162,9 +162,9 @@ void PASTEMAC(opname,imeth) \
|
||||
\
|
||||
/* Query a context for the current induced method. This context is
|
||||
managed and cached by the gks and should not be freed by the caller.
|
||||
Note that we pass in the datatype because it will be needed when
|
||||
bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
|
||||
family of functions. */ \
|
||||
Note that the datatype argument is needed because it will be passed
|
||||
in when bli_gks_query_ind_cntx() eventually calls the induced method's
|
||||
_cntx_init() function. */ \
|
||||
cntx = bli_gks_query_ind_cntx( ind, dt ); \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
@@ -240,9 +240,9 @@ void PASTEMAC(opname,imeth) \
|
||||
\
|
||||
/* Query a context for the current induced method. This context is
|
||||
managed and cached by the gks and should not be freed by the caller.
|
||||
Note that we pass in the datatype because it will be needed when
|
||||
bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
|
||||
family of functions. */ \
|
||||
Note that the datatype argument is needed because it will be passed
|
||||
in when bli_gks_query_ind_cntx() eventually calls the induced method's
|
||||
_cntx_init() function. */ \
|
||||
cntx = bli_gks_query_ind_cntx( ind, dt ); \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
@@ -309,9 +309,9 @@ void PASTEMAC(opname,imeth) \
|
||||
\
|
||||
/* Query a context for the current induced method. This context is
|
||||
managed and cached by the gks and should not be freed by the caller.
|
||||
Note that we pass in the datatype because it will be needed when
|
||||
bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
|
||||
family of functions. */ \
|
||||
Note that the datatype argument is needed because it will be passed
|
||||
in when bli_gks_query_ind_cntx() eventually calls the induced method's
|
||||
_cntx_init() function. */ \
|
||||
cntx = bli_gks_query_ind_cntx( ind, dt ); \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
@@ -364,9 +364,9 @@ void PASTEMAC(opname,imeth) \
|
||||
\
|
||||
/* Query a context for the current induced method. This context is
|
||||
managed and cached by the gks and should not be freed by the caller.
|
||||
Note that we pass in the datatype because it will be needed when
|
||||
bli_gks_query_ind_cntx() eventually calls the bli_ind_cntx_init()
|
||||
family of functions. */ \
|
||||
Note that the datatype argument is needed because it will be passed
|
||||
in when bli_gks_query_ind_cntx() eventually calls the induced method's
|
||||
_cntx_init() function. */ \
|
||||
cntx = bli_gks_query_ind_cntx( ind, dt ); \
|
||||
\
|
||||
{ \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -33,6 +34,7 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <fenv.h>
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with typed operands.
|
||||
@@ -293,7 +295,129 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
|
||||
//INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
|
||||
GENTFUNCR( scomplex, float, c, s, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
GENTFUNCR( dcomplex, double, z, d, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
|
||||
#undef GENTFUNCR
|
||||
#ifdef FE_OVERFLOW
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype_r* norm, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* zero = PASTEMAC(chr,0); \
|
||||
ctype_r* one = PASTEMAC(chr,1); \
|
||||
ctype_r scale; \
|
||||
ctype_r sumsq; \
|
||||
ctype_r sqrt_sumsq; \
|
||||
\
|
||||
/* Initialize scale and sumsq to begin the summation. */ \
|
||||
PASTEMAC(chr,copys)( *zero, scale ); \
|
||||
PASTEMAC(chr,copys)( *one, sumsq ); \
|
||||
\
|
||||
/* An optimization: first try to use dotv to compute the sum of
|
||||
the squares of the vector. If no floating-point exceptions
|
||||
(specifically, overflow and invalid exceptions) were produced,
|
||||
then we accept the computed value and returne early. The cost
|
||||
of this optimization is the "sunk" cost of the initial dotv
|
||||
when sumsqv must be used instead. However, we expect that the
|
||||
vast majority of use cases will not produce exceptions, and
|
||||
therefore only one pass through the data, via dotv, will be
|
||||
required. */ \
|
||||
if ( TRUE ) \
|
||||
{ \
|
||||
int f_exp_raised;\
|
||||
ctype sumsqc; \
|
||||
\
|
||||
feclearexcept( FE_ALL_EXCEPT );\
|
||||
\
|
||||
PASTEMAC(ch,dotv)\
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
n,\
|
||||
x, incx, \
|
||||
x, incx, \
|
||||
&sumsqc, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
PASTEMAC2(ch,chr,copys)( sumsqc, sumsq ); \
|
||||
\
|
||||
f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID );\
|
||||
\
|
||||
if ( !f_exp_raised ) \
|
||||
{ \
|
||||
PASTEMAC(chr,sqrt2s)( sumsq, *norm ); \
|
||||
return; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Compute the sum of the squares of the vector. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
n, \
|
||||
x, incx, \
|
||||
&scale, \
|
||||
&sumsq, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Compute: norm = scale * sqrt( sumsq ) */ \
|
||||
PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
|
||||
PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
|
||||
\
|
||||
/* Store the final value to the output variable. */ \
|
||||
PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
|
||||
}
|
||||
#else
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* x, inc_t incx, \
|
||||
ctype_r* norm, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* zero = PASTEMAC(chr,0); \
|
||||
ctype_r* one = PASTEMAC(chr,1); \
|
||||
ctype_r scale; \
|
||||
ctype_r sumsq; \
|
||||
ctype_r sqrt_sumsq; \
|
||||
\
|
||||
/* Initialize scale and sumsq to begin the summation. */ \
|
||||
PASTEMAC(chr,copys)( *zero, scale ); \
|
||||
PASTEMAC(chr,copys)( *one, sumsq ); \
|
||||
\
|
||||
/* Compute the sum of the squares of the vector. */ \
|
||||
\
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
n, \
|
||||
x, incx, \
|
||||
&scale, \
|
||||
&sumsq, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Compute: norm = scale * sqrt( sumsq ) */ \
|
||||
PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
|
||||
PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
|
||||
\
|
||||
/* Store the final value to the output variable. */ \
|
||||
PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
|
||||
}
|
||||
#endif
|
||||
GENTFUNCR( float, float, s, s, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
GENTFUNCR( double, double, d, d, normfv_unb_var1, sumsqv_unb_var1 )
|
||||
|
||||
|
||||
#undef GENTFUNCR
|
||||
@@ -898,7 +1022,7 @@ void PASTEMAC(ch,varname) \
|
||||
n_elem - 1, \
|
||||
&beta, \
|
||||
x0, incx, \
|
||||
cntx \
|
||||
cntx \
|
||||
); \
|
||||
*/ \
|
||||
} \
|
||||
@@ -937,7 +1061,7 @@ void PASTEMAC(ch,varname) \
|
||||
n_elem - 1, \
|
||||
&beta, \
|
||||
x2, incx, \
|
||||
cntx \
|
||||
cntx \
|
||||
); \
|
||||
*/ \
|
||||
} \
|
||||
|
||||
@@ -634,8 +634,6 @@ void bli_sgemm_haswell_asm_24x4
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -1257,8 +1255,6 @@ void bli_dgemm_haswell_asm_12x4
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
|
||||
@@ -600,8 +600,6 @@ void bli_sgemm_haswell_asm_4x24
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -1019,46 +1017,46 @@ void bli_dgemm_haswell_asm_4x12
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
"vfmadd231pd (%%r12), %%ymm3, %%ymm6 \n\t"
|
||||
"vmovups %%ymm6, (%%r12) \n\t"
|
||||
"vmovupd %%ymm6, (%%r12) \n\t"
|
||||
"addq %%rdi, %%r12 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm7 \n\t"
|
||||
"vmovups %%ymm7, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm7, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm8 \n\t"
|
||||
"vmovups %%ymm8, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm8, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
"vfmadd231pd (%%r12), %%ymm3, %%ymm9 \n\t"
|
||||
"vmovups %%ymm9, (%%r12) \n\t"
|
||||
"vmovupd %%ymm9, (%%r12) \n\t"
|
||||
"addq %%rdi, %%r12 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
"vfmadd231pd (%%r12), %%ymm3, %%ymm12 \n\t"
|
||||
"vmovups %%ymm12, (%%r12) \n\t"
|
||||
"vmovupd %%ymm12, (%%r12) \n\t"
|
||||
"addq %%rdi, %%r12 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm13 \n\t"
|
||||
"vmovups %%ymm13, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm13, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm14 \n\t"
|
||||
"vmovups %%ymm14, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm14, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
"vfmadd231pd (%%r12), %%ymm3, %%ymm15 \n\t"
|
||||
"vmovups %%ymm15, (%%r12) \n\t"
|
||||
"vmovupd %%ymm15, (%%r12) \n\t"
|
||||
//"addq %%rdi, %%r12 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1190,8 +1188,6 @@ void bli_dgemm_haswell_asm_4x12
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
|
||||
@@ -595,8 +595,6 @@ void bli_sgemm_haswell_asm_6x16
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -634,10 +632,10 @@ void bli_sgemm_haswell_asm_6x16
|
||||
"vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \
|
||||
"vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\
|
||||
"vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovlpd (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovhpd (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovlpd (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovhpd (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \
|
||||
"vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/
|
||||
|
||||
#define DGEMM_OUTPUT_GS_BETA_NZ \
|
||||
@@ -682,8 +680,8 @@ void bli_dgemm_haswell_asm_6x8
|
||||
" \n\t"
|
||||
"addq $32 * 4, %%rbx \n\t"
|
||||
" \n\t" // initialize loop by pre-loading
|
||||
"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rcx \n\t" // load address of c
|
||||
"movq %7, %%rdi \n\t" // load rs_c
|
||||
@@ -734,12 +732,10 @@ void bli_dgemm_haswell_asm_6x8
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps -2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 1
|
||||
"prefetcht0 72 * 8(%%rax) \n\t"
|
||||
" \n\t"
|
||||
"vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t"
|
||||
"vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t"
|
||||
"vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
|
||||
@@ -761,11 +757,11 @@ void bli_dgemm_haswell_asm_6x8
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 0 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps 1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd 0 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd 1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 2
|
||||
"prefetcht0 80 * 8(%%rax) \n\t"
|
||||
"prefetcht0 76 * 8(%%rax) \n\t"
|
||||
" \n\t"
|
||||
"vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t"
|
||||
"vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t"
|
||||
@@ -788,8 +784,8 @@ void bli_dgemm_haswell_asm_6x8
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps 3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd 2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd 3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 3
|
||||
"vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t"
|
||||
@@ -816,8 +812,8 @@ void bli_dgemm_haswell_asm_6x8
|
||||
"addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr)
|
||||
"addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -864,8 +860,8 @@ void bli_dgemm_haswell_asm_6x8
|
||||
"addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr)
|
||||
"addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -1008,50 +1004,50 @@ void bli_dgemm_haswell_asm_6x8
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1070,64 +1066,64 @@ void bli_dgemm_haswell_asm_6x8
|
||||
".DGENSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm4, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm4, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm6, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm6, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm8, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm8, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm10, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm10, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm12, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm14, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm14, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm5, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm5, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm7, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm7, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm9, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm9, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm11, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm11, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm13, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm13, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm15, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm15, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1139,38 +1135,38 @@ void bli_dgemm_haswell_asm_6x8
|
||||
".DROWSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1179,8 +1175,6 @@ void bli_dgemm_haswell_asm_6x8
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -1710,8 +1704,6 @@ void bli_cgemm_haswell_asm_3x8
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".CDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -1761,7 +1753,7 @@ void bli_cgemm_haswell_asm_3x8
|
||||
"vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \
|
||||
|
||||
#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
|
||||
"vmovups (%%rcx), %%ymm0 \n\t" \
|
||||
"vmovupd (%%rcx), %%ymm0 \n\t" \
|
||||
"vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \
|
||||
"vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \
|
||||
"vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \
|
||||
@@ -1803,8 +1795,8 @@ void bli_zgemm_haswell_asm_3x4
|
||||
" \n\t"
|
||||
"addq $32 * 4, %%rbx \n\t"
|
||||
" \n\t" // initialize loop by pre-loading
|
||||
"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rcx \n\t" // load address of c
|
||||
"movq %7, %%rdi \n\t" // load rs_c
|
||||
@@ -1854,8 +1846,8 @@ void bli_zgemm_haswell_asm_3x4
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps -2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 1
|
||||
"vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t"
|
||||
@@ -1879,8 +1871,8 @@ void bli_zgemm_haswell_asm_3x4
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 0 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps 1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd 0 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd 1 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 2
|
||||
"prefetcht0 38 * 16(%%rax) \n\t"
|
||||
@@ -1906,8 +1898,8 @@ void bli_zgemm_haswell_asm_3x4
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps 3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd 2 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd 3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 3
|
||||
"vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t"
|
||||
@@ -1934,8 +1926,8 @@ void bli_zgemm_haswell_asm_3x4
|
||||
"addq $4 * 3 * 16, %%rax \n\t" // a += 4*3 (unroll x mr)
|
||||
"addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -1982,8 +1974,8 @@ void bli_zgemm_haswell_asm_3x4
|
||||
"addq $1 * 3 * 16, %%rax \n\t" // a += 1*3 (unroll x mr)
|
||||
"addq $1 * 4 * 16, %%rbx \n\t" // b += 1*4 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -2186,34 +2178,34 @@ void bli_zgemm_haswell_asm_3x4
|
||||
".ZGENSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm4, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm4, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm5, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm5, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm8, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm8, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm9, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm9, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm12, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm13, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm13, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -2225,14 +2217,14 @@ void bli_zgemm_haswell_asm_3x4
|
||||
".ZROWSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t"
|
||||
"vmovupd %%ymm4, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm5, (%%rcx,%%rdx,1) \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm8, (%%r11) \n\t"
|
||||
"vmovups %%ymm9, (%%r11,%%rdx,1) \n\t"
|
||||
"vmovupd %%ymm8, (%%r11) \n\t"
|
||||
"vmovupd %%ymm9, (%%r11,%%rdx,1) \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm12, (%%r12) \n\t"
|
||||
"vmovups %%ymm13, (%%r12,%%rdx,1) \n\t"
|
||||
"vmovupd %%ymm12, (%%r12) \n\t"
|
||||
"vmovupd %%ymm13, (%%r12,%%rdx,1) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -2240,8 +2232,6 @@ void bli_zgemm_haswell_asm_3x4
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".ZDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
|
||||
@@ -596,8 +596,6 @@ void bli_sgemm_haswell_asm_16x6
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -632,10 +630,10 @@ void bli_sgemm_haswell_asm_16x6
|
||||
"vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \
|
||||
"vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\
|
||||
"vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovlpd (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovhpd (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \
|
||||
"vmovlpd (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \
|
||||
"vmovhpd (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \
|
||||
"vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/
|
||||
|
||||
#define DGEMM_OUTPUT_GS_BETA_NZ \
|
||||
@@ -680,8 +678,8 @@ void bli_dgemm_haswell_asm_8x6
|
||||
" \n\t"
|
||||
"addq $32 * 4, %%rax \n\t"
|
||||
" \n\t" // initialize loop by pre-loading
|
||||
"vmovaps -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rcx \n\t" // load address of c
|
||||
"movq %8, %%rdi \n\t" // load cs_c
|
||||
@@ -732,8 +730,8 @@ void bli_dgemm_haswell_asm_8x6
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps -2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -1 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -1 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 1
|
||||
"vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t"
|
||||
@@ -757,8 +755,8 @@ void bli_dgemm_haswell_asm_8x6
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd 0 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd 1 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 2
|
||||
"prefetcht0 76 * 8(%%rax) \n\t"
|
||||
@@ -784,8 +782,8 @@ void bli_dgemm_haswell_asm_8x6
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd 2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd 3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 3
|
||||
"vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t"
|
||||
@@ -812,8 +810,8 @@ void bli_dgemm_haswell_asm_8x6
|
||||
"addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr)
|
||||
"addq $4 * 6 * 8, %%rbx \n\t" // b += 4*6 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -860,8 +858,8 @@ void bli_dgemm_haswell_asm_8x6
|
||||
"addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr)
|
||||
"addq $1 * 6 * 8, %%rbx \n\t" // b += 1*6 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -1006,50 +1004,50 @@ void bli_dgemm_haswell_asm_8x6
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1068,32 +1066,32 @@ void bli_dgemm_haswell_asm_8x6
|
||||
".DGENSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm4, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm4, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm6, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm6, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm8, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm8, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm10, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm10, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm12, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm14, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm14, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
@@ -1101,32 +1099,32 @@ void bli_dgemm_haswell_asm_8x6
|
||||
"movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm5, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm5, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm7, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm7, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm9, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm9, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm11, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm11, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm13, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm13, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm15, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm15, %%ymm0 \n\t"
|
||||
DGEMM_OUTPUT_GS_BETA_NZ
|
||||
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
@@ -1139,38 +1137,38 @@ void bli_dgemm_haswell_asm_8x6
|
||||
".DCOLSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
"vmovupd %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1180,8 +1178,6 @@ void bli_dgemm_haswell_asm_8x6
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -1711,8 +1707,6 @@ void bli_cgemm_haswell_asm_8x3
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".CDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
@@ -1804,8 +1798,8 @@ void bli_zgemm_haswell_asm_4x3
|
||||
" \n\t"
|
||||
"addq $32 * 4, %%rax \n\t"
|
||||
" \n\t" // initialize loop by pre-loading
|
||||
"vmovaps -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rcx \n\t" // load address of c
|
||||
"movq %8, %%rdi \n\t" // load cs_c
|
||||
@@ -1855,8 +1849,8 @@ void bli_zgemm_haswell_asm_4x3
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps -2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -1 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -1 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 1
|
||||
"vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t"
|
||||
@@ -1880,8 +1874,8 @@ void bli_zgemm_haswell_asm_4x3
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd 0 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd 1 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 2
|
||||
"prefetcht0 38 * 16(%%rax) \n\t"
|
||||
@@ -1907,8 +1901,8 @@ void bli_zgemm_haswell_asm_4x3
|
||||
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
|
||||
"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd 2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd 3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 3
|
||||
"vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t"
|
||||
@@ -1935,8 +1929,8 @@ void bli_zgemm_haswell_asm_4x3
|
||||
"addq $4 * 4 * 16, %%rax \n\t" // a += 4*4 (unroll x mr)
|
||||
"addq $4 * 3 * 16, %%rbx \n\t" // b += 4*3 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -1983,8 +1977,8 @@ void bli_zgemm_haswell_asm_4x3
|
||||
"addq $1 * 4 * 16, %%rax \n\t" // a += 1*4 (unroll x mr)
|
||||
"addq $1 * 3 * 16, %%rbx \n\t" // b += 1*3 (unroll x nr)
|
||||
" \n\t"
|
||||
"vmovaps -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovaps -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vmovapd -4 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vmovapd -3 * 32(%%rax), %%ymm1 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
@@ -2187,34 +2181,34 @@ void bli_zgemm_haswell_asm_4x3
|
||||
".ZGENSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm4, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm4, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"addq %%rdx, %%rcx \n\t" // c += 2*rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm5, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm5, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm8, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm8, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"addq %%rdx, %%rcx \n\t" // c += 2*rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm9, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm9, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm12, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
"addq %%rdx, %%rcx \n\t" // c += 2*rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm13, %%ymm0 \n\t"
|
||||
"vmovapd %%ymm13, %%ymm0 \n\t"
|
||||
ZGEMM_OUTPUT_GS
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -2226,14 +2220,14 @@ void bli_zgemm_haswell_asm_4x3
|
||||
".ZCOLSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t"
|
||||
"vmovupd %%ymm4, (%%rcx) \n\t"
|
||||
"vmovupd %%ymm5, (%%rcx,%%rdx,1) \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm8, (%%r11) \n\t"
|
||||
"vmovups %%ymm9, (%%r11,%%rdx,1) \n\t"
|
||||
"vmovupd %%ymm8, (%%r11) \n\t"
|
||||
"vmovupd %%ymm9, (%%r11,%%rdx,1) \n\t"
|
||||
" \n\t"
|
||||
"vmovups %%ymm12, (%%r12) \n\t"
|
||||
"vmovups %%ymm13, (%%r12,%%rdx,1) \n\t"
|
||||
"vmovupd %%ymm12, (%%r12) \n\t"
|
||||
"vmovupd %%ymm13, (%%r12,%%rdx,1) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -2241,8 +2235,6 @@ void bli_zgemm_haswell_asm_4x3
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".ZDONE: \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
"vzeroupper \n\t"
|
||||
" \n\t"
|
||||
|
||||
484
kernels/zen/1/bli_amaxv_zen_int.c
Normal file
484
kernels/zen/1/bli_amaxv_zen_int.c
Normal file
@@ -0,0 +1,484 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016, Advanced Micro Devices, Inc
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
typedef union
|
||||
{
|
||||
__m128 v;
|
||||
float f[4];
|
||||
} v4sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
}v4df_t;
|
||||
|
||||
typedef union
|
||||
{
|
||||
__m128d v;
|
||||
double d[2];
|
||||
}v2dd_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_samaxv_zen_int
|
||||
(
|
||||
dim_t n,
|
||||
float* x, inc_t incx,
|
||||
dim_t* i_max,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
float* minus_one = PASTEMAC(s,m1);
|
||||
dim_t* zero_i = PASTEMAC(i,0);
|
||||
|
||||
float chi1_r;
|
||||
//float chi1_i;
|
||||
float abs_chi1;
|
||||
float abs_chi1_max;
|
||||
dim_t i_max_l;
|
||||
dim_t i;
|
||||
|
||||
/* If the vector length is zero, return early. This directly emulates
|
||||
the behavior of netlib BLAS's i?amax() routines. */
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(i,copys)( *zero_i, *i_max );
|
||||
return;
|
||||
}
|
||||
|
||||
/* Initialize the index of the maximum absolute value to zero. */
|
||||
PASTEMAC(i,copys)( *zero_i, i_max_l );
|
||||
|
||||
/* Initialize the maximum absolute value search candidate with
|
||||
-1, which is guaranteed to be less than all values we will
|
||||
compute. */
|
||||
PASTEMAC(s,copys)( *minus_one, abs_chi1_max );
|
||||
|
||||
// For non-unit strides, or very small vector lengths, compute with
|
||||
// scalar code.
|
||||
if ( incx != 1 || n < 8 )
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
float* chi1 = x + (i )*incx;
|
||||
|
||||
/* Get the real and imaginary components of chi1. */
|
||||
chi1_r = *chi1;
|
||||
|
||||
/* Replace chi1_r and chi1_i with their absolute values. */
|
||||
chi1_r = fabsf( chi1_r );
|
||||
|
||||
/* Add the real and imaginary absolute values together. */
|
||||
abs_chi1 = chi1_r;
|
||||
|
||||
/* If the absolute value of the current element exceeds that of
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
dim_t n_iter, n_left;
|
||||
dim_t num_vec_elements = 8;
|
||||
v8sf_t x_vec, max_vec, maxInx_vec, mask_vec;
|
||||
v8sf_t idx_vec, inc_vec;
|
||||
v8sf_t sign_mask;
|
||||
|
||||
v4sf_t max_vec_lo, max_vec_hi, mask_vec_lo;
|
||||
v4sf_t maxInx_vec_lo, maxInx_vec_hi;
|
||||
|
||||
n_iter = n / num_vec_elements;
|
||||
n_left = n % num_vec_elements;
|
||||
|
||||
idx_vec.v = _mm256_set_ps( 7, 6, 5, 4, 3, 2, 1, 0 );
|
||||
inc_vec.v = _mm256_set1_ps( 8 );
|
||||
max_vec.v = _mm256_set1_ps( -1 );
|
||||
maxInx_vec.v = _mm256_setzero_ps();
|
||||
sign_mask.v = _mm256_set1_ps( -0.f );
|
||||
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
x_vec.v = _mm256_loadu_ps( x );
|
||||
|
||||
// Get the absolute value of the vector element.
|
||||
x_vec.v = _mm256_andnot_ps( sign_mask.v, x_vec.v );
|
||||
|
||||
mask_vec.v = _mm256_cmp_ps( x_vec.v, max_vec.v, _CMP_GT_OS );
|
||||
|
||||
max_vec.v = _mm256_blendv_ps( max_vec.v, x_vec.v, mask_vec.v );
|
||||
maxInx_vec.v = _mm256_blendv_ps( maxInx_vec.v, idx_vec.v, mask_vec.v );
|
||||
|
||||
idx_vec.v += inc_vec.v;
|
||||
x += num_vec_elements;
|
||||
}
|
||||
|
||||
max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 );
|
||||
max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 );
|
||||
mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 );
|
||||
maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 );
|
||||
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 );
|
||||
maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 );
|
||||
mask_vec_lo.v = _mm_cmp_ps( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
if ( max_vec_lo.f[0] > max_vec_lo.f[1] )
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.f[0];
|
||||
i_max_l = maxInx_vec_lo.f[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.f[1];
|
||||
i_max_l = maxInx_vec_lo.f[1];
|
||||
}
|
||||
|
||||
for ( i = n - n_left; i < n; i++ )
|
||||
{
|
||||
float* chi1 = x;
|
||||
|
||||
/* Get the real and imaginary components of chi1. */
|
||||
chi1_r = *chi1;
|
||||
|
||||
/* Replace chi1_r and chi1_i with their absolute values. */
|
||||
abs_chi1 = fabsf( chi1_r );
|
||||
|
||||
/* If the absolute value of the current element exceeds that of
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
}
|
||||
|
||||
x += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
/* Store final index to output variable. */
|
||||
*i_max = i_max_l;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_damaxv_zen_int
|
||||
(
|
||||
dim_t n,
|
||||
double* x, inc_t incx,
|
||||
dim_t* i_max,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
double* minus_one = PASTEMAC(d,m1);
|
||||
dim_t* zero_i = PASTEMAC(i,0);
|
||||
|
||||
double chi1_r;
|
||||
//double chi1_i;
|
||||
double abs_chi1;
|
||||
double abs_chi1_max;
|
||||
dim_t i_max_l;
|
||||
dim_t i;
|
||||
|
||||
/* If the vector length is zero, return early. This directly emulates
|
||||
the behavior of netlib BLAS's i?amax() routines. */
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(i,copys)( *zero_i, *i_max );
|
||||
return;
|
||||
}
|
||||
|
||||
/* Initialize the index of the maximum absolute value to zero. */ \
|
||||
PASTEMAC(i,copys)( *zero_i, i_max_l );
|
||||
|
||||
/* Initialize the maximum absolute value search candidate with
|
||||
-1, which is guaranteed to be less than all values we will
|
||||
compute. */
|
||||
PASTEMAC(d,copys)( *minus_one, abs_chi1_max );
|
||||
|
||||
// For non-unit strides, or very small vector lengths, compute with
|
||||
// scalar code.
|
||||
if ( incx != 1 || n < 4 )
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
double* chi1 = x + (i )*incx;
|
||||
|
||||
/* Get the real and imaginary components of chi1. */
|
||||
chi1_r = *chi1;
|
||||
|
||||
/* Replace chi1_r and chi1_i with their absolute values. */
|
||||
chi1_r = fabs( chi1_r );
|
||||
|
||||
/* Add the real and imaginary absolute values together. */
|
||||
abs_chi1 = chi1_r;
|
||||
|
||||
/* If the absolute value of the current element exceeds that of
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
dim_t n_iter, n_left;
|
||||
dim_t num_vec_elements = 4;
|
||||
v4df_t x_vec, max_vec, maxInx_vec, mask_vec;
|
||||
v4df_t idx_vec, inc_vec;
|
||||
v4df_t sign_mask;
|
||||
|
||||
v2dd_t max_vec_lo, max_vec_hi, mask_vec_lo;
|
||||
v2dd_t maxInx_vec_lo, maxInx_vec_hi;
|
||||
|
||||
n_iter = n / num_vec_elements;
|
||||
n_left = n % num_vec_elements;
|
||||
|
||||
idx_vec.v = _mm256_set_pd( 3, 2, 1, 0 );
|
||||
inc_vec.v = _mm256_set1_pd( 4 );
|
||||
max_vec.v = _mm256_set1_pd( -1 );
|
||||
maxInx_vec.v = _mm256_setzero_pd();
|
||||
sign_mask.v = _mm256_set1_pd( -0.f );
|
||||
|
||||
for ( i = 0; i < n_iter; ++i )
|
||||
{
|
||||
x_vec.v = _mm256_loadu_pd( x );
|
||||
|
||||
// Get the absolute value of the vector element.
|
||||
x_vec.v = _mm256_andnot_pd( sign_mask.v, x_vec.v );
|
||||
|
||||
mask_vec.v = _mm256_cmp_pd( x_vec.v, max_vec.v, _CMP_GT_OS );
|
||||
|
||||
max_vec.v = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v );
|
||||
maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v );
|
||||
|
||||
idx_vec.v += inc_vec.v;
|
||||
x += num_vec_elements;
|
||||
}
|
||||
|
||||
max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 );
|
||||
max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 );
|
||||
mask_vec_lo.v = _mm_cmp_pd( max_vec_hi.v, max_vec_lo.v, _CMP_GT_OS );
|
||||
|
||||
max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 );
|
||||
maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 );
|
||||
maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
|
||||
|
||||
if ( max_vec_lo.d[0] > max_vec_lo.d[1] )
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.d[0];
|
||||
i_max_l = maxInx_vec_lo.d[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
abs_chi1_max = max_vec_lo.d[1];
|
||||
i_max_l = maxInx_vec_lo.d[1];
|
||||
}
|
||||
|
||||
for ( i = n - n_left; i < n; i++ )
|
||||
{
|
||||
double* chi1 = x;
|
||||
|
||||
/* Get the real and imaginary components of chi1. */
|
||||
chi1_r = *chi1;
|
||||
|
||||
/* Replace chi1_r and chi1_i with their absolute values. */
|
||||
abs_chi1 = fabs( chi1_r );
|
||||
|
||||
/* If the absolute value of the current element exceeds that of
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */
|
||||
if ( abs_chi1_max < abs_chi1 || isnan( abs_chi1 ) )
|
||||
{
|
||||
abs_chi1_max = abs_chi1;
|
||||
i_max_l = i;
|
||||
}
|
||||
|
||||
x += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
/* Store final index to output variable. */
|
||||
*i_max = i_max_l;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#if 0
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
dim_t n, \
|
||||
ctype* x, inc_t incx, \
|
||||
dim_t* i_max, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* minus_one = PASTEMAC(chr,m1); \
|
||||
dim_t* zero_i = PASTEMAC(i,0); \
|
||||
\
|
||||
ctype_r chi1_r; \
|
||||
ctype_r chi1_i; \
|
||||
ctype_r abs_chi1; \
|
||||
ctype_r abs_chi1_max; \
|
||||
dim_t i; \
|
||||
\
|
||||
/* Initialize the index of the maximum absolute value to zero. */ \
|
||||
PASTEMAC(i,copys)( zero_i, *i_max ); \
|
||||
\
|
||||
/* If the vector length is zero, return early. This directly emulates
|
||||
the behavior of netlib BLAS's i?amax() routines. */ \
|
||||
if ( bli_zero_dim1( n ) ) return; \
|
||||
\
|
||||
/* Initialize the maximum absolute value search candidate with
|
||||
-1, which is guaranteed to be less than all values we will
|
||||
compute. */ \
|
||||
PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \
|
||||
\
|
||||
if ( incx == 1 ) \
|
||||
{ \
|
||||
for ( i = 0; i < n; ++i ) \
|
||||
{ \
|
||||
/* Get the real and imaginary components of chi1. */ \
|
||||
PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \
|
||||
\
|
||||
/* Replace chi1_r and chi1_i with their absolute values. */ \
|
||||
PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
|
||||
PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \
|
||||
\
|
||||
/* Add the real and imaginary absolute values together. */ \
|
||||
PASTEMAC(chr,set0s)( abs_chi1 ); \
|
||||
PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \
|
||||
PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \
|
||||
\
|
||||
/* If the absolute value of the current element exceeds that of
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */ \
|
||||
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
|
||||
{ \
|
||||
abs_chi1_max = abs_chi1; \
|
||||
*i_max = i; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( i = 0; i < n; ++i ) \
|
||||
{ \
|
||||
ctype* chi1 = x + (i )*incx; \
|
||||
\
|
||||
/* Get the real and imaginary components of chi1. */ \
|
||||
PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
|
||||
\
|
||||
/* Replace chi1_r and chi1_i with their absolute values. */ \
|
||||
PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
|
||||
PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \
|
||||
\
|
||||
/* Add the real and imaginary absolute values together. */ \
|
||||
PASTEMAC(chr,set0s)( abs_chi1 ); \
|
||||
PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \
|
||||
PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \
|
||||
\
|
||||
/* If the absolute value of the current element exceeds that of
|
||||
the previous largest, save it and its index. If NaN is
|
||||
encountered, then treat it the same as if it were a valid
|
||||
value that was smaller than any previously seen. This
|
||||
behavior mimics that of LAPACK's ?lange(). */ \
|
||||
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
|
||||
{ \
|
||||
abs_chi1_max = abs_chi1; \
|
||||
*i_max = i; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
GENTFUNCR( scomplex, float, c, s, amaxv_zen_int )
|
||||
GENTFUNCR( dcomplex, double, z, d, amaxv_zen_int )
|
||||
#endif
|
||||
262
kernels/zen/1/bli_axpyv_zen_int.c
Normal file
262
kernels/zen/1/bli_axpyv_zen_int.c
Normal file
@@ -0,0 +1,262 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_saxpyv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
float* restrict x0;
|
||||
float* restrict y0;
|
||||
|
||||
v8sf_t alphav;
|
||||
v8sf_t x0v, x1v, x2v, x3v;
|
||||
v8sf_t y0v, y1v, y2v, y3v;
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav.v = _mm256_broadcast_ss( alpha );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
|
||||
y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
|
||||
y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
|
||||
y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_ps( alphav.v, x0v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_ps( alphav.v, x1v.v, y1v.v );
|
||||
y2v.v = _mm256_fmadd_ps( alphav.v, x2v.v, y2v.v );
|
||||
y3v.v = _mm256_fmadd_ps( alphav.v, x3v.v, y3v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v );
|
||||
_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), y2v.v );
|
||||
_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), y3v.v );
|
||||
|
||||
x0 += n_elem_per_reg * n_iter_unroll;
|
||||
y0 += n_elem_per_reg * n_iter_unroll;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
const float alphac = *alpha;
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
const float x0c = *x0;
|
||||
|
||||
*y0 += alphac * x0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_daxpyv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
double* restrict x0;
|
||||
double* restrict y0;
|
||||
|
||||
v4df_t alphav;
|
||||
v4df_t x0v, x1v, x2v, x3v;
|
||||
v4df_t y0v, y1v, y2v, y3v;
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav.v = _mm256_broadcast_sd( alpha );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
|
||||
y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
|
||||
y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_pd( alphav.v, x0v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( alphav.v, x1v.v, y1v.v );
|
||||
y2v.v = _mm256_fmadd_pd( alphav.v, x2v.v, y2v.v );
|
||||
y3v.v = _mm256_fmadd_pd( alphav.v, x3v.v, y3v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v );
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), y2v.v );
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), y3v.v );
|
||||
|
||||
x0 += n_elem_per_reg * n_iter_unroll;
|
||||
y0 += n_elem_per_reg * n_iter_unroll;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
const double alphac = *alpha;
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
const double x0c = *x0;
|
||||
|
||||
*y0 += alphac * x0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
466
kernels/zen/1/bli_axpyv_zen_int10.c
Normal file
466
kernels/zen/1/bli_axpyv_zen_int10.c
Normal file
@@ -0,0 +1,466 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_saxpyv_zen_int10
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
|
||||
dim_t i;
|
||||
|
||||
float* restrict x0;
|
||||
float* restrict y0;
|
||||
|
||||
__m256 alphav;
|
||||
__m256 xv[10];
|
||||
__m256 yv[10];
|
||||
__m256 zv[10];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav = _mm256_broadcast_ss( alpha );
|
||||
|
||||
for ( i = 0; (i + 79) < n; i += 80 )
|
||||
{
|
||||
// 80 elements will be processed per loop; 10 FMAs will run per loop.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
|
||||
xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
|
||||
xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
|
||||
yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
|
||||
yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
|
||||
yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
|
||||
yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
|
||||
yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
|
||||
zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );
|
||||
zv[5] = _mm256_fmadd_ps( xv[5], alphav, yv[5] );
|
||||
zv[6] = _mm256_fmadd_ps( xv[6], alphav, yv[6] );
|
||||
zv[7] = _mm256_fmadd_ps( xv[7], alphav, yv[7] );
|
||||
zv[8] = _mm256_fmadd_ps( xv[8], alphav, yv[8] );
|
||||
zv[9] = _mm256_fmadd_ps( xv[9], alphav, yv[9] );
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
|
||||
_mm256_storeu_ps( (y0 + 5*n_elem_per_reg), zv[5] );
|
||||
_mm256_storeu_ps( (y0 + 6*n_elem_per_reg), zv[6] );
|
||||
_mm256_storeu_ps( (y0 + 7*n_elem_per_reg), zv[7] );
|
||||
_mm256_storeu_ps( (y0 + 8*n_elem_per_reg), zv[8] );
|
||||
_mm256_storeu_ps( (y0 + 9*n_elem_per_reg), zv[9] );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
y0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 39) < n; i += 40 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
|
||||
zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] );
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
y0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 31) < n; i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
|
||||
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
y0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
*y0 += (*alpha) * (*x0);
|
||||
|
||||
x0 += 1;
|
||||
y0 += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const float alphac = *alpha;
|
||||
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
const float x0c = *x0;
|
||||
|
||||
*y0 += alphac * x0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_daxpyv_zen_int10
|
||||
(
|
||||
conj_t conjx,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
|
||||
dim_t i;
|
||||
|
||||
double* restrict x0 = x;
|
||||
double* restrict y0 = y;
|
||||
|
||||
__m256d alphav;
|
||||
__m256d xv[10];
|
||||
__m256d yv[10];
|
||||
__m256d zv[10];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav = _mm256_broadcast_sd( alpha );
|
||||
|
||||
for ( i = 0; (i + 39) < n; i += 40 )
|
||||
{
|
||||
// 40 elements will be processed per loop; 10 FMAs will run per loop.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
|
||||
xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
|
||||
xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
|
||||
yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
|
||||
yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
|
||||
yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
|
||||
yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
|
||||
yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
|
||||
zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
|
||||
zv[5] = _mm256_fmadd_pd( xv[5], alphav, yv[5] );
|
||||
zv[6] = _mm256_fmadd_pd( xv[6], alphav, yv[6] );
|
||||
zv[7] = _mm256_fmadd_pd( xv[7], alphav, yv[7] );
|
||||
zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] );
|
||||
zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
|
||||
_mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] );
|
||||
_mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] );
|
||||
_mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] );
|
||||
_mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] );
|
||||
_mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
y0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 19) < n; i += 20 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
|
||||
zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
y0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
|
||||
zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
|
||||
zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; i + 7 < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
|
||||
zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; i + 3 < n; i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
|
||||
zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
|
||||
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
y0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
for ( ; i < n; i += 1 )
|
||||
{
|
||||
*y0 += (*alpha) * (*x0);
|
||||
|
||||
y0 += 1;
|
||||
x0 += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const double alphac = *alpha;
|
||||
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
const double x0c = *x0;
|
||||
|
||||
*y0 += alphac * x0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
296
kernels/zen/1/bli_dotv_zen_int.c
Normal file
296
kernels/zen/1/bli_dotv_zen_int.c
Normal file
@@ -0,0 +1,296 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sdotv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
float* restrict x0;
|
||||
float* restrict y0;
|
||||
float rho0;
|
||||
|
||||
v8sf_t rho0v, rho1v, rho2v, rho3v;
|
||||
v8sf_t x0v, y0v;
|
||||
v8sf_t x1v, y1v;
|
||||
v8sf_t x2v, y2v;
|
||||
v8sf_t x3v, y3v;
|
||||
|
||||
// If the vector dimension is zero, set rho to zero and return early.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(s,set0s)( *rho );
|
||||
return;
|
||||
}
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
// Initialize the local scalar rho1 to zero.
|
||||
PASTEMAC(s,set0s)( rho0 );
|
||||
|
||||
// Initialize the unrolled iterations' rho vectors to zero.
|
||||
rho0v.v = _mm256_setzero_ps();
|
||||
rho1v.v = _mm256_setzero_ps();
|
||||
rho2v.v = _mm256_setzero_ps();
|
||||
rho3v.v = _mm256_setzero_ps();
|
||||
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the x and y input vector elements.
|
||||
x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
|
||||
x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
|
||||
x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
|
||||
x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
|
||||
// Compute the element-wise product of the x and y vectors,
|
||||
// storing in the corresponding rho vectors.
|
||||
rho0v.v = _mm256_fmadd_ps( x0v.v, y0v.v, rho0v.v );
|
||||
rho1v.v = _mm256_fmadd_ps( x1v.v, y1v.v, rho1v.v );
|
||||
rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v );
|
||||
rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v );
|
||||
|
||||
x0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
y0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
}
|
||||
|
||||
// Accumulate the unrolled rho vectors into a single vector.
|
||||
rho0v.v += rho1v.v;
|
||||
rho0v.v += rho2v.v;
|
||||
rho0v.v += rho3v.v;
|
||||
|
||||
// Accumulate the final rho vector into a single scalar result.
|
||||
rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
|
||||
rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
const float x0c = *x0;
|
||||
const float y0c = *y0;
|
||||
|
||||
rho0 += x0c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
|
||||
// Copy the final result into the output variable.
|
||||
PASTEMAC(s,copys)( rho0, *rho );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_ddotv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
double* restrict x0;
|
||||
double* restrict y0;
|
||||
double rho0;
|
||||
|
||||
v4df_t rho0v, rho1v, rho2v, rho3v;
|
||||
v4df_t x0v, y0v;
|
||||
v4df_t x1v, y1v;
|
||||
v4df_t x2v, y2v;
|
||||
v4df_t x3v, y3v;
|
||||
|
||||
// If the vector dimension is zero, set rho to zero and return early.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(d,set0s)( *rho );
|
||||
return;
|
||||
}
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
// Initialize the local scalar rho1 to zero.
|
||||
PASTEMAC(d,set0s)( rho0 );
|
||||
|
||||
// Initialize the unrolled iterations' rho vectors to zero.
|
||||
rho0v.v = _mm256_setzero_pd();
|
||||
rho1v.v = _mm256_setzero_pd();
|
||||
rho2v.v = _mm256_setzero_pd();
|
||||
rho3v.v = _mm256_setzero_pd();
|
||||
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the x and y input vector elements.
|
||||
x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
|
||||
x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
|
||||
x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
|
||||
x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
|
||||
// Compute the element-wise product of the x and y vectors,
|
||||
// storing in the corresponding rho vectors.
|
||||
rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v );
|
||||
rho1v.v = _mm256_fmadd_pd( x1v.v, y1v.v, rho1v.v );
|
||||
rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v );
|
||||
rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v );
|
||||
|
||||
x0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
y0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
}
|
||||
|
||||
// Accumulate the unrolled rho vectors into a single vector.
|
||||
rho0v.v += rho1v.v;
|
||||
rho0v.v += rho2v.v;
|
||||
rho0v.v += rho3v.v;
|
||||
|
||||
// Accumulate the final rho vector into a single scalar result.
|
||||
rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
const double x0c = *x0;
|
||||
const double y0c = *y0;
|
||||
|
||||
rho0 += x0c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
|
||||
// Copy the final result into the output variable.
|
||||
PASTEMAC(d,copys)( rho0, *rho );
|
||||
}
|
||||
|
||||
440
kernels/zen/1/bli_dotv_zen_int10.c
Normal file
440
kernels/zen/1/bli_dotv_zen_int10.c
Normal file
@@ -0,0 +1,440 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sdotv_zen_int10
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict rho,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
|
||||
dim_t i;
|
||||
|
||||
float* restrict x0;
|
||||
float* restrict y0;
|
||||
|
||||
float rho0;
|
||||
|
||||
__m256 xv[10];
|
||||
__m256 yv[10];
|
||||
v8sf_t rhov[2];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(s,set0s)( *rho );
|
||||
return;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
PASTEMAC(s,set0s)( rho0 );
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
rhov[0].v = _mm256_setzero_ps();
|
||||
rhov[1].v = _mm256_setzero_ps();
|
||||
|
||||
for ( i = 0; (i + 79) < n; i += 80 )
|
||||
{
|
||||
// 80 elements will be processed per loop; 10 FMAs will run per loop.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
|
||||
xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
|
||||
xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
|
||||
yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg );
|
||||
yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg );
|
||||
yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg );
|
||||
yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg );
|
||||
yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[1].v );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
y0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 39) < n; i += 40 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[0].v );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
y0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 31) < n; i += 32 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[1].v );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v );
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v );
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
y0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
rhov[0].f[0] += x0[i] * y0[i];
|
||||
}
|
||||
|
||||
v8sf_t onev;
|
||||
|
||||
onev.v = _mm256_set1_ps( 1.0f );
|
||||
|
||||
rhov[0].v = _mm256_dp_ps( rhov[0].v, onev.v, 0xf1 );
|
||||
rhov[1].v = _mm256_dp_ps( rhov[1].v, onev.v, 0xf1 );
|
||||
|
||||
// Manually add the results from above to finish the sum.
|
||||
rho0 += rhov[0].f[0] + rhov[0].f[4];
|
||||
rho0 += rhov[1].f[0] + rhov[1].f[4];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
const float x0c = *x0;
|
||||
const float y0c = *y0;
|
||||
|
||||
rho0 += x0c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the final result into the output variable.
|
||||
PASTEMAC(s,copys)( rho0, *rho );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_ddotv_zen_int10
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict rho,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
|
||||
dim_t i;
|
||||
|
||||
double* restrict x0;
|
||||
double* restrict y0;
|
||||
|
||||
double rho0;
|
||||
|
||||
__m256d xv[10];
|
||||
__m256d yv[10];
|
||||
v4df_t rhov[2];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(d,set0s)( *rho );
|
||||
return;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
PASTEMAC(d,set0s)( rho0 );
|
||||
|
||||
if ( incx == 1 && incy == 1 )
|
||||
{
|
||||
rhov[0].v = _mm256_setzero_pd();
|
||||
rhov[1].v = _mm256_setzero_pd();
|
||||
|
||||
for ( i = 0; (i + 39) < n; i += 40 )
|
||||
{
|
||||
// 80 elements will be processed per loop; 10 FMAs will run per loop.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
|
||||
xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
|
||||
xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
|
||||
yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg );
|
||||
yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg );
|
||||
yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg );
|
||||
yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg );
|
||||
yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[1].v );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
y0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 19) < n; i += 20 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[0].v );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
y0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[1].v );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
y0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v );
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
y0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 3) < n; i += 4 )
|
||||
{
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
|
||||
rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v );
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
y0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
rhov[0].d[0] += x0[i] * y0[i];
|
||||
}
|
||||
|
||||
// Manually add the results from above to finish the sum.
|
||||
rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3];
|
||||
rho0 += rhov[1].d[0] + rhov[1].d[1] + rhov[1].d[2] + rhov[1].d[3];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
const double x0c = *x0;
|
||||
const double y0c = *y0;
|
||||
|
||||
rho0 += x0c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the final result into the output variable.
|
||||
PASTEMAC(d,copys)( rho0, *rho );
|
||||
}
|
||||
|
||||
308
kernels/zen/1/bli_dotxv_zen_int.c
Normal file
308
kernels/zen/1/bli_dotxv_zen_int.c
Normal file
@@ -0,0 +1,308 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sdotxv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
float* restrict beta,
|
||||
float* restrict rho,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
float* restrict x0;
|
||||
float* restrict y0;
|
||||
float rho0;
|
||||
|
||||
v8sf_t rho0v, rho1v, rho2v, rho3v;
|
||||
v8sf_t x0v, y0v;
|
||||
v8sf_t x1v, y1v;
|
||||
v8sf_t x2v, y2v;
|
||||
v8sf_t x3v, y3v;
|
||||
|
||||
// If beta is zero, initialize rho1 to zero instead of scaling
|
||||
// rho by beta (in case rho contains NaN or Inf).
|
||||
if ( PASTEMAC(s,eq0)( *beta ) )
|
||||
{
|
||||
PASTEMAC(s,set0s)( *rho );
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC(s,scals)( *beta, *rho );
|
||||
}
|
||||
|
||||
// If the vector dimension is zero, output rho and return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
// Initialize the unrolled iterations' rho vectors to zero.
|
||||
rho0v.v = _mm256_setzero_ps();
|
||||
rho1v.v = _mm256_setzero_ps();
|
||||
rho2v.v = _mm256_setzero_ps();
|
||||
rho3v.v = _mm256_setzero_ps();
|
||||
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the x and y input vector elements.
|
||||
x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
|
||||
x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
|
||||
|
||||
x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg );
|
||||
|
||||
x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg );
|
||||
|
||||
// Compute the element-wise product of the x and y vectors,
|
||||
// storing in the corresponding rho vectors.
|
||||
rho0v.v = _mm256_fmadd_ps( x0v.v, y0v.v, rho0v.v );
|
||||
rho1v.v = _mm256_fmadd_ps( x1v.v, y1v.v, rho1v.v );
|
||||
rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v );
|
||||
rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v );
|
||||
|
||||
x0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
y0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
}
|
||||
|
||||
// Accumulate the unrolled rho vectors into a single vector.
|
||||
rho0v.v += rho1v.v;
|
||||
rho0v.v += rho2v.v;
|
||||
rho0v.v += rho3v.v;
|
||||
|
||||
// Accumulate the final rho vector into a single scalar result.
|
||||
rho0 = rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
|
||||
rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
const float x0c = *x0;
|
||||
const float y0c = *y0;
|
||||
|
||||
rho0 += x0c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
|
||||
// Accumulate the final result into the output variable.
|
||||
PASTEMAC(s,axpys)( *alpha, rho0, *rho );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_ddotxv_zen_int
|
||||
(
|
||||
conj_t conjx,
|
||||
conj_t conjy,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
double* restrict beta,
|
||||
double* restrict rho,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
double* restrict x0;
|
||||
double* restrict y0;
|
||||
double rho0;
|
||||
|
||||
v4df_t rho0v, rho1v, rho2v, rho3v;
|
||||
v4df_t x0v, y0v;
|
||||
v4df_t x1v, y1v;
|
||||
v4df_t x2v, y2v;
|
||||
v4df_t x3v, y3v;
|
||||
|
||||
// If beta is zero, initialize rho1 to zero instead of scaling
|
||||
// rho by beta (in case rho contains NaN or Inf).
|
||||
if ( PASTEMAC(d,eq0)( *beta ) )
|
||||
{
|
||||
PASTEMAC(d,set0s)( *rho );
|
||||
}
|
||||
else
|
||||
{
|
||||
PASTEMAC(d,scals)( *beta, *rho );
|
||||
}
|
||||
|
||||
// If the vector dimension is zero, output rho and return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
y0 = y;
|
||||
|
||||
// Initialize the unrolled iterations' rho vectors to zero.
|
||||
rho0v.v = _mm256_setzero_pd();
|
||||
rho1v.v = _mm256_setzero_pd();
|
||||
rho2v.v = _mm256_setzero_pd();
|
||||
rho3v.v = _mm256_setzero_pd();
|
||||
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the x and y input vector elements.
|
||||
x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
|
||||
x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
|
||||
|
||||
x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
|
||||
|
||||
x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
|
||||
|
||||
// Compute the element-wise product of the x and y vectors,
|
||||
// storing in the corresponding rho vectors.
|
||||
rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v );
|
||||
rho1v.v = _mm256_fmadd_pd( x1v.v, y1v.v, rho1v.v );
|
||||
rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v );
|
||||
rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v );
|
||||
|
||||
x0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
y0 += ( n_elem_per_reg * n_iter_unroll );
|
||||
}
|
||||
|
||||
// Accumulate the unrolled rho vectors into a single vector.
|
||||
rho0v.v += rho1v.v;
|
||||
rho0v.v += rho2v.v;
|
||||
rho0v.v += rho3v.v;
|
||||
|
||||
// Accumulate the final rho vector into a single scalar result.
|
||||
rho0 = rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
const double x0c = *x0;
|
||||
const double y0c = *y0;
|
||||
|
||||
rho0 += x0c * y0c;
|
||||
|
||||
x0 += incx;
|
||||
y0 += incy;
|
||||
}
|
||||
|
||||
// Accumulate the final result into the output variable.
|
||||
PASTEMAC(d,axpys)( *alpha, rho0, *rho );
|
||||
}
|
||||
|
||||
266
kernels/zen/1/bli_scalv_zen_int.c
Normal file
266
kernels/zen/1/bli_scalv_zen_int.c
Normal file
@@ -0,0 +1,266 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sscalv_zen_int
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
float* restrict x0;
|
||||
|
||||
v8sf_t alphav;
|
||||
v8sf_t x0v, x1v, x2v, x3v;
|
||||
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv (in case y contains NaN or Inf).
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) )
|
||||
{
|
||||
float* zero = bli_s0;
|
||||
ssetv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav.v = _mm256_broadcast_ss( alpha );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
x0v.v = _mm256_mul_ps( alphav.v, x0v.v );
|
||||
x1v.v = _mm256_mul_ps( alphav.v, x1v.v );
|
||||
x2v.v = _mm256_mul_ps( alphav.v, x2v.v );
|
||||
x3v.v = _mm256_mul_ps( alphav.v, x3v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), x0v.v );
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), x1v.v );
|
||||
_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), x2v.v );
|
||||
_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), x3v.v );
|
||||
|
||||
x0 += n_elem_per_reg * n_iter_unroll;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
const float alphac = *alpha;
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
x0 += incx;
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_dscalv_zen_int
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 4;
|
||||
|
||||
dim_t i;
|
||||
dim_t n_viter;
|
||||
dim_t n_left;
|
||||
|
||||
double* restrict x0;
|
||||
|
||||
v4df_t alphav;
|
||||
v4df_t x0v, x1v, x2v, x3v;
|
||||
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv (in case y contains NaN or Inf).
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) )
|
||||
{
|
||||
double* zero = bli_d0;
|
||||
dsetv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override n_viter and n_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( incx != 1 )
|
||||
{
|
||||
n_viter = 0;
|
||||
n_left = n;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav.v = _mm256_broadcast_sd( alpha );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < n_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
x0v.v = _mm256_mul_pd( alphav.v, x0v.v );
|
||||
x1v.v = _mm256_mul_pd( alphav.v, x1v.v );
|
||||
x2v.v = _mm256_mul_pd( alphav.v, x2v.v );
|
||||
x3v.v = _mm256_mul_pd( alphav.v, x3v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), x0v.v );
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), x1v.v );
|
||||
_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), x2v.v );
|
||||
_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), x3v.v );
|
||||
|
||||
x0 += n_elem_per_reg * n_iter_unroll;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
const double alphac = *alpha;
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < n_left; ++i )
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
x0 += incx;
|
||||
}
|
||||
}
|
||||
|
||||
448
kernels/zen/1/bli_scalv_zen_int10.c
Normal file
448
kernels/zen/1/bli_scalv_zen_int10.c
Normal file
@@ -0,0 +1,448 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sscalv_zen_int10
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
float* restrict alpha,
|
||||
float* restrict x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
|
||||
dim_t i;
|
||||
|
||||
float* restrict x0;
|
||||
|
||||
__m256 alphav;
|
||||
__m256 xv[10];
|
||||
__m256 zv[10];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv.
|
||||
if ( PASTEMAC(s,eq0)( *alpha ) )
|
||||
{
|
||||
float* zero = bli_s0;
|
||||
ssetv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
|
||||
if ( incx == 1 )
|
||||
{
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav = _mm256_broadcast_ss( alpha );
|
||||
|
||||
for ( i = 0; (i + 79) < n; i += 80 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg );
|
||||
xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg );
|
||||
xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_ps( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_ps( alphav, xv[1] );
|
||||
zv[2] = _mm256_mul_ps( alphav, xv[2] );
|
||||
zv[3] = _mm256_mul_ps( alphav, xv[3] );
|
||||
zv[4] = _mm256_mul_ps( alphav, xv[4] );
|
||||
zv[5] = _mm256_mul_ps( alphav, xv[5] );
|
||||
zv[6] = _mm256_mul_ps( alphav, xv[6] );
|
||||
zv[7] = _mm256_mul_ps( alphav, xv[7] );
|
||||
zv[8] = _mm256_mul_ps( alphav, xv[8] );
|
||||
zv[9] = _mm256_mul_ps( alphav, xv[9] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] );
|
||||
_mm256_storeu_ps( (x0 + 5*n_elem_per_reg), zv[5] );
|
||||
_mm256_storeu_ps( (x0 + 6*n_elem_per_reg), zv[6] );
|
||||
_mm256_storeu_ps( (x0 + 7*n_elem_per_reg), zv[7] );
|
||||
_mm256_storeu_ps( (x0 + 8*n_elem_per_reg), zv[8] );
|
||||
_mm256_storeu_ps( (x0 + 9*n_elem_per_reg), zv[9] );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 39) < n; i += 40 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_ps( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_ps( alphav, xv[1] );
|
||||
zv[2] = _mm256_mul_ps( alphav, xv[2] );
|
||||
zv[3] = _mm256_mul_ps( alphav, xv[3] );
|
||||
zv[4] = _mm256_mul_ps( alphav, xv[4] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 31) < n; i += 32 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_ps( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_ps( alphav, xv[1] );
|
||||
zv[2] = _mm256_mul_ps( alphav, xv[2] );
|
||||
zv[3] = _mm256_mul_ps( alphav, xv[3] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_ps( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_ps( alphav, xv[1] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_ps( alphav, xv[0] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
*x0 *= *alpha;
|
||||
|
||||
x0 += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const float alphac = *alpha;
|
||||
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
x0 += incx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_dscalv_zen_int10
|
||||
(
|
||||
conj_t conjalpha,
|
||||
dim_t n,
|
||||
double* restrict alpha,
|
||||
double* restrict x, inc_t incx,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
|
||||
dim_t i;
|
||||
|
||||
double* restrict x0;
|
||||
|
||||
__m256d alphav;
|
||||
__m256d xv[10];
|
||||
__m256d zv[10];
|
||||
|
||||
// If the vector dimension is zero, or if alpha is unit, return early.
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return;
|
||||
|
||||
// If alpha is zero, use setv.
|
||||
if ( PASTEMAC(d,eq0)( *alpha ) )
|
||||
{
|
||||
double* zero = bli_d0;
|
||||
dsetv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx );
|
||||
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n,
|
||||
zero,
|
||||
x, incx,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Initialize local pointers.
|
||||
x0 = x;
|
||||
|
||||
if ( incx == 1 )
|
||||
{
|
||||
// Broadcast the alpha scalar to all elements of a vector register.
|
||||
alphav = _mm256_broadcast_sd( alpha );
|
||||
|
||||
for ( i = 0; (i + 39) < n; i += 40 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg );
|
||||
xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg );
|
||||
xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg );
|
||||
xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg );
|
||||
xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_pd( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_pd( alphav, xv[1] );
|
||||
zv[2] = _mm256_mul_pd( alphav, xv[2] );
|
||||
zv[3] = _mm256_mul_pd( alphav, xv[3] );
|
||||
zv[4] = _mm256_mul_pd( alphav, xv[4] );
|
||||
zv[5] = _mm256_mul_pd( alphav, xv[5] );
|
||||
zv[6] = _mm256_mul_pd( alphav, xv[6] );
|
||||
zv[7] = _mm256_mul_pd( alphav, xv[7] );
|
||||
zv[8] = _mm256_mul_pd( alphav, xv[8] );
|
||||
zv[9] = _mm256_mul_pd( alphav, xv[9] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] );
|
||||
_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), zv[5] );
|
||||
_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), zv[6] );
|
||||
_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), zv[7] );
|
||||
_mm256_storeu_pd( (x0 + 8*n_elem_per_reg), zv[8] );
|
||||
_mm256_storeu_pd( (x0 + 9*n_elem_per_reg), zv[9] );
|
||||
|
||||
x0 += 10*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 19) < n; i += 20 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_pd( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_pd( alphav, xv[1] );
|
||||
zv[2] = _mm256_mul_pd( alphav, xv[2] );
|
||||
zv[3] = _mm256_mul_pd( alphav, xv[3] );
|
||||
zv[4] = _mm256_mul_pd( alphav, xv[4] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
|
||||
_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] );
|
||||
|
||||
x0 += 5*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 15) < n; i += 16 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
|
||||
xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_pd( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_pd( alphav, xv[1] );
|
||||
zv[2] = _mm256_mul_pd( alphav, xv[2] );
|
||||
zv[3] = _mm256_mul_pd( alphav, xv[3] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] );
|
||||
_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] );
|
||||
|
||||
x0 += 4*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 7) < n; i += 8 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_pd( alphav, xv[0] );
|
||||
zv[1] = _mm256_mul_pd( alphav, xv[1] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] );
|
||||
|
||||
x0 += 2*n_elem_per_reg;
|
||||
}
|
||||
|
||||
for ( ; (i + 3) < n; i += 4 )
|
||||
{
|
||||
// Load the input values.
|
||||
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
// perform : x := alpha * x;
|
||||
zv[0] = _mm256_mul_pd( alphav, xv[0] );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] );
|
||||
|
||||
x0 += 1*n_elem_per_reg;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
{
|
||||
*x0 *= *alpha;
|
||||
|
||||
x0 += 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const double alphac = *alpha;
|
||||
|
||||
for ( i = 0; i < n; ++i )
|
||||
{
|
||||
*x0 *= alphac;
|
||||
|
||||
x0 += incx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
485
kernels/zen/1f/bli_axpyf_zen_int_8.c
Normal file
485
kernels/zen/1f/bli_axpyf_zen_int_8.c
Normal file
@@ -0,0 +1,485 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_saxpyf_zen_int_8
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t fuse_fac = 8;
|
||||
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
dim_t i;
|
||||
dim_t m_viter;
|
||||
dim_t m_left;
|
||||
|
||||
float* restrict a0;
|
||||
float* restrict a1;
|
||||
float* restrict a2;
|
||||
float* restrict a3;
|
||||
float* restrict a4;
|
||||
float* restrict a5;
|
||||
float* restrict a6;
|
||||
float* restrict a7;
|
||||
|
||||
float* restrict y0;
|
||||
|
||||
v8sf_t chi0v, chi1v, chi2v, chi3v;
|
||||
v8sf_t chi4v, chi5v, chi6v, chi7v;
|
||||
|
||||
v8sf_t a0v, a1v, a2v, a3v;
|
||||
v8sf_t a4v, a5v, a6v, a7v;
|
||||
v8sf_t y0v;
|
||||
|
||||
float chi0, chi1, chi2, chi3;
|
||||
float chi4, chi5, chi6, chi7;
|
||||
|
||||
// If either dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim2( m, b_n ) || PASTEMAC(s,eq0)( *alpha ) ) return;
|
||||
|
||||
// If b_n is not equal to the fusing factor, then perform the entire
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
saxpyv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
float* a1 = a + (0 )*inca + (i )*lda;
|
||||
float* chi1 = x + (i )*incx;
|
||||
float* y1 = y + (0 )*incy;
|
||||
float alpha_chi1;
|
||||
|
||||
PASTEMAC(s,copycjs)( conjx, *chi1, alpha_chi1 );
|
||||
PASTEMAC(s,scals)( *alpha, alpha_chi1 );
|
||||
|
||||
f
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we know that b_n is exactly equal to the fusing factor.
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
m_left = ( m ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override m_viter and m_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( inca != 1 || incy != 1 )
|
||||
{
|
||||
m_viter = 0;
|
||||
m_left = m;
|
||||
}
|
||||
|
||||
a0 = a + 0*lda;
|
||||
a1 = a + 1*lda;
|
||||
a2 = a + 2*lda;
|
||||
a3 = a + 3*lda;
|
||||
a4 = a + 4*lda;
|
||||
a5 = a + 5*lda;
|
||||
a6 = a + 6*lda;
|
||||
a7 = a + 7*lda;
|
||||
y0 = y;
|
||||
|
||||
chi0 = *( x + 0*incx );
|
||||
chi1 = *( x + 1*incx );
|
||||
chi2 = *( x + 2*incx );
|
||||
chi3 = *( x + 3*incx );
|
||||
chi4 = *( x + 4*incx );
|
||||
chi5 = *( x + 5*incx );
|
||||
chi6 = *( x + 6*incx );
|
||||
chi7 = *( x + 7*incx );
|
||||
|
||||
// Scale each chi scalar by alpha.
|
||||
PASTEMAC(s,scals)( *alpha, chi0 );
|
||||
PASTEMAC(s,scals)( *alpha, chi1 );
|
||||
PASTEMAC(s,scals)( *alpha, chi2 );
|
||||
PASTEMAC(s,scals)( *alpha, chi3 );
|
||||
PASTEMAC(s,scals)( *alpha, chi4 );
|
||||
PASTEMAC(s,scals)( *alpha, chi5 );
|
||||
PASTEMAC(s,scals)( *alpha, chi6 );
|
||||
PASTEMAC(s,scals)( *alpha, chi7 );
|
||||
|
||||
// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
|
||||
chi0v.v = _mm256_broadcast_ss( &chi0 );
|
||||
chi1v.v = _mm256_broadcast_ss( &chi1 );
|
||||
chi2v.v = _mm256_broadcast_ss( &chi2 );
|
||||
chi3v.v = _mm256_broadcast_ss( &chi3 );
|
||||
chi4v.v = _mm256_broadcast_ss( &chi4 );
|
||||
chi5v.v = _mm256_broadcast_ss( &chi5 );
|
||||
chi6v.v = _mm256_broadcast_ss( &chi6 );
|
||||
chi7v.v = _mm256_broadcast_ss( &chi7 );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < m_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
|
||||
a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
|
||||
a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
|
||||
a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
|
||||
a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
|
||||
a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
|
||||
a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg );
|
||||
a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg );
|
||||
a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_ps( a0v.v, chi0v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a1v.v, chi1v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a2v.v, chi2v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a3v.v, chi3v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a4v.v, chi4v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a5v.v, chi5v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a6v.v, chi6v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( a7v.v, chi7v.v, y0v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
|
||||
y0 += n_elem_per_reg;
|
||||
a0 += n_elem_per_reg;
|
||||
a1 += n_elem_per_reg;
|
||||
a2 += n_elem_per_reg;
|
||||
a3 += n_elem_per_reg;
|
||||
a4 += n_elem_per_reg;
|
||||
a5 += n_elem_per_reg;
|
||||
a6 += n_elem_per_reg;
|
||||
a7 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the m_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < m_left ; ++i )
|
||||
{
|
||||
float y0c = *y0;
|
||||
|
||||
const float a0c = *a0;
|
||||
const float a1c = *a1;
|
||||
const float a2c = *a2;
|
||||
const float a3c = *a3;
|
||||
const float a4c = *a4;
|
||||
const float a5c = *a5;
|
||||
const float a6c = *a6;
|
||||
const float a7c = *a7;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
y0c += chi5 * a5c;
|
||||
y0c += chi6 * a6c;
|
||||
y0c += chi7 * a7c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += inca;
|
||||
a1 += inca;
|
||||
a2 += inca;
|
||||
a3 += inca;
|
||||
a4 += inca;
|
||||
a5 += inca;
|
||||
a6 += inca;
|
||||
a7 += inca;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_daxpyf_zen_int_8
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t fuse_fac = 8;
|
||||
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
dim_t i;
|
||||
dim_t m_viter;
|
||||
dim_t m_left;
|
||||
|
||||
double* restrict a0;
|
||||
double* restrict a1;
|
||||
double* restrict a2;
|
||||
double* restrict a3;
|
||||
double* restrict a4;
|
||||
double* restrict a5;
|
||||
double* restrict a6;
|
||||
double* restrict a7;
|
||||
|
||||
double* restrict y0;
|
||||
|
||||
v4df_t chi0v, chi1v, chi2v, chi3v;
|
||||
v4df_t chi4v, chi5v, chi6v, chi7v;
|
||||
|
||||
v4df_t a0v, a1v, a2v, a3v;
|
||||
v4df_t a4v, a5v, a6v, a7v;
|
||||
v4df_t y0v;
|
||||
|
||||
double chi0, chi1, chi2, chi3;
|
||||
double chi4, chi5, chi6, chi7;
|
||||
|
||||
// If either dimension is zero, or if alpha is zero, return early.
|
||||
if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return;
|
||||
|
||||
// If b_n is not equal to the fusing factor, then perform the entire
|
||||
// operation as a loop over axpyv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
daxpyv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* chi1 = x + (i )*incx;
|
||||
double* y1 = y + (0 )*incy;
|
||||
double alpha_chi1;
|
||||
|
||||
PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 );
|
||||
PASTEMAC(d,scals)( *alpha, alpha_chi1 );
|
||||
|
||||
f
|
||||
(
|
||||
conja,
|
||||
m,
|
||||
&alpha_chi1,
|
||||
a1, inca,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we know that b_n is exactly equal to the fusing factor.
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
m_left = ( m ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override m_viter and m_left to use scalar code
|
||||
// for all iterations.
|
||||
if ( inca != 1 || incy != 1 )
|
||||
{
|
||||
m_viter = 0;
|
||||
m_left = m;
|
||||
}
|
||||
|
||||
a0 = a + 0*lda;
|
||||
a1 = a + 1*lda;
|
||||
a2 = a + 2*lda;
|
||||
a3 = a + 3*lda;
|
||||
a4 = a + 4*lda;
|
||||
a5 = a + 5*lda;
|
||||
a6 = a + 6*lda;
|
||||
a7 = a + 7*lda;
|
||||
y0 = y;
|
||||
|
||||
chi0 = *( x + 0*incx );
|
||||
chi1 = *( x + 1*incx );
|
||||
chi2 = *( x + 2*incx );
|
||||
chi3 = *( x + 3*incx );
|
||||
chi4 = *( x + 4*incx );
|
||||
chi5 = *( x + 5*incx );
|
||||
chi6 = *( x + 6*incx );
|
||||
chi7 = *( x + 7*incx );
|
||||
|
||||
// Scale each chi scalar by alpha.
|
||||
PASTEMAC(d,scals)( *alpha, chi0 );
|
||||
PASTEMAC(d,scals)( *alpha, chi1 );
|
||||
PASTEMAC(d,scals)( *alpha, chi2 );
|
||||
PASTEMAC(d,scals)( *alpha, chi3 );
|
||||
PASTEMAC(d,scals)( *alpha, chi4 );
|
||||
PASTEMAC(d,scals)( *alpha, chi5 );
|
||||
PASTEMAC(d,scals)( *alpha, chi6 );
|
||||
PASTEMAC(d,scals)( *alpha, chi7 );
|
||||
|
||||
// Broadcast the (alpha*chi?) scalars to all elements of vector registers.
|
||||
chi0v.v = _mm256_broadcast_sd( &chi0 );
|
||||
chi1v.v = _mm256_broadcast_sd( &chi1 );
|
||||
chi2v.v = _mm256_broadcast_sd( &chi2 );
|
||||
chi3v.v = _mm256_broadcast_sd( &chi3 );
|
||||
chi4v.v = _mm256_broadcast_sd( &chi4 );
|
||||
chi5v.v = _mm256_broadcast_sd( &chi5 );
|
||||
chi6v.v = _mm256_broadcast_sd( &chi6 );
|
||||
chi7v.v = _mm256_broadcast_sd( &chi7 );
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < m_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
|
||||
a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
|
||||
a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
|
||||
a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
|
||||
a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
|
||||
a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
|
||||
a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg );
|
||||
a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg );
|
||||
a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg );
|
||||
|
||||
// perform : y += alpha * x;
|
||||
y0v.v = _mm256_fmadd_pd( a0v.v, chi0v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a1v.v, chi1v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a2v.v, chi2v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a3v.v, chi3v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a4v.v, chi4v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a5v.v, chi5v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a6v.v, chi6v.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_pd( a7v.v, chi7v.v, y0v.v );
|
||||
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v );
|
||||
|
||||
y0 += n_elem_per_reg;
|
||||
a0 += n_elem_per_reg;
|
||||
a1 += n_elem_per_reg;
|
||||
a2 += n_elem_per_reg;
|
||||
a3 += n_elem_per_reg;
|
||||
a4 += n_elem_per_reg;
|
||||
a5 += n_elem_per_reg;
|
||||
a6 += n_elem_per_reg;
|
||||
a7 += n_elem_per_reg;
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the m_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < m_left ; ++i )
|
||||
{
|
||||
double y0c = *y0;
|
||||
|
||||
const double a0c = *a0;
|
||||
const double a1c = *a1;
|
||||
const double a2c = *a2;
|
||||
const double a3c = *a3;
|
||||
const double a4c = *a4;
|
||||
const double a5c = *a5;
|
||||
const double a6c = *a6;
|
||||
const double a7c = *a7;
|
||||
|
||||
y0c += chi0 * a0c;
|
||||
y0c += chi1 * a1c;
|
||||
y0c += chi2 * a2c;
|
||||
y0c += chi3 * a3c;
|
||||
y0c += chi4 * a4c;
|
||||
y0c += chi5 * a5c;
|
||||
y0c += chi6 * a6c;
|
||||
y0c += chi7 * a7c;
|
||||
|
||||
*y0 = y0c;
|
||||
|
||||
a0 += inca;
|
||||
a1 += inca;
|
||||
a2 += inca;
|
||||
a3 += inca;
|
||||
a4 += inca;
|
||||
a5 += inca;
|
||||
a6 += inca;
|
||||
a7 += inca;
|
||||
y0 += incy;
|
||||
}
|
||||
}
|
||||
|
||||
702
kernels/zen/1f/bli_dotxf_zen_int_8.c
Normal file
702
kernels/zen/1f/bli_dotxf_zen_int_8.c
Normal file
@@ -0,0 +1,702 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "immintrin.h"
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
One 256-bit AVX register holds 8 SP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256 v;
|
||||
float f[8] __attribute__((aligned(64)));
|
||||
} v8sf_t;
|
||||
|
||||
/* Union data structure to access AVX registers
|
||||
* One 256-bit AVX register holds 4 DP elements. */
|
||||
typedef union
|
||||
{
|
||||
__m256d v;
|
||||
double d[4] __attribute__((aligned(64)));
|
||||
} v4df_t;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_sdotxf_zen_int_8
|
||||
(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
float* restrict alpha,
|
||||
float* restrict a, inc_t inca, inc_t lda,
|
||||
float* restrict x, inc_t incx,
|
||||
float* restrict beta,
|
||||
float* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t fuse_fac = 8;
|
||||
|
||||
const dim_t n_elem_per_reg = 8;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
dim_t i;
|
||||
dim_t m_viter;
|
||||
dim_t m_left;
|
||||
|
||||
float* restrict a0;
|
||||
float* restrict a1;
|
||||
float* restrict a2;
|
||||
float* restrict a3;
|
||||
float* restrict a4;
|
||||
float* restrict a5;
|
||||
float* restrict a6;
|
||||
float* restrict a7;
|
||||
|
||||
float* restrict x0;
|
||||
|
||||
float rho0, rho1, rho2, rho3;
|
||||
float rho4, rho5, rho6, rho7;
|
||||
|
||||
v8sf_t a0v, a1v, a2v, a3v;
|
||||
v8sf_t a4v, a5v, a6v, a7v;
|
||||
|
||||
v8sf_t x0v;
|
||||
|
||||
v8sf_t rho0v, rho1v, rho2v, rho3v;
|
||||
v8sf_t rho4v, rho5v, rho6v, rho7v;
|
||||
|
||||
v8sf_t alphav;
|
||||
v8sf_t betav;
|
||||
v8sf_t y0v;
|
||||
|
||||
v8sf_t onev;
|
||||
|
||||
// If the b_n dimension is zero, y is empty and there is no computation.
|
||||
if ( bli_zero_dim1( b_n ) ) return;
|
||||
|
||||
// If the m dimension is zero, or if alpha is zero, the computation
|
||||
// simplifies to updating y.
|
||||
if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) )
|
||||
{
|
||||
sscalv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx );
|
||||
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// If b_n is not equal to the fusing factor, then perform the entire
|
||||
// operation as a loop over dotxv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
sdotxv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
float* a1 = a + (0 )*inca + (i )*lda;
|
||||
float* x1 = x + (0 )*incx;
|
||||
float* psi1 = y + (i )*incy;
|
||||
|
||||
f
|
||||
(
|
||||
conjat,
|
||||
conjx,
|
||||
m,
|
||||
alpha,
|
||||
a1, inca,
|
||||
x1, incx,
|
||||
beta,
|
||||
psi1,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we know that b_n is exactly equal to the fusing factor.
|
||||
// However, m may not be a multiple of the number of elements per vector.
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
m_left = ( m ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override m_viter and m_left to use scalar code
|
||||
// for all iterations. (NOTE: vector instructions can be used even if
|
||||
// the increment of the output vector, incy, is nonunit.)
|
||||
if ( inca != 1 || incx != 1 )
|
||||
{
|
||||
m_viter = 0;
|
||||
m_left = m;
|
||||
}
|
||||
|
||||
// Set up pointers for x and the b_n columns of A (rows of A^T).
|
||||
x0 = x;
|
||||
a0 = a + 0*lda;
|
||||
a1 = a + 1*lda;
|
||||
a2 = a + 2*lda;
|
||||
a3 = a + 3*lda;
|
||||
a4 = a + 4*lda;
|
||||
a5 = a + 5*lda;
|
||||
a6 = a + 6*lda;
|
||||
a7 = a + 7*lda;
|
||||
|
||||
// Initialize b_n rho vector accumulators to zero.
|
||||
rho0v.v = _mm256_setzero_ps();
|
||||
rho1v.v = _mm256_setzero_ps();
|
||||
rho2v.v = _mm256_setzero_ps();
|
||||
rho3v.v = _mm256_setzero_ps();
|
||||
rho4v.v = _mm256_setzero_ps();
|
||||
rho5v.v = _mm256_setzero_ps();
|
||||
rho6v.v = _mm256_setzero_ps();
|
||||
rho7v.v = _mm256_setzero_ps();
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < m_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
|
||||
|
||||
a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
|
||||
a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg );
|
||||
a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg );
|
||||
a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg );
|
||||
a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg );
|
||||
a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg );
|
||||
a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg );
|
||||
a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg );
|
||||
|
||||
// perform : rho?v += a?v * x0v;
|
||||
rho0v.v = _mm256_fmadd_ps( a0v.v, x0v.v, rho0v.v );
|
||||
rho1v.v = _mm256_fmadd_ps( a1v.v, x0v.v, rho1v.v );
|
||||
rho2v.v = _mm256_fmadd_ps( a2v.v, x0v.v, rho2v.v );
|
||||
rho3v.v = _mm256_fmadd_ps( a3v.v, x0v.v, rho3v.v );
|
||||
rho4v.v = _mm256_fmadd_ps( a4v.v, x0v.v, rho4v.v );
|
||||
rho5v.v = _mm256_fmadd_ps( a5v.v, x0v.v, rho5v.v );
|
||||
rho6v.v = _mm256_fmadd_ps( a6v.v, x0v.v, rho6v.v );
|
||||
rho7v.v = _mm256_fmadd_ps( a7v.v, x0v.v, rho7v.v );
|
||||
|
||||
x0 += n_elem_per_reg * n_iter_unroll;
|
||||
a0 += n_elem_per_reg * n_iter_unroll;
|
||||
a1 += n_elem_per_reg * n_iter_unroll;
|
||||
a2 += n_elem_per_reg * n_iter_unroll;
|
||||
a3 += n_elem_per_reg * n_iter_unroll;
|
||||
a4 += n_elem_per_reg * n_iter_unroll;
|
||||
a5 += n_elem_per_reg * n_iter_unroll;
|
||||
a6 += n_elem_per_reg * n_iter_unroll;
|
||||
a7 += n_elem_per_reg * n_iter_unroll;
|
||||
}
|
||||
|
||||
#if 0
|
||||
rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] +
|
||||
rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7];
|
||||
rho1 += rho1v.f[0] + rho1v.f[1] + rho1v.f[2] + rho1v.f[3] +
|
||||
rho1v.f[4] + rho1v.f[5] + rho1v.f[6] + rho1v.f[7];
|
||||
rho2 += rho2v.f[0] + rho2v.f[1] + rho2v.f[2] + rho2v.f[3] +
|
||||
rho2v.f[4] + rho2v.f[5] + rho2v.f[6] + rho2v.f[7];
|
||||
rho3 += rho3v.f[0] + rho3v.f[1] + rho3v.f[2] + rho3v.f[3] +
|
||||
rho3v.f[4] + rho3v.f[5] + rho3v.f[6] + rho3v.f[7];
|
||||
rho4 += rho4v.f[0] + rho4v.f[1] + rho4v.f[2] + rho4v.f[3] +
|
||||
rho4v.f[4] + rho4v.f[5] + rho4v.f[6] + rho4v.f[7];
|
||||
rho5 += rho5v.f[0] + rho5v.f[1] + rho5v.f[2] + rho5v.f[3] +
|
||||
rho5v.f[4] + rho5v.f[5] + rho5v.f[6] + rho5v.f[7];
|
||||
rho6 += rho6v.f[0] + rho6v.f[1] + rho6v.f[2] + rho6v.f[3] +
|
||||
rho6v.f[4] + rho6v.f[5] + rho6v.f[6] + rho6v.f[7];
|
||||
rho7 += rho7v.f[0] + rho7v.f[1] + rho7v.f[2] + rho7v.f[3] +
|
||||
rho7v.f[4] + rho7v.f[5] + rho7v.f[6] + rho7v.f[7];
|
||||
#else
|
||||
// Now we need to sum the elements within each vector.
|
||||
|
||||
onev.v = _mm256_set1_ps( 1.0f );
|
||||
|
||||
// Sum the elements of a given rho?v by dotting it with 1. The '1' in
|
||||
// '0xf1' stores the sum of the upper four and lower four values to
|
||||
// the low elements of each lane: elements 4 and 0, respectively. (The
|
||||
// 'f' in '0xf1' means include all four elements of each lane in the
|
||||
// summation.)
|
||||
rho0v.v = _mm256_dp_ps( rho0v.v, onev.v, 0xf1 );
|
||||
rho1v.v = _mm256_dp_ps( rho1v.v, onev.v, 0xf1 );
|
||||
rho2v.v = _mm256_dp_ps( rho2v.v, onev.v, 0xf1 );
|
||||
rho3v.v = _mm256_dp_ps( rho3v.v, onev.v, 0xf1 );
|
||||
rho4v.v = _mm256_dp_ps( rho4v.v, onev.v, 0xf1 );
|
||||
rho5v.v = _mm256_dp_ps( rho5v.v, onev.v, 0xf1 );
|
||||
rho6v.v = _mm256_dp_ps( rho6v.v, onev.v, 0xf1 );
|
||||
rho7v.v = _mm256_dp_ps( rho7v.v, onev.v, 0xf1 );
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the m_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// Manually add the results from above to finish the sum.
|
||||
rho0 = rho0v.f[0] + rho0v.f[4];
|
||||
rho1 = rho1v.f[0] + rho1v.f[4];
|
||||
rho2 = rho2v.f[0] + rho2v.f[4];
|
||||
rho3 = rho3v.f[0] + rho3v.f[4];
|
||||
rho4 = rho4v.f[0] + rho4v.f[4];
|
||||
rho5 = rho5v.f[0] + rho5v.f[4];
|
||||
rho6 = rho6v.f[0] + rho6v.f[4];
|
||||
rho7 = rho7v.f[0] + rho7v.f[4];
|
||||
#endif
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < m_left ; ++i )
|
||||
{
|
||||
const float x0c = *x0;
|
||||
|
||||
const float a0c = *a0;
|
||||
const float a1c = *a1;
|
||||
const float a2c = *a2;
|
||||
const float a3c = *a3;
|
||||
const float a4c = *a4;
|
||||
const float a5c = *a5;
|
||||
const float a6c = *a6;
|
||||
const float a7c = *a7;
|
||||
|
||||
rho0 += a0c * x0c;
|
||||
rho1 += a1c * x0c;
|
||||
rho2 += a2c * x0c;
|
||||
rho3 += a3c * x0c;
|
||||
rho4 += a4c * x0c;
|
||||
rho5 += a5c * x0c;
|
||||
rho6 += a6c * x0c;
|
||||
rho7 += a7c * x0c;
|
||||
|
||||
x0 += incx;
|
||||
a0 += inca;
|
||||
a1 += inca;
|
||||
a2 += inca;
|
||||
a3 += inca;
|
||||
a4 += inca;
|
||||
a5 += inca;
|
||||
a6 += inca;
|
||||
a7 += inca;
|
||||
}
|
||||
|
||||
// Insert the scalar rho values into a single vector.
|
||||
rho0v.f[0] = rho0;
|
||||
rho0v.f[1] = rho1;
|
||||
rho0v.f[2] = rho2;
|
||||
rho0v.f[3] = rho3;
|
||||
rho0v.f[4] = rho4;
|
||||
rho0v.f[5] = rho5;
|
||||
rho0v.f[6] = rho6;
|
||||
rho0v.f[7] = rho7;
|
||||
|
||||
// Broadcast the alpha scalar.
|
||||
alphav.v = _mm256_broadcast_ss( alpha );
|
||||
|
||||
// We know at this point that alpha is nonzero; however, beta may still
|
||||
// be zero. If beta is indeed zero, we must overwrite y rather than scale
|
||||
// by beta (in case y contains NaN or Inf).
|
||||
if ( PASTEMAC(s,eq0)( *beta ) )
|
||||
{
|
||||
// Apply alpha to the accumulated dot product in rho:
|
||||
// y := alpha * rho
|
||||
y0v.v = _mm256_mul_ps( alphav.v, rho0v.v );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Broadcast the beta scalar.
|
||||
betav.v = _mm256_broadcast_ss( beta );
|
||||
|
||||
// Load y.
|
||||
if ( incy == 1 )
|
||||
{
|
||||
y0v.v = _mm256_loadu_ps( y + 0*n_elem_per_reg );
|
||||
}
|
||||
else
|
||||
{
|
||||
y0v.f[0] = *(y + 0*incy); y0v.f[1] = *(y + 1*incy);
|
||||
y0v.f[2] = *(y + 2*incy); y0v.f[3] = *(y + 3*incy);
|
||||
y0v.f[4] = *(y + 4*incy); y0v.f[5] = *(y + 5*incy);
|
||||
y0v.f[6] = *(y + 6*incy); y0v.f[7] = *(y + 7*incy);
|
||||
}
|
||||
|
||||
// Apply beta to y and alpha to the accumulated dot product in rho:
|
||||
// y := beta * y + alpha * rho
|
||||
y0v.v = _mm256_mul_ps( betav.v, y0v.v );
|
||||
y0v.v = _mm256_fmadd_ps( alphav.v, rho0v.v, y0v.v );
|
||||
}
|
||||
|
||||
// Store the output.
|
||||
if ( incy == 1 )
|
||||
{
|
||||
_mm256_storeu_ps( (y + 0*n_elem_per_reg), y0v.v );
|
||||
}
|
||||
else
|
||||
{
|
||||
*(y + 0*incy) = y0v.f[0]; *(y + 1*incy) = y0v.f[1];
|
||||
*(y + 2*incy) = y0v.f[2]; *(y + 3*incy) = y0v.f[3];
|
||||
*(y + 4*incy) = y0v.f[4]; *(y + 5*incy) = y0v.f[5];
|
||||
*(y + 6*incy) = y0v.f[6]; *(y + 7*incy) = y0v.f[7];
|
||||
}
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions.
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_ddotxf_zen_int_8
|
||||
(
|
||||
conj_t conjat,
|
||||
conj_t conjx,
|
||||
dim_t m,
|
||||
dim_t b_n,
|
||||
double* restrict alpha,
|
||||
double* restrict a, inc_t inca, inc_t lda,
|
||||
double* restrict x, inc_t incx,
|
||||
double* restrict beta,
|
||||
double* restrict y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const dim_t fuse_fac = 8;
|
||||
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
const dim_t n_iter_unroll = 1;
|
||||
|
||||
dim_t i;
|
||||
dim_t m_viter;
|
||||
dim_t m_left;
|
||||
|
||||
double* restrict a0;
|
||||
double* restrict a1;
|
||||
double* restrict a2;
|
||||
double* restrict a3;
|
||||
double* restrict a4;
|
||||
double* restrict a5;
|
||||
double* restrict a6;
|
||||
double* restrict a7;
|
||||
|
||||
double* restrict x0;
|
||||
|
||||
double rho0, rho1, rho2, rho3;
|
||||
double rho4, rho5, rho6, rho7;
|
||||
|
||||
v4df_t a0v, a1v, a2v, a3v;
|
||||
v4df_t a4v, a5v, a6v, a7v;
|
||||
|
||||
v4df_t x0v;
|
||||
|
||||
v4df_t rho0v, rho1v, rho2v, rho3v;
|
||||
v4df_t rho4v, rho5v, rho6v, rho7v;
|
||||
|
||||
v4df_t alphav;
|
||||
v4df_t betav;
|
||||
v4df_t y0v, y1v;
|
||||
|
||||
// If the b_n dimension is zero, y is empty and there is no computation.
|
||||
if ( bli_zero_dim1( b_n ) ) return;
|
||||
|
||||
// If the m dimension is zero, or if alpha is zero, the computation
|
||||
// simplifies to updating y.
|
||||
if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) )
|
||||
{
|
||||
dscalv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx );
|
||||
|
||||
f
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
b_n,
|
||||
beta,
|
||||
y, incy,
|
||||
cntx
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// If b_n is not equal to the fusing factor, then perform the entire
|
||||
// operation as a loop over dotxv.
|
||||
if ( b_n != fuse_fac )
|
||||
{
|
||||
ddotxv_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx );
|
||||
|
||||
for ( i = 0; i < b_n; ++i )
|
||||
{
|
||||
double* a1 = a + (0 )*inca + (i )*lda;
|
||||
double* x1 = x + (0 )*incx;
|
||||
double* psi1 = y + (i )*incy;
|
||||
|
||||
f
|
||||
(
|
||||
conjat,
|
||||
conjx,
|
||||
m,
|
||||
alpha,
|
||||
a1, inca,
|
||||
x1, incx,
|
||||
beta,
|
||||
psi1,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we know that b_n is exactly equal to the fusing factor.
|
||||
// However, m may not be a multiple of the number of elements per vector.
|
||||
|
||||
// Use the unrolling factor and the number of elements per register
|
||||
// to compute the number of vectorized and leftover iterations.
|
||||
m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
m_left = ( m ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
// If there is anything that would interfere with our use of contiguous
|
||||
// vector loads/stores, override m_viter and m_left to use scalar code
|
||||
// for all iterations. (NOTE: vector instructions can be used even if
|
||||
// the increment of the output vector, incy, is nonunit.)
|
||||
if ( inca != 1 || incx != 1 )
|
||||
{
|
||||
m_viter = 0;
|
||||
m_left = m;
|
||||
}
|
||||
|
||||
// Set up pointers for x and the b_n columns of A (rows of A^T).
|
||||
x0 = x;
|
||||
a0 = a + 0*lda;
|
||||
a1 = a + 1*lda;
|
||||
a2 = a + 2*lda;
|
||||
a3 = a + 3*lda;
|
||||
a4 = a + 4*lda;
|
||||
a5 = a + 5*lda;
|
||||
a6 = a + 6*lda;
|
||||
a7 = a + 7*lda;
|
||||
|
||||
// Initialize b_n rho vector accumulators to zero.
|
||||
rho0v.v = _mm256_setzero_pd();
|
||||
rho1v.v = _mm256_setzero_pd();
|
||||
rho2v.v = _mm256_setzero_pd();
|
||||
rho3v.v = _mm256_setzero_pd();
|
||||
rho4v.v = _mm256_setzero_pd();
|
||||
rho5v.v = _mm256_setzero_pd();
|
||||
rho6v.v = _mm256_setzero_pd();
|
||||
rho7v.v = _mm256_setzero_pd();
|
||||
|
||||
// If there are vectorized iterations, perform them with vector
|
||||
// instructions.
|
||||
for ( i = 0; i < m_viter; ++i )
|
||||
{
|
||||
// Load the input values.
|
||||
x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
|
||||
|
||||
a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg );
|
||||
a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg );
|
||||
a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg );
|
||||
a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg );
|
||||
a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg );
|
||||
a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg );
|
||||
a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg );
|
||||
a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg );
|
||||
|
||||
// perform : rho?v += a?v * x0v;
|
||||
rho0v.v = _mm256_fmadd_pd( a0v.v, x0v.v, rho0v.v );
|
||||
rho1v.v = _mm256_fmadd_pd( a1v.v, x0v.v, rho1v.v );
|
||||
rho2v.v = _mm256_fmadd_pd( a2v.v, x0v.v, rho2v.v );
|
||||
rho3v.v = _mm256_fmadd_pd( a3v.v, x0v.v, rho3v.v );
|
||||
rho4v.v = _mm256_fmadd_pd( a4v.v, x0v.v, rho4v.v );
|
||||
rho5v.v = _mm256_fmadd_pd( a5v.v, x0v.v, rho5v.v );
|
||||
rho6v.v = _mm256_fmadd_pd( a6v.v, x0v.v, rho6v.v );
|
||||
rho7v.v = _mm256_fmadd_pd( a7v.v, x0v.v, rho7v.v );
|
||||
|
||||
x0 += n_elem_per_reg * n_iter_unroll;
|
||||
a0 += n_elem_per_reg * n_iter_unroll;
|
||||
a1 += n_elem_per_reg * n_iter_unroll;
|
||||
a2 += n_elem_per_reg * n_iter_unroll;
|
||||
a3 += n_elem_per_reg * n_iter_unroll;
|
||||
a4 += n_elem_per_reg * n_iter_unroll;
|
||||
a5 += n_elem_per_reg * n_iter_unroll;
|
||||
a6 += n_elem_per_reg * n_iter_unroll;
|
||||
a7 += n_elem_per_reg * n_iter_unroll;
|
||||
}
|
||||
|
||||
#if 0
|
||||
rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3];
|
||||
rho1 += rho1v.d[0] + rho1v.d[1] + rho1v.d[2] + rho1v.d[3];
|
||||
rho2 += rho2v.d[0] + rho2v.d[1] + rho2v.d[2] + rho2v.d[3];
|
||||
rho3 += rho3v.d[0] + rho3v.d[1] + rho3v.d[2] + rho3v.d[3];
|
||||
rho4 += rho4v.d[0] + rho4v.d[1] + rho4v.d[2] + rho4v.d[3];
|
||||
rho5 += rho5v.d[0] + rho5v.d[1] + rho5v.d[2] + rho5v.d[3];
|
||||
rho6 += rho6v.d[0] + rho6v.d[1] + rho6v.d[2] + rho6v.d[3];
|
||||
rho7 += rho7v.d[0] + rho7v.d[1] + rho7v.d[2] + rho7v.d[3];
|
||||
#else
|
||||
// Sum the elements of a given rho?v. This computes the sum of
|
||||
// elements within lanes and stores the sum to both elements.
|
||||
rho0v.v = _mm256_hadd_pd( rho0v.v, rho0v.v );
|
||||
rho1v.v = _mm256_hadd_pd( rho1v.v, rho1v.v );
|
||||
rho2v.v = _mm256_hadd_pd( rho2v.v, rho2v.v );
|
||||
rho3v.v = _mm256_hadd_pd( rho3v.v, rho3v.v );
|
||||
rho4v.v = _mm256_hadd_pd( rho4v.v, rho4v.v );
|
||||
rho5v.v = _mm256_hadd_pd( rho5v.v, rho5v.v );
|
||||
rho6v.v = _mm256_hadd_pd( rho6v.v, rho6v.v );
|
||||
rho7v.v = _mm256_hadd_pd( rho7v.v, rho7v.v );
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the m_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// Manually add the results from above to finish the sum.
|
||||
rho0 = rho0v.d[0] + rho0v.d[2];
|
||||
rho1 = rho1v.d[0] + rho1v.d[2];
|
||||
rho2 = rho2v.d[0] + rho2v.d[2];
|
||||
rho3 = rho3v.d[0] + rho3v.d[2];
|
||||
rho4 = rho4v.d[0] + rho4v.d[2];
|
||||
rho5 = rho5v.d[0] + rho5v.d[2];
|
||||
rho6 = rho6v.d[0] + rho6v.d[2];
|
||||
rho7 = rho7v.d[0] + rho7v.d[2];
|
||||
#endif
|
||||
|
||||
// If there are leftover iterations, perform them with scalar code.
|
||||
for ( i = 0; i < m_left ; ++i )
|
||||
{
|
||||
const double x0c = *x0;
|
||||
|
||||
const double a0c = *a0;
|
||||
const double a1c = *a1;
|
||||
const double a2c = *a2;
|
||||
const double a3c = *a3;
|
||||
const double a4c = *a4;
|
||||
const double a5c = *a5;
|
||||
const double a6c = *a6;
|
||||
const double a7c = *a7;
|
||||
|
||||
rho0 += a0c * x0c;
|
||||
rho1 += a1c * x0c;
|
||||
rho2 += a2c * x0c;
|
||||
rho3 += a3c * x0c;
|
||||
rho4 += a4c * x0c;
|
||||
rho5 += a5c * x0c;
|
||||
rho6 += a6c * x0c;
|
||||
rho7 += a7c * x0c;
|
||||
|
||||
x0 += incx;
|
||||
a0 += inca;
|
||||
a1 += inca;
|
||||
a2 += inca;
|
||||
a3 += inca;
|
||||
a4 += inca;
|
||||
a5 += inca;
|
||||
a6 += inca;
|
||||
a7 += inca;
|
||||
}
|
||||
|
||||
// Insert the scalar rho values into a single vector.
|
||||
rho0v.d[0] = rho0;
|
||||
rho0v.d[1] = rho1;
|
||||
rho0v.d[2] = rho2;
|
||||
rho0v.d[3] = rho3;
|
||||
rho1v.d[0] = rho4;
|
||||
rho1v.d[1] = rho5;
|
||||
rho1v.d[2] = rho6;
|
||||
rho1v.d[3] = rho7;
|
||||
|
||||
// Broadcast the alpha scalar.
|
||||
alphav.v = _mm256_broadcast_sd( alpha );
|
||||
|
||||
// We know at this point that alpha is nonzero; however, beta may still
|
||||
// be zero. If beta is indeed zero, we must overwrite y rather than scale
|
||||
// by beta (in case y contains NaN or Inf).
|
||||
if ( PASTEMAC(d,eq0)( *beta ) )
|
||||
{
|
||||
// Apply alpha to the accumulated dot product in rho:
|
||||
// y := alpha * rho
|
||||
y0v.v = _mm256_mul_pd( alphav.v, rho0v.v );
|
||||
y1v.v = _mm256_mul_pd( alphav.v, rho1v.v );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Broadcast the beta scalar.
|
||||
betav.v = _mm256_broadcast_sd( beta );
|
||||
|
||||
// Load y.
|
||||
if ( incy == 1 )
|
||||
{
|
||||
y0v.v = _mm256_loadu_pd( y + 0*n_elem_per_reg );
|
||||
y1v.v = _mm256_loadu_pd( y + 1*n_elem_per_reg );
|
||||
}
|
||||
else
|
||||
{
|
||||
y0v.d[0] = *(y + 0*incy); y0v.d[1] = *(y + 1*incy);
|
||||
y0v.d[2] = *(y + 2*incy); y0v.d[3] = *(y + 3*incy);
|
||||
y1v.d[0] = *(y + 4*incy); y1v.d[1] = *(y + 5*incy);
|
||||
y1v.d[2] = *(y + 6*incy); y1v.d[3] = *(y + 7*incy);
|
||||
}
|
||||
|
||||
// Apply beta to y and alpha to the accumulated dot product in rho:
|
||||
// y := beta * y + alpha * rho
|
||||
y0v.v = _mm256_mul_pd( betav.v, y0v.v );
|
||||
y1v.v = _mm256_mul_pd( betav.v, y1v.v );
|
||||
y0v.v = _mm256_fmadd_pd( alphav.v, rho0v.v, y0v.v );
|
||||
y1v.v = _mm256_fmadd_pd( alphav.v, rho1v.v, y1v.v );
|
||||
}
|
||||
|
||||
if ( incy == 1 )
|
||||
{
|
||||
// Store the output.
|
||||
_mm256_storeu_pd( (y + 0*n_elem_per_reg), y0v.v );
|
||||
_mm256_storeu_pd( (y + 1*n_elem_per_reg), y1v.v );
|
||||
}
|
||||
else
|
||||
{
|
||||
*(y + 0*incy) = y0v.d[0]; *(y + 1*incy) = y0v.d[1];
|
||||
*(y + 2*incy) = y0v.d[2]; *(y + 3*incy) = y0v.d[3];
|
||||
*(y + 4*incy) = y1v.d[0]; *(y + 5*incy) = y1v.d[1];
|
||||
*(y + 6*incy) = y1v.d[2]; *(y + 7*incy) = y1v.d[3];
|
||||
}
|
||||
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions.
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
3737
kernels/zen/3/bli_gemm_small.c
Normal file
3737
kernels/zen/3/bli_gemm_small.c
Normal file
File diff suppressed because it is too large
Load Diff
2673
kernels/zen/3/bli_gemm_zen_asm_d6x8.c
Normal file
2673
kernels/zen/3/bli_gemm_zen_asm_d6x8.c
Normal file
File diff suppressed because it is too large
Load Diff
2261
kernels/zen/3/bli_gemm_zen_asm_d8x6.c
Normal file
2261
kernels/zen/3/bli_gemm_zen_asm_d8x6.c
Normal file
File diff suppressed because it is too large
Load Diff
1504
kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c
Normal file
1504
kernels/zen/3/bli_gemmtrsm_l_zen_asm_d6x8.c
Normal file
File diff suppressed because it is too large
Load Diff
1514
kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c
Normal file
1514
kernels/zen/3/bli_gemmtrsm_u_zen_asm_d6x8.c
Normal file
File diff suppressed because it is too large
Load Diff
101
kernels/zen/bli_kernels_zen.h
Normal file
101
kernels/zen/bli_kernels_zen.h
Normal file
@@ -0,0 +1,101 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// -- level-1v --
|
||||
|
||||
// amaxv (intrinsics)
|
||||
AMAXV_KER_PROT( float, s, amaxv_zen_int )
|
||||
AMAXV_KER_PROT( double, d, amaxv_zen_int )
|
||||
|
||||
// axpyv (intrinsics)
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int )
|
||||
AXPYV_KER_PROT( double, d, axpyv_zen_int )
|
||||
|
||||
// axpyv (intrinsics unrolled x10)
|
||||
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
|
||||
AXPYV_KER_PROT( double, d, axpyv_zen_int10 )
|
||||
|
||||
// dotv (intrinsics)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int )
|
||||
|
||||
// dotv (intrinsics, unrolled x10)
|
||||
DOTV_KER_PROT( float, s, dotv_zen_int10 )
|
||||
DOTV_KER_PROT( double, d, dotv_zen_int10 )
|
||||
|
||||
// dotxv (intrinsics)
|
||||
DOTXV_KER_PROT( float, s, dotxv_zen_int )
|
||||
DOTXV_KER_PROT( double, d, dotxv_zen_int )
|
||||
|
||||
// scalv (intrinsics)
|
||||
SCALV_KER_PROT( float, s, scalv_zen_int )
|
||||
SCALV_KER_PROT( double, d, scalv_zen_int )
|
||||
|
||||
// scalv (intrinsics unrolled x10)
|
||||
SCALV_KER_PROT( float, s, scalv_zen_int10 )
|
||||
SCALV_KER_PROT( double, d, scalv_zen_int10 )
|
||||
|
||||
// -- level-1f --
|
||||
|
||||
// axpyf (intrinsics)
|
||||
AXPYF_KER_PROT( float, s, axpyf_zen_int_8 )
|
||||
AXPYF_KER_PROT( double, d, axpyf_zen_int_8 )
|
||||
|
||||
// dotxf (intrinsics)
|
||||
DOTXF_KER_PROT( float, s, dotxf_zen_int_8 )
|
||||
DOTXF_KER_PROT( double, d, dotxf_zen_int_8 )
|
||||
|
||||
// -- level-3 --
|
||||
|
||||
// gemm (asm d6x8)
|
||||
GEMM_UKR_PROT( float, s, gemm_zen_asm_6x16 )
|
||||
GEMM_UKR_PROT( double, d, gemm_zen_asm_6x8 )
|
||||
GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_3x8 )
|
||||
GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_3x4 )
|
||||
|
||||
// gemmtrsm_l (asm d6x8)
|
||||
GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_zen_asm_6x16 )
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_zen_asm_6x8 )
|
||||
|
||||
// gemmtrsm_u (asm d6x8)
|
||||
GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_zen_asm_6x16 )
|
||||
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_zen_asm_6x8 )
|
||||
|
||||
|
||||
// gemm (asm d8x6)
|
||||
//GEMM_UKR_PROT( float, s, gemm_zen_asm_16x6 )
|
||||
//GEMM_UKR_PROT( double, d, gemm_zen_asm_8x6 )
|
||||
//GEMM_UKR_PROT( scomplex, c, gemm_zen_asm_8x3 )
|
||||
//GEMM_UKR_PROT( dcomplex, z, gemm_zen_asm_4x3 )
|
||||
|
||||
@@ -56,14 +56,19 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
ctype_r chi1_i; \
|
||||
ctype_r abs_chi1; \
|
||||
ctype_r abs_chi1_max; \
|
||||
dim_t i_max_l; \
|
||||
dim_t i; \
|
||||
\
|
||||
/* Initialize the index of the maximum absolute value to zero. */ \
|
||||
PASTEMAC(i,copys)( zero_i, *i_max ); \
|
||||
\
|
||||
/* If the vector length is zero, return early. This directly emulates
|
||||
the behavior of netlib BLAS's i?amax() routines. */ \
|
||||
if ( bli_zero_dim1( n ) ) return; \
|
||||
if ( bli_zero_dim1( n ) ) \
|
||||
{ \
|
||||
PASTEMAC(i,copys)( *zero_i, *i_max ); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* Initialize the index of the maximum absolute value to zero. */ \
|
||||
PASTEMAC(i,copys)( *zero_i, i_max_l ); \
|
||||
\
|
||||
/* Initialize the maximum absolute value search candidate with
|
||||
-1, which is guaranteed to be less than all values we will
|
||||
@@ -72,10 +77,12 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
\
|
||||
if ( incx == 1 ) \
|
||||
{ \
|
||||
ctype* chi1 = x; \
|
||||
\
|
||||
for ( i = 0; i < n; ++i ) \
|
||||
{ \
|
||||
/* Get the real and imaginary components of chi1. */ \
|
||||
PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \
|
||||
PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
|
||||
\
|
||||
/* Replace chi1_r and chi1_i with their absolute values. */ \
|
||||
PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \
|
||||
@@ -94,8 +101,10 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
|
||||
{ \
|
||||
abs_chi1_max = abs_chi1; \
|
||||
*i_max = i; \
|
||||
i_max_l = i; \
|
||||
} \
|
||||
\
|
||||
chi1 += 1; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
@@ -124,10 +133,13 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
|
||||
{ \
|
||||
abs_chi1_max = abs_chi1; \
|
||||
*i_max = i; \
|
||||
i_max_l = i; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Store the final index to the output variable. */ \
|
||||
PASTEMAC(i,copys)( i_max_l, *i_max ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC2( amaxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
@@ -66,7 +66,8 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
PASTEMAC(ch,scals)( *beta, *rho ); \
|
||||
} \
|
||||
\
|
||||
if ( bli_zero_dim1( n ) ) return; \
|
||||
/* If the vectors are empty or if alpha is zero, return early. */ \
|
||||
if ( bli_zero_dim1( n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
|
||||
\
|
||||
PASTEMAC(ch,set0s)( dotxy ); \
|
||||
\
|
||||
|
||||
@@ -712,6 +712,18 @@ void GENBAINAME(cntx_init)
|
||||
{
|
||||
const bool_t is_pb = FALSE;
|
||||
|
||||
// We MUST set the induced method in the context prior to calling
|
||||
// bli_cntx_l3_ukr_prefers_cols_dt() because that function queries
|
||||
// the induced method. It needs the induced method value in order
|
||||
// to determine whether to evaluate the "prefers column storage"
|
||||
// predicate using the storage preference of the kernel for dt, or
|
||||
// the storage preference of the kernel for the real projection of
|
||||
// dt. Failing to set the induced method here can lead to strange
|
||||
// undefined behavior at runtime if the native complex kernel's
|
||||
// storage preference happens to not equal that of the native real
|
||||
// kernel.
|
||||
bli_cntx_set_method( method, cntx );
|
||||
|
||||
// Initialize the blocksizes according to the micro-kernel preference as
|
||||
// well as the algorithm.
|
||||
if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -164,7 +165,9 @@ LDFLAGS += -lgfortran -lm -lpthread -fopenmp
|
||||
#all: blis openblas atlas mkl
|
||||
all: blis openblas mkl
|
||||
|
||||
blis: test_gemv_blis.x \
|
||||
blis: test_dotv_blis.x \
|
||||
test_axpyv_blis.x \
|
||||
test_gemv_blis.x \
|
||||
test_ger_blis.x \
|
||||
test_hemv_blis.x \
|
||||
test_her_blis.x \
|
||||
@@ -179,7 +182,10 @@ blis: test_gemv_blis.x \
|
||||
test_trmm_blis.x \
|
||||
test_trsm_blis.x
|
||||
|
||||
openblas: test_gemv_openblas.x \
|
||||
openblas: \
|
||||
test_dotv_openblas.x \
|
||||
test_axpyv_openblas.x \
|
||||
test_gemv_openblas.x \
|
||||
test_ger_openblas.x \
|
||||
test_hemv_openblas.x \
|
||||
test_her_openblas.x \
|
||||
@@ -194,7 +200,10 @@ openblas: test_gemv_openblas.x \
|
||||
test_trmm_openblas.x \
|
||||
test_trsm_openblas.x
|
||||
|
||||
atlas: test_gemv_atlas.x \
|
||||
atlas: \
|
||||
test_dotv_atlas.x \
|
||||
test_axpyv_atlas.x \
|
||||
test_gemv_atlas.x \
|
||||
test_ger_atlas.x \
|
||||
test_hemv_atlas.x \
|
||||
test_her_atlas.x \
|
||||
@@ -209,7 +218,9 @@ atlas: test_gemv_atlas.x \
|
||||
test_trmm_atlas.x \
|
||||
test_trsm_atlas.x
|
||||
|
||||
mkl: test_gemv_mkl.x \
|
||||
mkl: test_dotv_mkl.x \
|
||||
test_axpyv_mkl.x \
|
||||
test_gemv_mkl.x \
|
||||
test_ger_mkl.x \
|
||||
test_hemv_mkl.x \
|
||||
test_her_mkl.x \
|
||||
@@ -224,7 +235,9 @@ mkl: test_gemv_mkl.x \
|
||||
test_trmm_mkl.x \
|
||||
test_trsm_mkl.x
|
||||
|
||||
essl: test_gemv_essl.x \
|
||||
essl: test_dotv_essl.x \
|
||||
test_axpyv_essl.x \
|
||||
test_gemv_essl.x \
|
||||
test_ger_essl.x \
|
||||
test_hemv_essl.x \
|
||||
test_her_essl.x \
|
||||
@@ -239,7 +252,9 @@ essl: test_gemv_essl.x \
|
||||
test_trmm_essl.x \
|
||||
test_trsm_essl.x
|
||||
|
||||
mac: test_gemv_mac.x \
|
||||
mac: test_dotv_mac.x \
|
||||
test_axpyv_mac.x \
|
||||
test_gemv_mac.x \
|
||||
test_ger_mac.x \
|
||||
test_hemv_mac.x \
|
||||
test_her_mac.x \
|
||||
|
||||
192
test/test_axpyv.c
Normal file
192
test/test_axpyv.c
Normal file
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
|
||||
// n alpha x incx y incy
|
||||
//void daxpyv_( int*, double*, double*, int*, double*, int* );
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t x, y;
|
||||
obj_t y_save;
|
||||
obj_t alpha;
|
||||
dim_t n;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int n_input;
|
||||
num_t dt_x, dt_y;
|
||||
num_t dt_alpha;
|
||||
int r, n_repeats;
|
||||
num_t dt;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
bli_init();
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 40;
|
||||
p_end = 4000;
|
||||
p_inc = 40;
|
||||
|
||||
n_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
n_input = 15;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
dt = BLIS_FLOAT;
|
||||
//dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
|
||||
dt_x = dt_y = dt_alpha = dt;
|
||||
|
||||
for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
{
|
||||
|
||||
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
|
||||
bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha );
|
||||
|
||||
bli_obj_create( dt_x, n, 1, 0, 0, &x );
|
||||
bli_obj_create( dt_y, n, 1, 0, 0, &y );
|
||||
bli_obj_create( dt_y, n, 1, 0, 0, &y_save );
|
||||
|
||||
bli_randm( &x );
|
||||
bli_randm( &y );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
|
||||
bli_copym( &y, &y_save );
|
||||
|
||||
dtime_save = 1.0e9;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &y_save, &y );
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "alpha", &alpha, "%4.1f", "" );
|
||||
bli_printm( "x", &x, "%4.1f", "" );
|
||||
bli_printm( "y", &y, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_axpyv( &alpha,
|
||||
&x,
|
||||
&y );
|
||||
#else
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int nn = bli_obj_length( x );
|
||||
f77_int incx = bli_obj_vector_inc( x );
|
||||
f77_int incy = bli_obj_vector_inc( y );
|
||||
float* alphap = bli_obj_buffer( alpha );
|
||||
float* xp = bli_obj_buffer( x );
|
||||
float* yp = bli_obj_buffer( y );
|
||||
|
||||
saxpy_( &nn,
|
||||
alphap,
|
||||
xp, &incx,
|
||||
yp, &incy );
|
||||
|
||||
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
|
||||
f77_int nn = bli_obj_length( x );
|
||||
f77_int incx = bli_obj_vector_inc( x );
|
||||
f77_int incy = bli_obj_vector_inc( y );
|
||||
double* alphap = bli_obj_buffer( alpha );
|
||||
double* xp = bli_obj_buffer( x );
|
||||
double* yp = bli_obj_buffer( y );
|
||||
|
||||
daxpy_( &nn,
|
||||
alphap,
|
||||
xp, &incx,
|
||||
yp, &incy );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "y after", &y, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_axpyv_blis" );
|
||||
#else
|
||||
printf( "data_axpyv_%s", BLAS );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %10.3e %6.3f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )n, dtime_save, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
|
||||
bli_obj_free( &x );
|
||||
bli_obj_free( &y );
|
||||
bli_obj_free( &y_save );
|
||||
}
|
||||
|
||||
bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
174
test/test_dotv.c
Normal file
174
test/test_dotv.c
Normal file
@@ -0,0 +1,174 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
|
||||
// res n x incx y incy
|
||||
//double res = ddotv_( int*, double*, int*, double*, int* );
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t x, y;
|
||||
obj_t res;
|
||||
dim_t n;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int n_input;
|
||||
num_t dt_x, dt_y, dt_res;
|
||||
int r, n_repeats;
|
||||
num_t dt;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
bli_init();
|
||||
|
||||
n_repeats = 10;
|
||||
|
||||
#ifndef PRINT
|
||||
p_begin = 40;
|
||||
p_end = 4000;
|
||||
p_inc = 40;
|
||||
|
||||
n_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
n_input = 16;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
dt = BLIS_FLOAT;
|
||||
//dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
dt_x = dt_y = dt_res = dt;
|
||||
|
||||
for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
{
|
||||
|
||||
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
|
||||
bli_obj_create( dt_x, n, 1, 0, 0, &x );
|
||||
bli_obj_create( dt_y, n, 1, 0, 0, &y );
|
||||
bli_obj_create( dt_res, 1, 1, 0, 0, &res );
|
||||
|
||||
bli_randm( &x );
|
||||
bli_randm( &y );
|
||||
|
||||
dtime_save = 1.0e9;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "x", &x, "%4.1f", "" );
|
||||
bli_printm( "y", &y, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
bli_dotv( &x,
|
||||
&y,
|
||||
&res);
|
||||
#else
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int nn = bli_obj_length( x );
|
||||
f77_int incx = bli_obj_vector_inc( x );
|
||||
f77_int incy = bli_obj_vector_inc( y );
|
||||
float* xp = bli_obj_buffer( x );
|
||||
float* yp = bli_obj_buffer( y );
|
||||
float* resp = bli_obj_buffer( res );
|
||||
|
||||
*resp = sdot_( &nn,
|
||||
xp, &incx,
|
||||
yp, &incy );
|
||||
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
|
||||
f77_int nn = bli_obj_length( x );
|
||||
f77_int incx = bli_obj_vector_inc( x );
|
||||
f77_int incy = bli_obj_vector_inc( y );
|
||||
double* xp = bli_obj_buffer( x );
|
||||
double* yp = bli_obj_buffer( y );
|
||||
double* resp = bli_obj_buffer( res );
|
||||
|
||||
*resp = ddot_( &nn,
|
||||
xp, &incx,
|
||||
yp, &incy );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "res after", &res, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_dotv_blis" );
|
||||
#else
|
||||
printf( "data_dotv_%s", BLAS );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %10.3e %6.3f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )n, dtime_save, gflops );
|
||||
|
||||
bli_obj_free( &x );
|
||||
bli_obj_free( &y );
|
||||
bli_obj_free( &res );
|
||||
}
|
||||
|
||||
bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -351,14 +351,19 @@ void PASTEMAC(ch,varname) \
|
||||
ctype_r chi1_i; \
|
||||
ctype_r abs_chi1; \
|
||||
ctype_r abs_chi1_max; \
|
||||
dim_t i_max_l; \
|
||||
dim_t i; \
|
||||
\
|
||||
/* Initialize the index of the maximum absolute value to zero. */ \
|
||||
PASTEMAC(i,copys)( zero_i, *i_max ); \
|
||||
\
|
||||
/* If the vector length is zero, return early. This directly emulates
|
||||
the behavior of netlib BLAS's i?amax() routines. */ \
|
||||
if ( bli_zero_dim1( n ) ) return; \
|
||||
if ( bli_zero_dim1( n ) ) \
|
||||
{ \
|
||||
PASTEMAC(i,copys)( *zero_i, *i_max ); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
/* Initialize the index of the maximum absolute value to zero. */ \
|
||||
PASTEMAC(i,copys)( *zero_i, i_max_l ); \
|
||||
\
|
||||
/* Initialize the maximum absolute value search candidate with
|
||||
-1, which is guaranteed to be less than all values we will
|
||||
@@ -390,10 +395,13 @@ void PASTEMAC(ch,varname) \
|
||||
if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
|
||||
{ \
|
||||
abs_chi1_max = abs_chi1; \
|
||||
*i_max = i; \
|
||||
i_max_l = i; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Store the final index to the output variable. */ \
|
||||
PASTEMAC(i,copys)( i_max_l, *i_max ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC0( amaxv_test )
|
||||
|
||||
@@ -40,10 +40,10 @@
|
||||
static char* op_str = "dotxf";
|
||||
static char* o_types = "mvv"; // A x y
|
||||
static char* p_types = "cc"; // conjat conjx
|
||||
static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s
|
||||
static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 5e-04, 5e-05 }, // warn, pass for s
|
||||
{ 1e-04, 1e-05 }, // warn, pass for c
|
||||
{ 1e-13, 1e-14 }, // warn, pass for d
|
||||
{ 1e-13, 1e-14 } }; // warn, pass for z
|
||||
{ 1e-12, 1e-13 }, // warn, pass for d
|
||||
{ 1e-12, 1e-13 } }; // warn, pass for z
|
||||
|
||||
// Local prototypes.
|
||||
void libblis_test_dotxf_deps
|
||||
|
||||
Reference in New Issue
Block a user