mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Removed support for duplication.
Details:
- Removed support for duplication from the gemmtrsm/trsm micro-kernels and all framework code.
- Updated test suite modules according to the above changes.
This commit is contained in:
@@ -152,34 +152,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -265,10 +237,6 @@
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_8x8.h"
|
||||
|
||||
@@ -146,30 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -262,10 +238,6 @@
|
||||
//#include "bli_trsm_l_ref_4x4.h"
|
||||
//#include "bli_trsm_u_ref_4x4.h"
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_d4x4
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 1
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -261,10 +233,6 @@
|
||||
|
||||
#include "bli_gemm_opt_d4x4.h"
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_d4x4
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 8
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -259,10 +231,6 @@
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_opt_30x8.h"
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -259,10 +231,6 @@
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#include "bli_gemm_4x6.h"
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -259,10 +231,6 @@
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
//#define GEMM_UKERNEL gemm_ref_mxn
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -259,10 +231,6 @@
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_ref_mxn
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -261,10 +233,6 @@
|
||||
|
||||
#include "bli_gemm_opt_8x4_ref_u4_nodupl_avx1.h"
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_8x4_ref_u4_nodupl_avx1
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in a d-1 duplicates created within special macro-kernel
|
||||
// buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -265,10 +237,6 @@
|
||||
#include "bli_gemmtrsm_l_opt_mxn.h"
|
||||
#include "bli_gemmtrsm_u_opt_mxn.h"
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_opt_mxn
|
||||
|
||||
@@ -41,8 +41,7 @@ void bli_sgemmtrsm_l_opt_mxn(
|
||||
float* restrict alpha,
|
||||
float* restrict a10,
|
||||
float* restrict a11,
|
||||
float* restrict bd01,
|
||||
float* restrict bd11,
|
||||
float* restrict b01,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
@@ -58,7 +57,7 @@ void bli_sgemmtrsm_l_opt_mxn(
|
||||
bli_sgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
b01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
@@ -66,7 +65,6 @@ void bli_sgemmtrsm_l_opt_mxn(
|
||||
|
||||
bli_strsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -77,8 +75,7 @@ void bli_dgemmtrsm_l_opt_mxn(
|
||||
double* restrict alpha,
|
||||
double* restrict a10,
|
||||
double* restrict a11,
|
||||
double* restrict bd01,
|
||||
double* restrict bd11,
|
||||
double* restrict b01,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
@@ -171,19 +168,6 @@ void bli_dgemmtrsm_l_opt_mxn(
|
||||
|
||||
k MR
|
||||
|
||||
Thus, with duplication enabled, the operation takes the form of:
|
||||
|
||||
b11 = alpha * b11 - a10 * bd01;
|
||||
b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11;
|
||||
|
||||
And if duplication is disabled, the operation reduces to:
|
||||
|
||||
b11 = alpha * b11 - a10 * b01; (Note: Here, b01 == bd01.)
|
||||
b11 = inv(a11) * b11;
|
||||
c11 = b11;
|
||||
|
||||
A note on optimization:
|
||||
- This implementation simply calls the gemm micro-kernel and then the
|
||||
trsm micro-kernel. Let's assume that the gemm micro-kernel has already
|
||||
@@ -208,24 +192,20 @@ void bli_dgemmtrsm_l_opt_mxn(
|
||||
|
||||
double* restrict minus_one = bli_dm1;
|
||||
|
||||
/* Reminder: if duplication is disabled, then bd01 == b01, bd11 == b11. */
|
||||
|
||||
/* b11 = alpha * b11 - a10 * bd01; */
|
||||
/* b11 = alpha * b11 - a10 * b01; */
|
||||
bli_dgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
b01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */
|
||||
bli_dtrsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -236,8 +216,7 @@ void bli_cgemmtrsm_l_opt_mxn(
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a10,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd01,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b01,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
@@ -253,7 +232,7 @@ void bli_cgemmtrsm_l_opt_mxn(
|
||||
bli_cgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
b01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
@@ -261,7 +240,6 @@ void bli_cgemmtrsm_l_opt_mxn(
|
||||
|
||||
bli_ctrsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -272,8 +250,7 @@ void bli_zgemmtrsm_l_opt_mxn(
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a10,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd01,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b01,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
@@ -289,7 +266,7 @@ void bli_zgemmtrsm_l_opt_mxn(
|
||||
bli_zgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a10,
|
||||
bd01,
|
||||
b01,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
@@ -297,7 +274,6 @@ void bli_zgemmtrsm_l_opt_mxn(
|
||||
|
||||
bli_ztrsm_l_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b01, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -41,8 +41,7 @@ void bli_sgemmtrsm_u_opt_mxn(
|
||||
float* restrict alpha,
|
||||
float* restrict a12,
|
||||
float* restrict a11,
|
||||
float* restrict bd21,
|
||||
float* restrict bd11,
|
||||
float* restrict b21,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
@@ -58,7 +57,7 @@ void bli_sgemmtrsm_u_opt_mxn(
|
||||
bli_sgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
b21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
@@ -66,7 +65,6 @@ void bli_sgemmtrsm_u_opt_mxn(
|
||||
|
||||
bli_strsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -77,8 +75,7 @@ void bli_dgemmtrsm_u_opt_mxn(
|
||||
double* restrict alpha,
|
||||
double* restrict a12,
|
||||
double* restrict a11,
|
||||
double* restrict bd21,
|
||||
double* restrict bd11,
|
||||
double* restrict b21,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
@@ -207,24 +204,20 @@ void bli_dgemmtrsm_u_opt_mxn(
|
||||
|
||||
double* restrict minus_one = bli_dm1;
|
||||
|
||||
/* Reminder: if duplication is disabled, then bd21 == b21, bd11 == b11. */
|
||||
|
||||
/* b11 = alpha * b11 - a12 * bd21; */
|
||||
/* b11 = alpha * b11 - a12 * b21; */
|
||||
bli_dgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
b21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
b_next );
|
||||
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */
|
||||
bli_dtrsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -235,8 +228,7 @@ void bli_cgemmtrsm_u_opt_mxn(
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a12,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd21,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b21,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
@@ -252,7 +244,7 @@ void bli_cgemmtrsm_u_opt_mxn(
|
||||
bli_cgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
b21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
@@ -260,7 +252,6 @@ void bli_cgemmtrsm_u_opt_mxn(
|
||||
|
||||
bli_ctrsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -271,8 +262,7 @@ void bli_zgemmtrsm_u_opt_mxn(
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a12,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd21,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b21,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
@@ -288,7 +278,7 @@ void bli_zgemmtrsm_u_opt_mxn(
|
||||
bli_zgemm_opt_mxn( k,
|
||||
minus_one,
|
||||
a12,
|
||||
bd21,
|
||||
b21,
|
||||
alpha,
|
||||
b11, rs_b, cs_b,
|
||||
a_next,
|
||||
@@ -296,7 +286,6 @@ void bli_zgemmtrsm_u_opt_mxn(
|
||||
|
||||
bli_ztrsm_u_opt_mxn( a11,
|
||||
b11,
|
||||
bd11,
|
||||
c11, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b21, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -39,14 +39,12 @@
|
||||
void bli_strsm_l_opt_mxn(
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict bd,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_strsm_l_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -55,7 +53,6 @@ void bli_strsm_l_opt_mxn(
|
||||
void bli_dtrsm_l_opt_mxn(
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict bd,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
@@ -76,11 +73,6 @@ void bli_dtrsm_l_opt_mxn(
|
||||
where A11 is MR x MR and lower triangular, B11 is MR x NR, and C11 is
|
||||
MR x NR.
|
||||
|
||||
NOTE: Here, this trsm micro-kernel supports element "duplication", a
|
||||
feature that is enabled or disabled in bli_kernel.h. Duplication factors
|
||||
are also defined in the aforementioned header. Duplication is NOT
|
||||
commonly used and most developers may assume it is disabled.
|
||||
|
||||
Parameters:
|
||||
|
||||
- a11: The address of A11, which is the MR x MR lower triangular block
|
||||
@@ -89,8 +81,6 @@ void bli_dtrsm_l_opt_mxn(
|
||||
been inverted and the strictly upper triangle contains zeros.
|
||||
- b11: The address of B11, which is the MR x NR subpartition of the
|
||||
current packed (row-stored) micro-panel of B.
|
||||
- bd11: The address of the duplicated copy of B11. If duplication is
|
||||
disabled, then bd11 == b11.
|
||||
- c11: The address of C11, which is the MR x NR block of the output
|
||||
matrix (ie: the matrix provided by the user to the highest-level
|
||||
trsm API call). C11 corresponds to the elements that exist in
|
||||
@@ -110,12 +100,6 @@ void bli_dtrsm_l_opt_mxn(
|
||||
- Note that the diagonal of the triangular matrix A11 contains the INVERSE
|
||||
of those elements. This is done during packing so that we can avoid
|
||||
expensive division instructions within this micro-kernel.
|
||||
- This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
|
||||
then the result must be written to three places: the sub-block within the
|
||||
duplicated copy of the current micro-panel of B, the sub-block within the
|
||||
current packed micro-panel of B, and the sub-block of the output matrix C.
|
||||
When duplication is not used, the micro-kernel should update only the
|
||||
latter two locations.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
@@ -191,14 +175,12 @@ void bli_dtrsm_l_opt_mxn(
|
||||
void bli_ctrsm_l_opt_mxn(
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict bd,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_l_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -207,14 +189,12 @@ void bli_ctrsm_l_opt_mxn(
|
||||
void bli_ztrsm_l_opt_mxn(
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict bd,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_l_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
@@ -42,7 +42,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
|
||||
@@ -39,14 +39,12 @@
|
||||
void bli_strsm_u_opt_mxn(
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict bd,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_strsm_u_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -55,7 +53,6 @@ void bli_strsm_u_opt_mxn(
|
||||
void bli_dtrsm_u_opt_mxn(
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict bd,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
@@ -76,11 +73,6 @@ void bli_dtrsm_u_opt_mxn(
|
||||
where A11 is MR x MR and upper triangular, B11 is MR x NR, and C11 is
|
||||
MR x NR.
|
||||
|
||||
NOTE: Here, this trsm micro-kernel supports element "duplication", a
|
||||
feature that is enabled or disabled in bli_kernel.h. Duplication factors
|
||||
are also defined in the aforementioned header. Duplication is NOT
|
||||
commonly used and most developers may assume it is disabled.
|
||||
|
||||
Parameters:
|
||||
|
||||
- a11: The address of A11, which is the MR x MR upper triangular block
|
||||
@@ -89,8 +81,6 @@ void bli_dtrsm_u_opt_mxn(
|
||||
been inverted and the strictly lower triangle contains zeros.
|
||||
- b11: The address of B11, which is the MR x NR subpartition of the
|
||||
current packed (row-stored) micro-panel of B.
|
||||
- bd11: The address of the duplicated copy of B11. If duplication is
|
||||
disabled, then bd11 == b11.
|
||||
- c11: The address of C11, which is the MR x NR block of the output
|
||||
matrix (ie: the matrix provided by the user to the highest-level
|
||||
trsm API call). C11 corresponds to the elements that exist in
|
||||
@@ -110,12 +100,6 @@ void bli_dtrsm_u_opt_mxn(
|
||||
- Note that the diagonal of the triangular matrix A11 contains the INVERSE
|
||||
of those elements. This is done during packing so that we can avoid
|
||||
expensive division instructions within this micro-kernel.
|
||||
- This micro-kernel assumes duplication is NOT enabled. If it IS enabled,
|
||||
then the result must be written to three places: the sub-block within the
|
||||
duplicated copy of the current micro-panel of B, the sub-block within the
|
||||
current packed micro-panel of B, and the sub-block of the output matrix C.
|
||||
When duplication is not used, the micro-kernel should update only the
|
||||
latter two locations.
|
||||
|
||||
For more info, please refer to the BLIS website and/or contact the
|
||||
blis-devel mailing list.
|
||||
@@ -191,14 +175,12 @@ void bli_dtrsm_u_opt_mxn(
|
||||
void bli_ctrsm_u_opt_mxn(
|
||||
scomplex* restrict a,
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict bd,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ctrsm_u_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -207,14 +189,12 @@ void bli_ctrsm_u_opt_mxn(
|
||||
void bli_ztrsm_u_opt_mxn(
|
||||
dcomplex* restrict a,
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict bd,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
bli_ztrsm_u_ref_mxn( a,
|
||||
b,
|
||||
bd,
|
||||
c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
@@ -42,7 +42,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_axpyd( obj_t* alpha,
|
||||
dt_x = bli_obj_datatype( *x );
|
||||
|
||||
// Create an object to hold a copy-cast of alpha.
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
bli_axpyd_unb_var1( &alpha_local,
|
||||
x,
|
||||
|
||||
@@ -53,10 +53,10 @@ void bli_scal2d( obj_t* beta,
|
||||
dt_x = bli_obj_datatype( *x );
|
||||
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
bli_scal2d_unb_var1( &beta_local,
|
||||
x,
|
||||
|
||||
@@ -52,10 +52,10 @@ void bli_scald( obj_t* beta,
|
||||
dt_x = bli_obj_datatype( *x );
|
||||
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
bli_scald_unb_var1( &beta_local,
|
||||
x );
|
||||
|
||||
@@ -52,10 +52,10 @@ void bli_setd( obj_t* beta,
|
||||
dt_x = bli_obj_datatype( *x );
|
||||
|
||||
// Create an object to hold a copy-cast of beta.
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
bli_obj_init_scalar_copy_of( dt_x,
|
||||
BLIS_NO_CONJUGATE,
|
||||
beta,
|
||||
&beta_local );
|
||||
|
||||
bli_setd_unb_var1( &beta_local,
|
||||
x );
|
||||
|
||||
@@ -143,13 +143,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -160,8 +153,6 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
@@ -176,7 +167,6 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
dim_t k_nr; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
@@ -215,9 +205,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_nr = k * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
@@ -229,12 +216,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -243,11 +224,6 @@ void PASTEMAC(ch,varname)( \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -274,7 +250,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -285,7 +261,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -306,7 +282,7 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, b1, NR*NDUP, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
|
||||
@@ -143,12 +143,11 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
/* Temporary buffer for incremental packing of B. */ \
|
||||
ctype bp[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,nifac) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
@@ -226,12 +225,9 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* The current packed micro-panel of B will always be stored in bd. */ \
|
||||
bp = bd; \
|
||||
\
|
||||
/* Since we pack micro-panels of B incrementall, one at a time, the
|
||||
/* Since we pack micro-panels of B incrementally, one at a time, the
|
||||
address of the next micro-panel of B remains constant. */ \
|
||||
b2 = bd; \
|
||||
b2 = bp; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
|
||||
@@ -152,13 +152,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
guint_t t_id = omp_get_thread_num(); \
|
||||
guint_t n_threads = omp_get_num_threads(); \
|
||||
\
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
@@ -170,8 +163,6 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
@@ -186,7 +177,6 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
dim_t k_nr; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
@@ -217,9 +207,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_nr = k * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
@@ -231,12 +218,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = t_id; j < n_iter; j += n_threads ) \
|
||||
@@ -246,11 +227,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -272,7 +248,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -295,7 +271,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -318,11 +294,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* If duplication is needed, copy the n_left (+ padding) columns
|
||||
of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -342,7 +313,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -368,7 +339,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -384,7 +355,7 @@ void PASTEMAC(ch,varname)( \
|
||||
} /* end omp parallel */ \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, bp, NR, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
|
||||
@@ -1,377 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
|
||||
|
||||
|
||||
void bli_gemm_ker_var2( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
dim_t m = bli_obj_length( *c );
|
||||
dim_t n = bli_obj_width( *c );
|
||||
dim_t k = bli_obj_width( *a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( *a );
|
||||
inc_t rs_a = bli_obj_row_stride( *a );
|
||||
inc_t cs_a = bli_obj_col_stride( *a );
|
||||
inc_t ps_a = bli_obj_panel_stride( *a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( *b );
|
||||
inc_t rs_b = bli_obj_row_stride( *b );
|
||||
inc_t cs_b = bli_obj_col_stride( *b );
|
||||
inc_t ps_b = bli_obj_panel_stride( *b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
inc_t cs_c = bli_obj_col_stride( *c );
|
||||
|
||||
num_t dt_alpha;
|
||||
void* buf_alpha;
|
||||
|
||||
num_t dt_beta;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
/*
|
||||
// Handle the special case where c and a are complex and b is real.
|
||||
// Note that this is the ONLY case allowed by the inner kernel whereby
|
||||
// the datatypes of a and b differ. In this situation, the execution
|
||||
// datatype is real, so we need to inflate (by a factor of two):
|
||||
// - the m dimension,
|
||||
// - the column stride of c,
|
||||
// - the column stride (ie: the panel length) of a, and
|
||||
// - the panel stride of a.
|
||||
if ( bli_obj_is_complex( *a ) && bli_obj_is_real( *b ) )
|
||||
{
|
||||
m *= 2;
|
||||
cs_c *= 2;
|
||||
cs_a *= 2;
|
||||
ps_a *= 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// If alpha is a scalar constant, use dt_exec to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the alpha object and extract the buffer at the alpha offset.
|
||||
bli_set_scalar_dt_buffer( alpha, dt_exec, dt_alpha, buf_alpha );
|
||||
|
||||
// If beta is a scalar constant, use dt_exec to extract the address of the
|
||||
// corresponding constant value; otherwise, use the datatype encoded
|
||||
// within the beta object and extract the buffer at the beta offset.
|
||||
bli_set_scalar_dt_buffer( beta, dt_exec, dt_beta, buf_beta );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a, ps_a,
|
||||
buf_b, rs_b, cs_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, ukrname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,nr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC(ch,mr); \
|
||||
\
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
dim_t k_nr; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == GEMM_MR
|
||||
ps_a == stride to next row panel of A
|
||||
rs_b == GEMM_NR
|
||||
cs_b == 1
|
||||
ps_b == stride to next column panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_nr = k * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 && m_left == 0 ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( j == n_iter - 1 && n_left == 0 ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom edge handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( j == n_iter - 1 && n_left == 0 ) \
|
||||
b2 = b_cast; \
|
||||
\
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_left, NR, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
if ( n_left ) \
|
||||
{ \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* If duplication is needed, copy the n_left (+ padding) columns
|
||||
of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Right edge loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( i == m_iter - 1 && m_left == 0 ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the right edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( MR, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
/* Bottom-right corner handling. */ \
|
||||
if ( m_left ) \
|
||||
{ \
|
||||
/* Compute the address of the next panel of A. */ \
|
||||
a2 = a_cast; \
|
||||
b2 = b_cast; \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
\
|
||||
/* Scale the bottom-right corner of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_left, n_left, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: bd", k, NR*NDUP, bp, NR*NDUP, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm_ker_var2, GEMM_UKERNEL )
|
||||
|
||||
@@ -147,13 +147,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -164,8 +157,6 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const bool_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
@@ -181,7 +172,6 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t k_nr; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
@@ -245,9 +235,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_nr = k * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
@@ -259,12 +246,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -273,11 +254,6 @@ void PASTEMAC(ch,varname)( \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,7 +289,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -334,7 +310,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -345,7 +321,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
|
||||
@@ -147,13 +147,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -164,8 +157,6 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Alias some constants to shorter names. */ \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const bool_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
@@ -181,7 +172,6 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t k_nr; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
@@ -245,9 +235,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_nr = k * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
@@ -259,12 +246,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -273,11 +254,6 @@ void PASTEMAC(ch,varname)( \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,7 +289,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -334,7 +310,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -345,7 +321,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
|
||||
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -148,8 +141,6 @@ void PASTEMAC(ch,varname)( \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
@@ -162,7 +153,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict bp_i; \
|
||||
ctype* restrict b1_i; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
@@ -171,7 +162,6 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_a1011; \
|
||||
dim_t off_a1011; \
|
||||
dim_t i, j; \
|
||||
@@ -232,10 +222,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_a1011 = bli_min( k, diagoffa + m ); \
|
||||
k_nr = k_a1011 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * PACKMR; \
|
||||
@@ -247,12 +233,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -261,11 +241,6 @@ void PASTEMAC(ch,varname)( \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -285,11 +260,11 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
/* Determine the offset to and length of the panel that was
|
||||
packed so we can index into the corresponding location in
|
||||
bp. */ \
|
||||
b1. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bli_min( k, diagoffa_i + MR ); \
|
||||
\
|
||||
bp_i = bp + off_a1011 * NR * NDUP; \
|
||||
b1_i = b1 + off_a1011 * NR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1011 * PACKMR; \
|
||||
@@ -308,7 +283,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp_i, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -324,7 +299,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp_i, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -356,7 +331,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -367,7 +342,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
|
||||
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -148,8 +141,6 @@ void PASTEMAC(ch,varname)( \
|
||||
const dim_t MR = PASTEMAC(ch,mr); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
@@ -162,7 +153,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict bp_i; \
|
||||
ctype* restrict b1_i; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
@@ -171,7 +162,6 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_a1112; \
|
||||
dim_t off_a1112; \
|
||||
dim_t i, j; \
|
||||
@@ -240,10 +230,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_a1112 = k; \
|
||||
k_nr = k_a1112 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * PACKMR; \
|
||||
@@ -255,12 +241,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -269,11 +249,6 @@ void PASTEMAC(ch,varname)( \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -293,11 +268,11 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
/* Determine the offset to and length of the panel that was
|
||||
packed so we can index into the corresponding location in
|
||||
bp. */ \
|
||||
b1. */ \
|
||||
off_a1112 = bli_max( diagoffa_i, 0 ); \
|
||||
k_a1112 = k - off_a1112; \
|
||||
\
|
||||
bp_i = bp + off_a1112 * NR * NDUP; \
|
||||
b1_i = b1 + off_a1112 * NR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1112 * PACKMR; \
|
||||
@@ -316,7 +291,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_a1112, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp_i, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -332,7 +307,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_a1112, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp_i, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -364,7 +339,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -375,7 +350,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -397,7 +372,7 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm_lu_ker_var2, GEMM_UKERNEL )
|
||||
|
||||
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -149,8 +142,6 @@ void PASTEMAC(ch,varname)( \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t PACKNR = PASTEMAC(ch,packnr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
@@ -172,7 +163,6 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_b1121; \
|
||||
dim_t off_b1121; \
|
||||
dim_t i, j; \
|
||||
@@ -252,12 +242,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -273,12 +257,6 @@ void PASTEMAC(ch,varname)( \
|
||||
in A. Then compute the length of that panel. */ \
|
||||
off_b1121 = bli_max( -diagoffb_j, 0 ); \
|
||||
k_b1121 = k - off_b1121; \
|
||||
k_nr = k_b1121 * NR; \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,7 +291,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_b1121, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -329,7 +307,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_b1121, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -368,7 +346,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -379,7 +357,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -400,7 +378,7 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm_rl_ker_var2, GEMM_UKERNEL )
|
||||
|
||||
@@ -130,13 +130,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -149,8 +142,6 @@ void PASTEMAC(ch,varname)( \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t PACKNR = PASTEMAC(ch,packnr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
@@ -172,7 +163,6 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_b0111; \
|
||||
dim_t off_b0111; \
|
||||
dim_t i, j; \
|
||||
@@ -253,12 +243,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -273,12 +257,6 @@ void PASTEMAC(ch,varname)( \
|
||||
so we can index into the corresponding location in A. */ \
|
||||
off_b0111 = 0; \
|
||||
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
|
||||
k_nr = k_b0111 * NR; \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,7 +291,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_b0111, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -329,7 +307,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k_b0111, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
bp, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -368,7 +346,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -379,7 +357,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,ukrname)( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -400,7 +378,7 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, bp_i, NR, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm_ru_ker_var2, GEMM_UKERNEL )
|
||||
|
||||
@@ -140,12 +140,6 @@ void bli_trsm( side_t side,
|
||||
alpha,
|
||||
&alpha_local );
|
||||
|
||||
//
|
||||
// NOTE: we need to disable the use of the right-hand side control tree
|
||||
// if duplication is enabled since the trsm_r macrokernels do not support
|
||||
// duplication.
|
||||
//
|
||||
|
||||
// Choose the control tree.
|
||||
if ( bli_is_left( side ) ) cntl = trsm_l_cntl;
|
||||
else cntl = trsm_r_cntl;
|
||||
|
||||
@@ -119,13 +119,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -138,8 +131,6 @@ void PASTEMAC(ch,varname)( \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t PACKNR = PASTEMAC(ch,packnr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
@@ -151,11 +142,10 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a10; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict bp01; \
|
||||
ctype* restrict bp11; \
|
||||
ctype* restrict b01; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
@@ -164,7 +154,6 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_a1011; \
|
||||
dim_t k_a10; \
|
||||
dim_t off_a10; \
|
||||
@@ -237,10 +226,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_a1011 = bli_min( k, diagoffa + m ); \
|
||||
k_nr = k_a1011 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * PACKMR; \
|
||||
@@ -252,12 +237,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -266,11 +245,6 @@ void PASTEMAC(ch,varname)( \
|
||||
c11 = c1 + (0 )*rstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -298,16 +272,13 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the addresses of the panel A10 and the triangular
|
||||
block A11. */ \
|
||||
a10 = a1; \
|
||||
a11 = a1 + k_a10 * PACKMR; \
|
||||
a10 = a1; \
|
||||
a11 = a1 + k_a10 * PACKMR; \
|
||||
\
|
||||
/* Now compute the corresponding addresses in Bd. */ \
|
||||
bp01 = bp + off_a10 * NR * NDUP; \
|
||||
bp11 = bp + off_a11 * NR * NDUP; \
|
||||
\
|
||||
/* Index into b1 to locate the MR x NR block of b1 that will
|
||||
be updated by the trsm subproblem. */ \
|
||||
b11 = b1 + off_a11 * PACKNR; \
|
||||
/* Compute the addresses of the panel B01 and the block
|
||||
B11. */ \
|
||||
b01 = b1 + off_a10 * PACKNR; \
|
||||
b11 = b1 + off_a11 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1011 * PACKMR; \
|
||||
@@ -327,8 +298,7 @@ void PASTEMAC(ch,varname)( \
|
||||
alpha_cast, \
|
||||
a10, \
|
||||
a11, \
|
||||
bp01, \
|
||||
bp11, \
|
||||
b01, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -340,8 +310,7 @@ void PASTEMAC(ch,varname)( \
|
||||
alpha_cast, \
|
||||
a10, \
|
||||
a11, \
|
||||
bp01, \
|
||||
bp11, \
|
||||
b01, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -373,7 +342,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
alpha_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -384,7 +353,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
|
||||
@@ -119,13 +119,6 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
/* Temporary buffer for duplicating elements of B. */ \
|
||||
ctype bd[ PASTEMAC(ch,maxkc) * \
|
||||
PASTEMAC(ch,packnr) * \
|
||||
PASTEMAC(ch,ndup) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype* restrict bp; \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. */ \
|
||||
ctype ct[ PASTEMAC(ch,mr) * \
|
||||
PASTEMAC(ch,nr) ] \
|
||||
@@ -138,8 +131,6 @@ void PASTEMAC(ch,varname)( \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKMR = PASTEMAC(ch,packmr); \
|
||||
const dim_t PACKNR = PASTEMAC(ch,packnr); \
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const bool_t DUPB = NDUP != 1; \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
@@ -151,11 +142,10 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a12; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict bp21; \
|
||||
ctype* restrict bp11; \
|
||||
ctype* restrict b21; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict a2; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
@@ -164,7 +154,6 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_nr; \
|
||||
dim_t k_a1112; \
|
||||
dim_t k_a11; \
|
||||
dim_t k_a12; \
|
||||
@@ -246,10 +235,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Compute the number of elements in B to duplicate per iteration. */ \
|
||||
k_a1112 = k; \
|
||||
k_nr = k_a1112 * NR; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = k * PACKMR; \
|
||||
@@ -261,12 +246,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* If the micro-kernel needs elements of B duplicated, set bp to
|
||||
point to the duplication buffer. If no duplication is called for,
|
||||
bp will be set to the current column panel of B for each iteration
|
||||
of the outer loop below. */ \
|
||||
if ( DUPB ) bp = bd; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -275,11 +254,6 @@ void PASTEMAC(ch,varname)( \
|
||||
c11 = c1 + (m_iter-1)*rstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* If duplication is needed, copy the current iteration's NR
|
||||
columns of B to a local buffer with each value duplicated. */ \
|
||||
if ( DUPB ) PASTEMAC(ch,dupl)( k_nr, b1, bp ); \
|
||||
else bp = b1; \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -309,16 +283,13 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Compute the addresses of the triangular block A11 and the
|
||||
panel A12. */ \
|
||||
a11 = a1; \
|
||||
a12 = a1 + k_a11 * PACKMR; \
|
||||
a11 = a1; \
|
||||
a12 = a1 + k_a11 * PACKMR; \
|
||||
\
|
||||
/* Now compute the corresponding addresses in Bd. */ \
|
||||
bp11 = bp + off_a11 * NR * NDUP; \
|
||||
bp21 = bp + off_a12 * NR * NDUP; \
|
||||
\
|
||||
/* Index into b1 to locate the MR x NR block of b1 that will be
|
||||
updated by the trsm subproblem. */ \
|
||||
b11 = b1 + off_a11 * PACKNR; \
|
||||
/* Compute the addresses of the panel B21 and the block
|
||||
B11. */ \
|
||||
b11 = b1 + off_a11 * PACKNR; \
|
||||
b21 = b1 + off_a12 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1112 * PACKMR; \
|
||||
@@ -338,8 +309,7 @@ void PASTEMAC(ch,varname)( \
|
||||
alpha_cast, \
|
||||
a12, \
|
||||
a11, \
|
||||
bp21, \
|
||||
bp11, \
|
||||
b21, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -351,8 +321,7 @@ void PASTEMAC(ch,varname)( \
|
||||
alpha_cast, \
|
||||
a12, \
|
||||
a11, \
|
||||
bp21, \
|
||||
bp11, \
|
||||
b21, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -384,7 +353,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
alpha_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
a2, b2 ); \
|
||||
@@ -395,7 +364,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
bp, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
a2, b2 ); \
|
||||
@@ -433,7 +402,7 @@ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: bp11 after (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
|
||||
*/ \
|
||||
|
||||
@@ -314,7 +314,6 @@ void PASTEMAC(ch,varname)( \
|
||||
b11, \
|
||||
a12, \
|
||||
a11, \
|
||||
a11, \
|
||||
c11, cs_c, rs_c, \
|
||||
b2, a2 ); \
|
||||
} \
|
||||
@@ -327,7 +326,6 @@ void PASTEMAC(ch,varname)( \
|
||||
b11, \
|
||||
a12, \
|
||||
a11, \
|
||||
a11, \
|
||||
ct, cs_ct, rs_ct, \
|
||||
b2, a2 ); \
|
||||
\
|
||||
|
||||
@@ -308,7 +308,6 @@ void PASTEMAC(ch,varname)( \
|
||||
b11, \
|
||||
a10, \
|
||||
a11, \
|
||||
a11, \
|
||||
c11, cs_c, rs_c, \
|
||||
b2, a2 ); \
|
||||
} \
|
||||
@@ -321,7 +320,6 @@ void PASTEMAC(ch,varname)( \
|
||||
b11, \
|
||||
a10, \
|
||||
a11, \
|
||||
a11, \
|
||||
ct, cs_ct, rs_ct, \
|
||||
b2, a2 ); \
|
||||
\
|
||||
@@ -390,18 +388,6 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 += k_b0111 * PACKNR; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_ru_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC2( trsm_ru_ker_var2, GEMMTRSM_L_UKERNEL, GEMM_UKERNEL )
|
||||
|
||||
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b01, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
@@ -56,22 +55,20 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
\
|
||||
/* b11 = alpha * b11 - a10 * bd01; */ \
|
||||
/* b11 = alpha * b11 - a10 * b01; */ \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a10, \
|
||||
bd01, \
|
||||
b01, \
|
||||
alpha, \
|
||||
b11, rs_b, cs_b, \
|
||||
a_next, \
|
||||
b_next ); \
|
||||
\
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */ \
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */ \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bd11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
}
|
||||
|
||||
|
||||
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b01, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b21, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
@@ -60,18 +59,16 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
a12, \
|
||||
bd21, \
|
||||
b21, \
|
||||
alpha, \
|
||||
b11, rs_b, cs_b, \
|
||||
a_next, \
|
||||
b_next ); \
|
||||
\
|
||||
/* b11 = inv(a11) * b11;
|
||||
bd11 = b11; (skipped if duplication is disabled)
|
||||
c11 = b11; */ \
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */ \
|
||||
PASTEMAC(ch,trsmukr)( a11, \
|
||||
b11, \
|
||||
bd11, \
|
||||
c11, rs_c, cs_c ); \
|
||||
}
|
||||
|
||||
|
||||
@@ -44,8 +44,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b21, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
|
||||
@@ -42,7 +42,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
|
||||
@@ -42,7 +42,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
|
||||
@@ -53,17 +53,17 @@
|
||||
if ( incx_blas < 0 ) \
|
||||
{ \
|
||||
/* The semantics of negative stride in BLAS are that the vector
|
||||
operand be traversed in reverse order. (Another way to think of
|
||||
this is that negative strides effectively reverse the order of
|
||||
the vector, but without any explicit data movements.) This is
|
||||
also how BLIS interprets negative strides. The difference is
|
||||
that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */ \
|
||||
operand be traversed in reverse order. (Another way to think
|
||||
of this is that negative strides effectively reverse the order
|
||||
of the vector, but without any explicit data movements.) This
|
||||
is also how BLIS interprets negative strides. The difference
|
||||
is that with BLAS, the caller *always* passes in the 0th (i.e.,
|
||||
top-most or left-most) element of the vector, even when the
|
||||
stride is negative. By contrast, in BLIS, negative strides are
|
||||
used *relative* to the vector address as it is given. Thus, in
|
||||
BLIS, if this backwards traversal is desired, the caller *must*
|
||||
pass in the address to the (n-1)th (i.e., the bottom-most or
|
||||
right-most) element along with a negative stride. */ \
|
||||
x_blis = (x_blas) + (n-1)*(-incx_blas); \
|
||||
incx_blis = ( inc_t )(incx_blas); \
|
||||
} \
|
||||
|
||||
@@ -234,13 +234,6 @@
|
||||
#define bli_zpackkr BLIS_PACKDIM_KR_Z
|
||||
#define bli_zpacknr BLIS_PACKDIM_NR_Z
|
||||
|
||||
// Duplication factors
|
||||
|
||||
#define bli_sndup BLIS_DEFAULT_NUM_DUPL_S
|
||||
#define bli_dndup BLIS_DEFAULT_NUM_DUPL_D
|
||||
#define bli_cndup BLIS_DEFAULT_NUM_DUPL_C
|
||||
#define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z
|
||||
|
||||
// Incremental packing factors
|
||||
|
||||
#define bli_snifac BLIS_DEFAULT_NI_FAC
|
||||
|
||||
@@ -189,11 +189,6 @@ extern "C" {
|
||||
#include "bli_trsv.h"
|
||||
|
||||
|
||||
// -- Helper operands for ukernels --
|
||||
|
||||
#include "bli_dupl.h"
|
||||
|
||||
|
||||
// -- Level-3 operations --
|
||||
|
||||
#include "bli_gemm.h"
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname, varname ) \
|
||||
\
|
||||
void PASTEMAC0(opname)( \
|
||||
obj_t* b, \
|
||||
obj_t* bd \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC0(varname)( b, \
|
||||
bd ); \
|
||||
}
|
||||
|
||||
GENFRONT( dupl, DUPL_KERNEL )
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
dim_t k, \
|
||||
ctype* b, \
|
||||
ctype* bd \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,varname)( k, \
|
||||
b, \
|
||||
bd ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( dupl, DUPL_KERNEL )
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_dupl_unb_var1.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bli_dupl( obj_t* b,
|
||||
obj_t* bd );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
dim_t k, \
|
||||
ctype* b, \
|
||||
ctype* bd \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( dupl )
|
||||
|
||||
@@ -1,108 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T dupl_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
dim_t k,
|
||||
void* b,
|
||||
void* bd
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,dupl_unb_var1);
|
||||
|
||||
|
||||
//
// Object-based wrapper: queries the datatype and buffers of the operands,
// derives k from the storage of b, and dispatches to the typed
// implementation for that datatype.
//
void bli_dupl_unb_var1( obj_t* b, obj_t* bd )
{
	const num_t dt  = bli_obj_datatype( *b );
	void*       src = bli_obj_buffer_at_off( *b );
	void*       dst = bli_obj_buffer_at_off( *bd );

	// k runs "perpendicular" to the storage dimension: the length of b
	// when b is row-stored, its width otherwise.
	const dim_t k = bli_obj_is_row_stored( *b ) ? bli_obj_length( *b )
	                                            : bli_obj_width( *b );

	// Select the typed implementation by datatype and call it.
	ftypes[dt]( k, src, dst );
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname, kername ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* b, \
|
||||
void* bd \
|
||||
) \
|
||||
{ \
|
||||
ctype* b_cast = b; \
|
||||
ctype* bd_cast = bd; \
|
||||
\
|
||||
const dim_t NDUP = PASTEMAC(ch,ndup); \
|
||||
const dim_t NR = PASTEMAC(ch,nr); \
|
||||
const dim_t PACKNR = PASTEMAC(ch,packnr); \
|
||||
\
|
||||
dim_t i, j, el, d; \
|
||||
\
|
||||
for ( el = 0; el < n; ++el ) \
|
||||
{ \
|
||||
i = el / NR; \
|
||||
j = el % NR; \
|
||||
\
|
||||
for ( d = 0; d < NDUP; ++d ) \
|
||||
{ \
|
||||
*(bd_cast + el*NDUP + d) = *(b_cast + i*PACKNR + j); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( dupl_unb_var1, dupl_unb_var1 )
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bli_dupl_unb_var1( obj_t* b,
|
||||
obj_t* bd );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t n, \
|
||||
void* b, \
|
||||
void* bd \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( dupl_unb_var1 )
|
||||
|
||||
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aL, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict bT, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
@@ -59,7 +58,7 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
aL, \
|
||||
bdT, \
|
||||
bT, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b, \
|
||||
a_next, \
|
||||
@@ -67,7 +66,6 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
PASTEMAC(ch,trsmukr)( a, \
|
||||
b, \
|
||||
bd, \
|
||||
c, rs_c, cs_c ); \
|
||||
}
|
||||
|
||||
|
||||
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aL, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdT, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict bT, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -43,8 +43,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aR, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict bB, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
@@ -59,14 +58,13 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,gemmukr)( k, \
|
||||
minus_one, \
|
||||
aR, \
|
||||
bdB, \
|
||||
bB, \
|
||||
alpha, \
|
||||
b, rs_b, cs_b, \
|
||||
a_next, b_next ); \
|
||||
\
|
||||
PASTEMAC(ch,trsmukr)( a, \
|
||||
b, \
|
||||
bd, \
|
||||
c, rs_c, cs_c ); \
|
||||
}
|
||||
|
||||
|
||||
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict aR, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict bdB, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict bB, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
|
||||
@@ -39,7 +39,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
|
||||
@@ -41,7 +41,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
|
||||
@@ -39,7 +39,6 @@
|
||||
void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict bd, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
|
||||
@@ -1,152 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sdupl_opt_var1(
|
||||
dim_t n_elem,
|
||||
float* b,
|
||||
float* bd
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
//
// Double-precision real variant, written in inline assembly.
//
// For each of the n_elem doubles in b, movddup loads the value into both
// 64-bit halves of an xmm register and movapd stores the resulting 16 bytes
// to bd, so bd receives two consecutive copies of every element of b.
//
//   n_elem  number of doubles to read from b.
//   b       source buffer; advanced 8 bytes per element.
//   bd      destination buffer; advanced 16 bytes per element. movapd is an
//           aligned store, so bd must be 16-byte aligned.
//
// The main loop processes 8 elements per iteration (n_iter iterations) and
// the edge loop processes the remaining n_left elements one at a time.
//
// NOTE(review): the addresses of b and bd are moved with movl into the
// 32-bit registers eax/ebx, so this presumably targets 32-bit x86 only --
// confirm before building for a 64-bit target.
// NOTE(review): the labels (.LOOPNITER, .CONSIDERNLEFT, ...) are fixed
// assembler names; they would collide if this asm were emitted more than
// once in a translation unit -- confirm it is never inlined or duplicated.
//
void bli_ddupl_opt_var1(
|
||||
dim_t n_elem,
|
||||
double* b,
|
||||
double* bd
|
||||
)
|
||||
{
|
||||
// Split the work into 8-element chunks plus a remainder.
dim_t n_iter = n_elem / 8;
|
||||
dim_t n_left = n_elem % 8;
|
||||
|
||||
// Operands: %0 = n_iter, %1 = n_left, %2 = b, %3 = bd.
__asm__ volatile
|
||||
(
|
||||
" \n\t"
|
||||
"movl %2, %%eax \n\t" // load address of b.
|
||||
"movl %3, %%ebx \n\t" // load address of bd.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movl %0, %%esi \n\t" // i = n_iter;
|
||||
"testl %%esi, %%esi \n\t" // check n_iter via logical AND.
|
||||
"je .CONSIDERNLEFT \n\t" // if i == 0, jump to code that
|
||||
" \n\t" // contains the n_left loop.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".LOOPNITER: \n\t" // MAIN LOOP
|
||||
" \n\t"
|
||||
"movddup 0 * 8(%%eax), %%xmm0 \n\t"
|
||||
"movddup 1 * 8(%%eax), %%xmm1 \n\t"
|
||||
"movddup 2 * 8(%%eax), %%xmm2 \n\t"
|
||||
"movddup 3 * 8(%%eax), %%xmm3 \n\t"
|
||||
"movddup 4 * 8(%%eax), %%xmm4 \n\t"
|
||||
"movddup 5 * 8(%%eax), %%xmm5 \n\t"
|
||||
"movddup 6 * 8(%%eax), %%xmm6 \n\t"
|
||||
"movddup 7 * 8(%%eax), %%xmm7 \n\t"
|
||||
"addl $64, %%eax \n\t" // b += 8;
|
||||
" \n\t"
|
||||
"movapd %%xmm0, 0 * 16(%%ebx) \n\t"
|
||||
"movapd %%xmm1, 1 * 16(%%ebx) \n\t"
|
||||
"movapd %%xmm2, 2 * 16(%%ebx) \n\t"
|
||||
"movapd %%xmm3, 3 * 16(%%ebx) \n\t"
|
||||
"movapd %%xmm4, 4 * 16(%%ebx) \n\t"
|
||||
"movapd %%xmm5, 5 * 16(%%ebx) \n\t"
|
||||
"movapd %%xmm6, 6 * 16(%%ebx) \n\t"
|
||||
"movapd %%xmm7, 7 * 16(%%ebx) \n\t"
|
||||
"addl $128, %%ebx \n\t" // bd += 16;
|
||||
" \n\t"
|
||||
"decl %%esi \n\t" // i -= 1;
|
||||
"jne .LOOPNITER \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".CONSIDERNLEFT: \n\t"
|
||||
" \n\t"
|
||||
"movl %1, %%esi \n\t" // i = n_left;
|
||||
"testl %%esi, %%esi \n\t" // check n_left via logical AND.
|
||||
"je .DONE \n\t" // if i == 0, we're done; jump to end.
|
||||
" \n\t" // else, we prepare to enter n_left loop.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".LOOPNLEFT: \n\t" // EDGE LOOP
|
||||
" \n\t"
|
||||
"movddup 0 * 8(%%eax), %%xmm0 \n\t"
|
||||
"addl $8, %%eax \n\t" // b += 1;
|
||||
" \n\t"
|
||||
"movapd %%xmm0, 0 * 16(%%ebx) \n\t"
|
||||
"addl $16, %%ebx \n\t" // bd += 2;
|
||||
" \n\t"
|
||||
"decl %%esi \n\t" // i -= 1;
|
||||
"jne .LOOPNLEFT \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DONE: \n\t"
|
||||
" \n\t"
|
||||
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"r" (n_iter),
|
||||
"r" (n_left),
|
||||
"m" (b),
|
||||
"m" (bd)
|
||||
: // register clobber list
|
||||
"eax", "ebx", "esi",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
"xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
void bli_cdupl_opt_var1(
|
||||
dim_t k,
|
||||
scomplex* b,
|
||||
scomplex* bd
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
void bli_zdupl_opt_var1(
|
||||
dim_t k,
|
||||
dcomplex* b,
|
||||
dcomplex* bd
|
||||
)
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
}
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2013, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t n_elem, \
|
||||
ctype* b, \
|
||||
ctype* bd \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( dupl_opt_var1 )
|
||||
|
||||
@@ -39,8 +39,7 @@ void bli_sgemmtrsm_l_opt_d4x4(
|
||||
float* restrict alpha,
|
||||
float* restrict a10,
|
||||
float* restrict a11,
|
||||
float* restrict bd01,
|
||||
float* restrict bd11,
|
||||
float* restrict b01,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
@@ -55,8 +54,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
double* restrict alpha,
|
||||
double* restrict a10,
|
||||
double* restrict a11,
|
||||
double* restrict bd01,
|
||||
double* restrict bd11,
|
||||
double* restrict b01,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
@@ -73,8 +71,8 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
(
|
||||
" \n\t"
|
||||
"movq %2, %%rax \n\t" // load address of a10.
|
||||
"movq %4, %%rbx \n\t" // load address of bd01.
|
||||
//"movq %11, %%r9 \n\t" // load address of b_next.
|
||||
"movq %4, %%rbx \n\t" // load address of b01.
|
||||
//"movq %10, %%r9 \n\t" // load address of b_next.
|
||||
" \n\t"
|
||||
"subq $-8 * 16, %%rax \n\t" // increment pointers to allow byte
|
||||
"subq $-8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
|
||||
@@ -83,7 +81,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
"movaps -7 * 16(%%rax), %%xmm1 \n\t" // of a and b.
|
||||
"movaps -8 * 16(%%rbx), %%xmm2 \n\t"
|
||||
" \n\t"
|
||||
//"movq %7, %%rcx \n\t" // load address of c11
|
||||
//"movq %6, %%rcx \n\t" // load address of c11
|
||||
//"movq %9, %%rdi \n\t" // load cs_c
|
||||
//"leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double)
|
||||
//"leaq (%%rcx,%%rdi,2), %%rdx \n\t" // load address of c + 2*cs_c;
|
||||
@@ -320,7 +318,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rbx \n\t" // load address of b11.
|
||||
"movq %5, %%rbx \n\t" // load address of b11.
|
||||
" \n\t"
|
||||
" \n\t" // xmm8: xmm9: xmm10: xmm11:
|
||||
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
|
||||
@@ -354,7 +352,7 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
|
||||
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
|
||||
" \n\t"
|
||||
"movq %10, %%rax \n\t" // load address of alpha
|
||||
"movq %9, %%rax \n\t" // load address of alpha
|
||||
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate
|
||||
" \n\t"
|
||||
"movaps 0 * 16(%%rbx), %%xmm8 \n\t"
|
||||
@@ -394,10 +392,10 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %3, %%rax \n\t" // load address of a11
|
||||
"movq %7, %%rcx \n\t" // load address of c11
|
||||
"movq %6, %%rcx \n\t" // load address of c11
|
||||
" \n\t"
|
||||
"movq %8, %%rsi \n\t" // load rs_c
|
||||
"movq %9, %%rdi \n\t" // load cs_c
|
||||
"movq %7, %%rsi \n\t" // load rs_c
|
||||
"movq %8, %%rdi \n\t" // load cs_c
|
||||
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
|
||||
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
|
||||
" \n\t"
|
||||
@@ -514,18 +512,17 @@ void bli_dgemmtrsm_l_opt_d4x4(
|
||||
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter),
|
||||
"m" (k_left),
|
||||
"m" (a10),
|
||||
"m" (a11),
|
||||
"m" (bd01),
|
||||
"m" (bd11),
|
||||
"m" (b11),
|
||||
"m" (c11),
|
||||
"m" (rs_c),
|
||||
"m" (cs_c),
|
||||
"m" (alpha),
|
||||
"m" (b_next)
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a10), // 2
|
||||
"m" (a11), // 3
|
||||
"m" (b01), // 4
|
||||
"m" (b11), // 5
|
||||
"m" (c11), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (alpha), // 9
|
||||
"m" (b_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -542,8 +539,7 @@ void bli_cgemmtrsm_l_opt_d4x4(
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a10,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd01,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b01,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
@@ -558,8 +554,7 @@ void bli_zgemmtrsm_l_opt_d4x4(
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a10,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd01,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b01,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
|
||||
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a10, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd01, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b01, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -39,8 +39,7 @@ void bli_sgemmtrsm_u_opt_d4x4(
|
||||
float* restrict alpha,
|
||||
float* restrict a12,
|
||||
float* restrict a11,
|
||||
float* restrict bd21,
|
||||
float* restrict bd11,
|
||||
float* restrict b21,
|
||||
float* restrict b11,
|
||||
float* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict a_next,
|
||||
@@ -55,8 +54,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
|
||||
double* restrict alpha,
|
||||
double* restrict a12,
|
||||
double* restrict a11,
|
||||
double* restrict bd21,
|
||||
double* restrict bd11,
|
||||
double* restrict b21,
|
||||
double* restrict b11,
|
||||
double* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict a_next,
|
||||
@@ -73,7 +71,8 @@ void bli_dgemmtrsm_u_opt_d4x4(
|
||||
(
|
||||
" \n\t"
|
||||
"movq %2, %%rax \n\t" // load address of a12.
|
||||
"movq %4, %%rbx \n\t" // load address of bd21.
|
||||
"movq %4, %%rbx \n\t" // load address of b21.
|
||||
//"movq %10, %%r9 \n\t" // load address of b_next.
|
||||
" \n\t"
|
||||
"addq $8 * 16, %%rax \n\t" // increment pointers to allow byte
|
||||
"addq $8 * 16, %%rbx \n\t" // offsets in the unrolled iterations.
|
||||
@@ -302,7 +301,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rbx \n\t" // load address of b11.
|
||||
"movq %5, %%rbx \n\t" // load address of b11.
|
||||
" \n\t"
|
||||
" \n\t" // xmm8: xmm9: xmm10: xmm11:
|
||||
" \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
|
||||
@@ -336,7 +335,7 @@ void bli_dgemmtrsm_u_opt_d4x4(
|
||||
" \n\t" // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 )
|
||||
" \n\t" // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 )
|
||||
" \n\t"
|
||||
"movq %10, %%rax \n\t" // load address of alpha
|
||||
"movq %9, %%rax \n\t" // load address of alpha
|
||||
"movddup (%%rax), %%xmm15 \n\t" // load alpha and duplicate
|
||||
" \n\t"
|
||||
"movaps 0 * 16(%%rbx), %%xmm8 \n\t"
|
||||
@@ -376,10 +375,10 @@ void bli_dgemmtrsm_u_opt_d4x4(
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %3, %%rax \n\t" // load address of a11
|
||||
"movq %7, %%rcx \n\t" // load address of c11
|
||||
"movq %6, %%rcx \n\t" // load address of c11
|
||||
" \n\t"
|
||||
"movq %8, %%rsi \n\t" // load rs_c
|
||||
"movq %9, %%rdi \n\t" // load cs_c
|
||||
"movq %7, %%rsi \n\t" // load rs_c
|
||||
"movq %8, %%rdi \n\t" // load cs_c
|
||||
"salq $3, %%rsi \n\t" // rs_c *= sizeof( double )
|
||||
"salq $3, %%rdi \n\t" // cs_c *= sizeof( double )
|
||||
" \n\t"
|
||||
@@ -499,17 +498,17 @@ void bli_dgemmtrsm_u_opt_d4x4(
|
||||
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter),
|
||||
"m" (k_left),
|
||||
"m" (a12),
|
||||
"m" (a11),
|
||||
"m" (bd21),
|
||||
"m" (bd11),
|
||||
"m" (b11),
|
||||
"m" (c11),
|
||||
"m" (rs_c),
|
||||
"m" (cs_c),
|
||||
"m" (alpha)
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a12), // 2
|
||||
"m" (a11), // 3
|
||||
"m" (b21), // 4
|
||||
"m" (b11), // 5
|
||||
"m" (c11), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c), // 8
|
||||
"m" (alpha), // 9
|
||||
"m" (b_next) // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
@@ -526,8 +525,7 @@ void bli_cgemmtrsm_u_opt_d4x4(
|
||||
scomplex* restrict alpha,
|
||||
scomplex* restrict a12,
|
||||
scomplex* restrict a11,
|
||||
scomplex* restrict bd21,
|
||||
scomplex* restrict bd11,
|
||||
scomplex* restrict b21,
|
||||
scomplex* restrict b11,
|
||||
scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
scomplex* restrict a_next,
|
||||
@@ -542,8 +540,7 @@ void bli_zgemmtrsm_u_opt_d4x4(
|
||||
dcomplex* restrict alpha,
|
||||
dcomplex* restrict a12,
|
||||
dcomplex* restrict a11,
|
||||
dcomplex* restrict bd21,
|
||||
dcomplex* restrict bd11,
|
||||
dcomplex* restrict b21,
|
||||
dcomplex* restrict b11,
|
||||
dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
|
||||
dcomplex* restrict a_next,
|
||||
|
||||
@@ -41,8 +41,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a12, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bd21, \
|
||||
ctype* restrict bd11, \
|
||||
ctype* restrict b21, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict a_next, \
|
||||
|
||||
@@ -3,7 +3,7 @@ c #rg # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major;
|
||||
c #rji # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
|
||||
0 # Test all combinations of storage schemes?
|
||||
32 # General stride spacing (for cases when testing general stride)
|
||||
sdcz #sdcz # Datatype(s) to test
|
||||
d #sdcz # Datatype(s) to test
|
||||
100 # Problem size: first to test
|
||||
300 # Problem size: maximum to test
|
||||
100 # Problem size: increment between experiments
|
||||
|
||||
@@ -64,8 +64,7 @@ void libblis_test_gemmtrsm_ukr_impl( mt_impl_t impl,
|
||||
obj_t* alpha,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bdx1,
|
||||
obj_t* bd11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* c11 );
|
||||
|
||||
@@ -82,13 +81,10 @@ void libblis_test_gemmtrsm_ukr_check( side_t side,
|
||||
void bli_gemmtrsm_ukr_make_subparts( dim_t k,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* bd,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* bdx1,
|
||||
obj_t* bd11 );
|
||||
obj_t* b11 );
|
||||
|
||||
|
||||
void libblis_test_gemmtrsm_ukr_deps( test_params_t* params, test_op_t* op )
|
||||
@@ -166,10 +162,10 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
|
||||
|
||||
obj_t kappa;
|
||||
obj_t alpha;
|
||||
obj_t a_big, a, b, bd;
|
||||
obj_t a_big, a, b;
|
||||
obj_t b11, c11;
|
||||
obj_t ap, bp;
|
||||
obj_t a1xp, a11p, bdx1, bd11, bx1p, b11p;
|
||||
obj_t a1xp, a11p, bx1p, b11p;
|
||||
obj_t c11_save;
|
||||
|
||||
|
||||
@@ -201,8 +197,6 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
|
||||
sc_str[0], m, n, &c11 );
|
||||
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
|
||||
sc_str[0], m, n, &c11_save );
|
||||
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
|
||||
sc_b, k+m, 4*n, &bd );
|
||||
|
||||
// Set alpha.
|
||||
if ( bli_obj_is_real( b ) )
|
||||
@@ -264,8 +258,8 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
|
||||
|
||||
|
||||
// Create subpartitions from the a and b panels.
|
||||
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp, &bd,
|
||||
&a1xp, &a11p, &bx1p, &b11p, &bdx1, &bd11 );
|
||||
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
|
||||
&a1xp, &a11p, &bx1p, &b11p );
|
||||
|
||||
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
@@ -279,7 +273,7 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
|
||||
time = bli_clock();
|
||||
|
||||
libblis_test_gemmtrsm_ukr_impl( impl, side, &alpha,
|
||||
&a1xp, &a11p, &bdx1, &bd11, &b11p, &c11 );
|
||||
&a1xp, &a11p, &bx1p, &b11p, &c11 );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
}
|
||||
@@ -304,7 +298,6 @@ void libblis_test_gemmtrsm_ukr_experiment( test_params_t* params,
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c11 );
|
||||
bli_obj_free( &c11_save );
|
||||
bli_obj_free( &bd );
|
||||
}
|
||||
|
||||
|
||||
@@ -314,15 +307,14 @@ void libblis_test_gemmtrsm_ukr_impl( mt_impl_t impl,
|
||||
obj_t* alpha,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bdx1,
|
||||
obj_t* bd11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* c11 )
|
||||
{
|
||||
switch ( impl )
|
||||
{
|
||||
case BLIS_TEST_SEQ_UKERNEL:
|
||||
bli_gemmtrsm_ukr( alpha, a1x, a11, bdx1, bd11, b11, c11 );
|
||||
bli_gemmtrsm_ukr( alpha, a1x, a11, bx1, b11, c11 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -431,20 +423,16 @@ void libblis_test_gemmtrsm_ukr_check( side_t side,
|
||||
void bli_gemmtrsm_ukr_make_subparts( dim_t k,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* bd,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* bdx1,
|
||||
obj_t* bd11 )
|
||||
obj_t* b11 )
|
||||
{
|
||||
dim_t mr = bli_obj_length( *a );
|
||||
dim_t nr = bli_obj_width( *b );
|
||||
|
||||
dim_t off_a1x, off_a11;
|
||||
dim_t off_bx1, off_b11;
|
||||
dim_t off_bdx1, off_bd11;
|
||||
|
||||
if ( bli_obj_is_lower( *a ) )
|
||||
{
|
||||
@@ -452,8 +440,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
|
||||
off_a11 = k;
|
||||
off_bx1 = 0;
|
||||
off_b11 = k;
|
||||
off_bdx1 = 0;
|
||||
off_bd11 = k;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -461,8 +447,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
|
||||
off_a11 = 0;
|
||||
off_bx1 = mr;
|
||||
off_b11 = 0;
|
||||
off_bdx1 = mr;
|
||||
off_bd11 = 0;
|
||||
}
|
||||
|
||||
bli_obj_init_subpart_from( *a, *a1x );
|
||||
@@ -488,28 +472,6 @@ void bli_gemmtrsm_ukr_make_subparts( dim_t k,
|
||||
// Set the diagonal offset of a11 to 0 (which overwrites the diagonal
|
||||
// offset value it inherited from a).
|
||||
bli_obj_set_diag_offset( 0, *a11 );
|
||||
|
||||
// If duplication is disabled, alias bdxx objects to bxx.
|
||||
if ( TRUE )
|
||||
{
|
||||
bli_obj_alias_to( *bx1, *bdx1 );
|
||||
bli_obj_alias_to( *b11, *bd11 );
|
||||
}
|
||||
else // if duplication is enabled
|
||||
{
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
bli_obj_init_subpart_from( *b, *bdx1 );
|
||||
bli_obj_set_dims( k, nr, *bdx1 );
|
||||
bli_obj_inc_offs( off_bdx1, 0, *bdx1 );
|
||||
|
||||
bli_obj_init_subpart_from( *b, *bd11 );
|
||||
bli_obj_set_dims( mr, nr, *bd11 );
|
||||
bli_obj_inc_offs( off_bd11, 0, *bd11 );
|
||||
|
||||
// Now update the buffer fields of bdx1, bd11, and then call
|
||||
// bli_dupl().
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -527,8 +489,7 @@ typedef void (*FUNCPTR_T)(
|
||||
void* alpha,
|
||||
void* a1x,
|
||||
void* a11,
|
||||
void* bdx1,
|
||||
void* bd11,
|
||||
void* bx1,
|
||||
void* b11,
|
||||
void* c11, inc_t rs_c, inc_t cs_c,
|
||||
void* a_next,
|
||||
@@ -542,8 +503,7 @@ static FUNCPTR_T GENARRAY(ftypes_u,gemmtrsm_u_ukr);
|
||||
void bli_gemmtrsm_ukr( obj_t* alpha,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bdx1,
|
||||
obj_t* bd11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* c11 )
|
||||
{
|
||||
@@ -555,9 +515,7 @@ void bli_gemmtrsm_ukr( obj_t* alpha,
|
||||
|
||||
void* buf_a11 = bli_obj_buffer_at_off( *a11 );
|
||||
|
||||
void* buf_bdx1 = bli_obj_buffer_at_off( *bdx1 );
|
||||
|
||||
void* buf_bd11 = bli_obj_buffer_at_off( *bd11 );
|
||||
void* buf_bx1 = bli_obj_buffer_at_off( *bx1 );
|
||||
|
||||
void* buf_b11 = bli_obj_buffer_at_off( *b11 );
|
||||
|
||||
@@ -579,12 +537,11 @@ void bli_gemmtrsm_ukr( obj_t* alpha,
|
||||
buf_alpha,
|
||||
buf_a1x,
|
||||
buf_a11,
|
||||
buf_bdx1,
|
||||
buf_bd11,
|
||||
buf_bx1,
|
||||
buf_b11,
|
||||
buf_c11, rs_c, cs_c,
|
||||
buf_a1x,
|
||||
buf_bdx1 );
|
||||
buf_bx1 );
|
||||
}
|
||||
|
||||
|
||||
@@ -596,8 +553,7 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha, \
|
||||
void* a1x, \
|
||||
void* a11, \
|
||||
void* bdx1, \
|
||||
void* bd11, \
|
||||
void* bx1, \
|
||||
void* b11, \
|
||||
void* c11, inc_t rs_c, inc_t cs_c, \
|
||||
void* a_next, \
|
||||
@@ -608,8 +564,7 @@ void PASTEMAC(ch,varname)( \
|
||||
alpha, \
|
||||
a1x, \
|
||||
a11, \
|
||||
bdx1, \
|
||||
bd11, \
|
||||
bx1, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
a_next, \
|
||||
|
||||
@@ -40,8 +40,7 @@ void libblis_test_gemmtrsm_ukr( test_params_t* params, test_op_t* op );
|
||||
void bli_gemmtrsm_ukr( obj_t* alpha,
|
||||
obj_t* a1x,
|
||||
obj_t* a11,
|
||||
obj_t* bdx1,
|
||||
obj_t* bd11,
|
||||
obj_t* bx1,
|
||||
obj_t* b11,
|
||||
obj_t* c11 );
|
||||
|
||||
@@ -53,8 +52,7 @@ void PASTEMAC(ch,varname)( \
|
||||
void* alpha, \
|
||||
void* a1x, \
|
||||
void* a11, \
|
||||
void* bdx1, \
|
||||
void* bd11, \
|
||||
void* bx1, \
|
||||
void* b11, \
|
||||
void* c11, inc_t rs_c, inc_t cs_c, \
|
||||
void* a_next, \
|
||||
|
||||
@@ -653,18 +653,6 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
BLIS_DEFAULT_NI_Z );
|
||||
*/
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-3 packing duplication s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_NUM_DUPL_S,
|
||||
BLIS_DEFAULT_NUM_DUPL_D,
|
||||
BLIS_DEFAULT_NUM_DUPL_C,
|
||||
BLIS_DEFAULT_NUM_DUPL_Z );
|
||||
libblis_test_fprintf_c( os, " elements per register %5u %5u %5u %5u\n",
|
||||
BLIS_NUM_ELEM_PER_REG_S,
|
||||
BLIS_NUM_ELEM_PER_REG_D,
|
||||
BLIS_NUM_ELEM_PER_REG_C,
|
||||
BLIS_NUM_ELEM_PER_REG_Z );
|
||||
libblis_test_fprintf_c( os, "\n" );
|
||||
libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" );
|
||||
libblis_test_fprintf_c( os, " m dimension %5u %5u %5u %5u\n",
|
||||
BLIS_DEFAULT_L2_MC_S,
|
||||
|
||||
@@ -63,7 +63,6 @@ void libblis_test_trsm_ukr_impl( mt_impl_t impl,
|
||||
side_t side,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* bd,
|
||||
obj_t* c );
|
||||
|
||||
void libblis_test_trsm_ukr_check( side_t side,
|
||||
@@ -148,7 +147,7 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
|
||||
uplo_t uploa;
|
||||
|
||||
obj_t kappa;
|
||||
obj_t a, b, bd, c;
|
||||
obj_t a, b, c;
|
||||
obj_t ap, bp;
|
||||
obj_t c_save;
|
||||
|
||||
@@ -177,8 +176,6 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
|
||||
sc_str[0], m, n, &c );
|
||||
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
|
||||
sc_str[0], m, n, &c_save );
|
||||
libblis_test_mobj_create( params, datatype, BLIS_NO_TRANSPOSE,
|
||||
sc_b, m, 4*n, &bd );
|
||||
|
||||
// Set the structure, uplo, and diagonal offset properties of A.
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, a );
|
||||
@@ -229,14 +226,11 @@ void libblis_test_trsm_ukr_experiment( test_params_t* params,
|
||||
// Re-pack the contents of b to bp.
|
||||
bli_packm_blk_var2( &BLIS_ONE, &b, &bp );
|
||||
|
||||
// Re-duplicate the contents of bp to bd.
|
||||
bli_dupl( &bp, &bd );
|
||||
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
time = bli_clock();
|
||||
|
||||
libblis_test_trsm_ukr_impl( impl, side, &ap, &bp, &bd, &c );
|
||||
libblis_test_trsm_ukr_impl( impl, side, &ap, &bp, &c );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
}
|
||||
@@ -268,13 +262,12 @@ void libblis_test_trsm_ukr_impl( mt_impl_t impl,
|
||||
side_t side,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* bd,
|
||||
obj_t* c )
|
||||
{
|
||||
switch ( impl )
|
||||
{
|
||||
case BLIS_TEST_SEQ_UKERNEL:
|
||||
bli_trsm_ukr( a, b, bd, c );
|
||||
bli_trsm_ukr( a, b, c );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -386,7 +379,6 @@ void libblis_test_trsm_ukr_check( side_t side,
|
||||
typedef void (*FUNCPTR_T)(
|
||||
void* a,
|
||||
void* b,
|
||||
void* bd,
|
||||
void* c, inc_t rs_c, inc_t cs_c
|
||||
);
|
||||
|
||||
@@ -396,7 +388,6 @@ static FUNCPTR_T GENARRAY(ftypes_u,trsm_u_ukr);
|
||||
|
||||
void bli_trsm_ukr( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* bd,
|
||||
obj_t* c )
|
||||
{
|
||||
num_t dt = bli_obj_datatype( *c );
|
||||
@@ -405,8 +396,6 @@ void bli_trsm_ukr( obj_t* a,
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( *b );
|
||||
|
||||
void* buf_bd = bli_obj_buffer_at_off( *bd );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
inc_t cs_c = bli_obj_col_stride( *c );
|
||||
@@ -421,7 +410,6 @@ void bli_trsm_ukr( obj_t* a,
|
||||
// Invoke the function.
|
||||
f( buf_a,
|
||||
buf_b,
|
||||
buf_bd,
|
||||
buf_c, rs_c, cs_c );
|
||||
}
|
||||
|
||||
@@ -432,13 +420,11 @@ void bli_trsm_ukr( obj_t* a,
|
||||
void PASTEMAC(ch,varname)( \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* bd, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
PASTEMAC(ch,ukrname)( a, \
|
||||
b, \
|
||||
bd, \
|
||||
c, rs_c, cs_c ); \
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,6 @@ void libblis_test_trsm_ukr( test_params_t* params, test_op_t* op );
|
||||
//
|
||||
void bli_trsm_ukr( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* bd,
|
||||
obj_t* c );
|
||||
|
||||
#undef GENTPROT
|
||||
@@ -48,7 +47,6 @@ void bli_trsm_ukr( obj_t* a,
|
||||
void PASTEMAC(ch,varname)( \
|
||||
void* a, \
|
||||
void* b, \
|
||||
void* bd, \
|
||||
void* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
|
||||
@@ -146,34 +146,6 @@
|
||||
#define BLIS_EXTEND_KR_C 0
|
||||
#define BLIS_EXTEND_KR_Z 0
|
||||
|
||||
// -- Number of elements per vector register --
|
||||
|
||||
// NOTE: These constants are typically only used to determine the amount
|
||||
// of duplication needed when configuring level-3 macro-kernels that
|
||||
// copy and duplicate elements of B to a temporary duplication buffer
|
||||
// (so that element-wise vector multiplication and addition instructions
|
||||
// can be used).
|
||||
|
||||
#define BLIS_NUM_ELEM_PER_REG_S 4
|
||||
#define BLIS_NUM_ELEM_PER_REG_D 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_C 2
|
||||
#define BLIS_NUM_ELEM_PER_REG_Z 1
|
||||
|
||||
// -- Default switch for duplication of B --
|
||||
|
||||
// NOTE: Setting these values to 1 disables duplication. Any value
|
||||
// d > 1 results in d-1 duplicates being created within a special
|
||||
// macro-kernel buffer of dimension k x NR*d.
|
||||
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_S BLIS_NUM_ELEM_PER_REG_S
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_D BLIS_NUM_ELEM_PER_REG_D
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_C BLIS_NUM_ELEM_PER_REG_C
|
||||
//#define BLIS_DEFAULT_NUM_DUPL_Z BLIS_NUM_ELEM_PER_REG_Z
|
||||
#define BLIS_DEFAULT_NUM_DUPL_S 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_D 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_C 1
|
||||
#define BLIS_DEFAULT_NUM_DUPL_Z 1
|
||||
|
||||
// -- Default incremental packing blocksizes (n dimension) --
|
||||
|
||||
// NOTE: These incremental packing blocksizes (for the n dimension) are only
|
||||
@@ -259,10 +231,6 @@
|
||||
|
||||
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
|
||||
|
||||
// -- dupl --
|
||||
|
||||
#define DUPL_KERNEL dupl_unb_var1
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define GEMM_UKERNEL gemm_ref_mxn
|
||||
|
||||
Reference in New Issue
Block a user