diff --git a/config/clarksville/bli_config.h b/config/clarksville/bli_config.h index ff5826d0a..04d168854 100644 --- a/config/clarksville/bli_config.h +++ b/config/clarksville/bli_config.h @@ -62,7 +62,7 @@ // contiguous memory pools. #define BLIS_NUM_MC_X_KC_BLOCKS BLIS_MAX_NUM_THREADS #define BLIS_NUM_KC_X_NC_BLOCKS 1 -#define BLIS_NUM_MC_X_NC_BLOCKS 0 +#define BLIS_NUM_MC_X_NC_BLOCKS 1 // The maximum preload byte offset is used to pad the end of the contiguous // memory pools so that the micro-kernel, when computing with the end of the diff --git a/config/clarksville/bli_kernel.h b/config/clarksville/bli_kernel.h index 208b4667e..4ed99b994 100644 --- a/config/clarksville/bli_kernel.h +++ b/config/clarksville/bli_kernel.h @@ -60,7 +60,7 @@ #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 256 -#define BLIS_DEFAULT_NC_D 600 +#define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MC_C 128 #define BLIS_DEFAULT_KC_C 256 @@ -176,7 +176,7 @@ // used by certain blocked variants. But when the *are* used, they MUST be // be an integer multiple of NR! -#define BLIS_DEFAULT_NI_FAC 16 +#define BLIS_DEFAULT_NI_FAC 1 #define BLIS_DEFAULT_NI_S (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S) #define BLIS_DEFAULT_NI_D (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D) #define BLIS_DEFAULT_NI_C (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index c217a34ee..41d882f65 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -373,8 +373,12 @@ void bli_packm_init_pack( bool_t densify, } else { - // If the pack schema is something else, we assume stride information - // of p is set later on, by the implementation. 
+ // NOTE: When implementing block storage, we only need to implement + // the following two cases: + // - row-stored blocks in row-major order + // - column-stored blocks in column-major order + // The other two combinations coincide with that of packed row-panel + // and packed column-panel storage. size_p = 0; } diff --git a/frame/3/gemm/bli_gemm.c b/frame/3/gemm/bli_gemm.c index 00796210f..802dd4cbe 100644 --- a/frame/3/gemm/bli_gemm.c +++ b/frame/3/gemm/bli_gemm.c @@ -34,7 +34,9 @@ #include "blis.h" -extern gemm_t* gemm_cntl; +extern gemm_t* gemm_cntl; +extern gemm_t* gemm_cntl_packa; +extern blksz_t* gemm_mc; // // Define object-based interface. @@ -110,6 +112,15 @@ void bli_gemm( obj_t* alpha, // Choose the control tree. cntl = gemm_cntl; +#if 0 + if ( bli_obj_length_after_trans( c_local ) <= + bli_blksz_total_for_obj( &c_local, gemm_mc ) ) + { + cntl = gemm_cntl_packa; + + } +#endif + // Invoke the internal back-end. bli_gemm_int( &alpha_local, &a_local, diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index d5737c184..1dcd1f769 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -43,6 +43,7 @@ #include "bli_gemm_blk_var4.h" #include "bli_gemm_ker_var2.h" +#include "bli_gemm_ker_var5.h" #include "bli_gemm_ref_mxn.h" diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 197d99e9b..1074bb0c5 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -37,12 +37,18 @@ extern scalm_t* scalm_cntl; gemm_t* gemm_cntl; +gemm_t* gemm_cntl_packa; gemm_t* gemm_cntl_bp_ke; gemm_t* gemm_cntl_op_bp; gemm_t* gemm_cntl_mm_op; gemm_t* gemm_cntl_vl_mm; +gemm_t* gemm_cntl_bp_ke5; +gemm_t* gemm_cntl_pm_bp; +gemm_t* gemm_cntl_mm_pm; +gemm_t* gemm_cntl_vl_mm5; + packm_t* gemm_packa_cntl; packm_t* gemm_packb_cntl; packm_t* gemm_packc_cntl; @@ -138,7 +144,7 @@ void bli_gemm_cntl_init() FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? 
BLIS_PACKED_COLUMNS, - BLIS_BUFFER_FOR_GEN_USE ); + BLIS_BUFFER_FOR_C_PANEL ); gemm_unpackc_cntl = @@ -147,6 +153,10 @@ void bli_gemm_cntl_init() NULL ); // no blocksize needed + // + // Create a control tree for packing A and B, and streaming C. + // + // Create control tree object for lowest-level block-panel kernel. gemm_cntl_bp_ke = @@ -160,7 +170,6 @@ void bli_gemm_cntl_init() gemm_cntl_op_bp = bli_gemm_cntl_obj_create( BLIS_BLOCKED, - //BLIS_VARIANT4, // var1 with incremental pack in iter 0 BLIS_VARIANT1, gemm_mc, gemm_ni, @@ -180,7 +189,7 @@ void bli_gemm_cntl_init() gemm_kc, NULL, NULL, - NULL, + NULL, NULL, NULL, gemm_cntl_op_bp, @@ -203,6 +212,60 @@ void bli_gemm_cntl_init() // Alias the "master" gemm control tree to a shorter name. gemm_cntl = gemm_cntl_vl_mm; + + + // + // Create a control tree for packing A, and streaming B and C. + // + + gemm_cntl_bp_ke5 + = + bli_gemm_cntl_obj_create( BLIS_UNB_OPT, + BLIS_VARIANT5, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL ); + gemm_cntl_pm_bp + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT3, + gemm_kc, + NULL, + NULL, + gemm_packa_cntl, + NULL, + //gemm_packc_cntl, + NULL, + gemm_cntl_bp_ke5, + //gemm_unpackc_cntl ); + NULL ); + + gemm_cntl_mm_pm + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT1, + gemm_mc, + NULL, + NULL, + NULL, + NULL, + NULL, + gemm_cntl_pm_bp, + NULL ); + + gemm_cntl_vl_mm5 + = + bli_gemm_cntl_obj_create( BLIS_BLOCKED, + BLIS_VARIANT2, + gemm_nc, + NULL, + NULL, + NULL, + NULL, + NULL, + gemm_cntl_mm_pm, + NULL ); + + gemm_cntl_packa = gemm_cntl_vl_mm5; } void bli_gemm_cntl_finalize() @@ -224,6 +287,11 @@ void bli_gemm_cntl_finalize() bli_cntl_obj_free( gemm_cntl_op_bp ); bli_cntl_obj_free( gemm_cntl_mm_op ); bli_cntl_obj_free( gemm_cntl_vl_mm ); + + bli_cntl_obj_free( gemm_cntl_bp_ke5 ); + bli_cntl_obj_free( gemm_cntl_pm_bp ); + bli_cntl_obj_free( gemm_cntl_mm_pm ); + bli_cntl_obj_free( gemm_cntl_vl_mm5 ); } gemm_t* bli_gemm_cntl_obj_create( impl_t impl_type, 
diff --git a/frame/3/gemm/bli_gemm_int.c b/frame/3/gemm/bli_gemm_int.c index 1b7d726de..ebcff79dc 100644 --- a/frame/3/gemm/bli_gemm_int.c +++ b/frame/3/gemm/bli_gemm_int.c @@ -50,7 +50,7 @@ static FUNCPTR_T vars[6][3] = { NULL, bli_gemm_ker_var2, bli_gemm_blk_var2 }, { NULL, NULL, bli_gemm_blk_var3 }, { NULL, NULL, bli_gemm_blk_var4 }, - { NULL, NULL, NULL }, + { NULL, bli_gemm_ker_var5, NULL }, { NULL, NULL, NULL } }; diff --git a/frame/3/gemm/bli_gemm_ker_var5.c b/frame/3/gemm/bli_gemm_ker_var5.c new file mode 100644 index 000000000..100e0bbc6 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var5.c @@ -0,0 +1,356 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemm_fp + +typedef void (*FUNCPTR_T)( + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c + ); + +static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5); + + +void bli_gemm_ker_var5( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + gemm_t* cntl ) +{ + num_t dt_exec = bli_obj_execution_datatype( *c ); + + dim_t m = bli_obj_length( *c ); + dim_t n = bli_obj_width( *c ); + dim_t k = bli_obj_width( *a ); + + void* buf_a = bli_obj_buffer_at_off( *a ); + inc_t rs_a = bli_obj_row_stride( *a ); + inc_t cs_a = bli_obj_col_stride( *a ); + inc_t ps_a = bli_obj_panel_stride( *a ); + + void* buf_b = bli_obj_buffer_at_off( *b ); + inc_t rs_b = bli_obj_row_stride( *b ); + inc_t cs_b = bli_obj_col_stride( *b ); + inc_t ps_b = bli_obj_panel_stride( *b ); + + void* buf_c = bli_obj_buffer_at_off( *c ); + inc_t rs_c = bli_obj_row_stride( *c ); + inc_t cs_c = bli_obj_col_stride( *c ); + + num_t dt_alpha; + void* buf_alpha; + + num_t dt_beta; + void* buf_beta; + + FUNCPTR_T f; + +/* + // Handle the special case where c and a are complex and b is real. + // Note that this is the ONLY case allowed by the inner kernel whereby + // the datatypes of a and b differ. 
In this situation, the execution + // datatype is real, so we need to inflate (by a factor of two): + // - the m dimension, + // - the column stride of c, + // - the column stride (ie: the panel length) of a, and + // - the panel stride of a. + if ( bli_obj_is_complex( *a ) && bli_obj_is_real( *b ) ) + { + m *= 2; + cs_c *= 2; + cs_a *= 2; + ps_a *= 2; + } +*/ + + // If alpha is a scalar constant, use dt_exec to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the alpha object and extract the buffer at the alpha offset. + bli_set_scalar_dt_buffer( alpha, dt_exec, dt_alpha, buf_alpha ); + + // If beta is a scalar constant, use dt_exec to extract the address of the + // corresponding constant value; otherwise, use the datatype encoded + // within the beta object and extract the buffer at the beta offset. + bli_set_scalar_dt_buffer( beta, dt_exec, dt_beta, buf_beta ); + + // Index into the type combination array to extract the correct + // function pointer. + f = ftypes[dt_exec]; + + // Invoke the function. + f( m, + n, + k, + buf_alpha, + buf_a, rs_a, cs_a, ps_a, + buf_b, rs_b, cs_b, ps_b, + buf_beta, + buf_c, rs_c, cs_c ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname, ukrname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c \ + ) \ +{ \ + /* Temporary buffer for duplicating elements of B. */ \ + ctype bd[ PASTEMAC(ch,maxkc) * \ + PASTEMAC(ch,packnr) * \ + PASTEMAC(ch,nifac) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + ctype* restrict bp; \ +\ + /* Temporary C buffer for edge cases. 
*/ \ + ctype ct[ PASTEMAC(ch,mr) * \ + PASTEMAC(ch,nr) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ct = 1; \ + const inc_t cs_ct = PASTEMAC(ch,mr); \ +\ + /* Alias some constants to shorter names. */ \ + const dim_t MR = PASTEMAC(ch,mr); \ + const dim_t NR = PASTEMAC(ch,nr); \ + const dim_t PACKNR = PASTEMAC(ch,packnr); \ +\ + ctype* restrict one = PASTEMAC(ch,1); \ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict a1; \ + ctype* restrict b1; \ + ctype* restrict c1; \ + ctype* restrict c11; \ + ctype* restrict a2; \ + ctype* restrict b2; \ +\ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t i, j; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == GEMM_MR + ps_a == stride to next row panel of A + rs_b == GEMM_NR + cs_b == 1 + ps_b == stride to next column panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = cs_b * NR; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + b1 = b_cast; \ + c1 = c_cast; \ +\ + /* The current packed micro-panel of B will always be stored in bd. */ \ + bp = bd; \ +\ + /* Since we pack micro-panels of B incrementally, one at a time, the + address of the next micro-panel of B remains constant. 
*/ \ + b2 = bd; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = 0; j < n_iter; ++j ) \ + { \ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Incrementally pack a single micro-panel of B. */ \ + PASTEMAC(ch,packm_cxk)( BLIS_NO_CONJUGATE, \ + NR, \ + k, \ + one, \ + b1, cs_b, rs_b, \ + bp, PACKNR ); \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 && m_left == 0 ) \ + { \ + a2 = a_cast; \ + } \ +\ + /* Invoke the gemm micro-kernel. */ \ + PASTEMAC(ch,ukrname)( k, \ + alpha_cast, \ + a1, \ + bp, \ + beta_cast, \ + c11, rs_c, cs_c, \ + a2, b2 ); \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + /* Bottom edge handling. */ \ + if ( m_left ) \ + { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a_cast; \ +\ + /* Invoke the gemm micro-kernel. */ \ + PASTEMAC(ch,ukrname)( k, \ + alpha_cast, \ + a1, \ + bp, \ + zero, \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ +\ + /* Scale the bottom edge of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_left, NR, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ +\ + b1 += cstep_b; \ + c1 += cstep_c; \ + } \ +\ + if ( n_left ) \ + { \ + a1 = a_cast; \ + c11 = c1; \ +\ + /* Right edge loop over the m dimension (MR rows at a time). */ \ + for ( i = 0; i < m_iter; ++i ) \ + { \ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = a1 + rstep_a; \ + if ( i == m_iter - 1 && m_left == 0 ) \ + { \ + a2 = a_cast; \ + } \ +\ + /* Invoke the gemm micro-kernel. */ \ + PASTEMAC(ch,ukrname)( k, \ + alpha_cast, \ + a1, \ + bp, \ + zero, \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ +\ + /* Scale the right edge of C and add the result from above. 
*/ \ + PASTEMAC(ch,xpbys_mxn)( MR, n_left, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ +\ + a1 += rstep_a; \ + c11 += rstep_c; \ + } \ +\ + /* Bottom-right corner handling. */ \ + if ( m_left ) \ + { \ + /* Compute the address of the next panel of A. */ \ + a2 = a_cast; \ +\ + /* Invoke the gemm micro-kernel. */ \ + PASTEMAC(ch,ukrname)( k, \ + alpha_cast, \ + a1, \ + bp, \ + zero, \ + ct, rs_ct, cs_ct, \ + a2, b2 ); \ +\ + /* Scale the bottom-right corner of C and add the result from above. */ \ + PASTEMAC(ch,xpbys_mxn)( m_left, n_left, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + } \ +\ +/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ +PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \ +} + +INSERT_GENTFUNC_BASIC( gemm_ker_var5, GEMM_UKERNEL ) + diff --git a/frame/3/gemm/bli_gemm_ker_var5.h b/frame/3/gemm/bli_gemm_ker_var5.h new file mode 100644 index 000000000..48b0d58ce --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var5.h @@ -0,0 +1,65 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2013, The University of Texas + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +// +// Prototype object-based interface. +// +void bli_gemm_ker_var5( obj_t* alpha, + obj_t* a, + obj_t* b, + obj_t* beta, + obj_t* c, + gemm_t* cntl ); + + +// +// Prototype BLAS-like interfaces. +// +#undef GENTPROT +#define GENTPROT( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname)( \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t cs_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c \ + ); + +INSERT_GENTPROT_BASIC( gemm_ker_var5 ) + diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index bbddcf409..89478c4c2 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -243,6 +243,12 @@ #define bli_cndup BLIS_DEFAULT_NUM_DUPL_C #define bli_zndup BLIS_DEFAULT_NUM_DUPL_Z +// Incremental packing factors + +#define bli_snifac BLIS_DEFAULT_NI_FAC +#define bli_dnifac BLIS_DEFAULT_NI_FAC +#define bli_cnifac BLIS_DEFAULT_NI_FAC +#define bli_znifac BLIS_DEFAULT_NI_FAC #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 1f3defc18..f593b3920 100644 --- 
a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -89,6 +89,7 @@ #define BLIS_BITVAL_PACKED_COLUMNS 0x40000 #define BLIS_BITVAL_PACKED_ROW_PANELS 0x50000 #define BLIS_BITVAL_PACKED_COL_PANELS 0x60000 +#define BLIS_BITVAL_PACKED_BLOCKS 0x70000 #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER 0x80000 #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 @@ -195,7 +196,8 @@ typedef enum BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS + BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, + BLIS_PACKED_BLOCKS = BLIS_BITVAL_PACKED_BLOCKS } pack_t; diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index afae782e5..49846a95e 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -534,6 +534,15 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) BLIS_EXTEND_NR_D, BLIS_EXTEND_NR_C, BLIS_EXTEND_NR_Z ); +/* + libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "level-3 incremental packing blocksizes \n" ); + libblis_test_fprintf_c( os, " n dimension %5u %5u %5u %5u\n", + BLIS_DEFAULT_NI_S, + BLIS_DEFAULT_NI_D, + BLIS_DEFAULT_NI_C, + BLIS_DEFAULT_NI_Z ); +*/ libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "level-3 packing duplication \n" ); libblis_test_fprintf_c( os, " dupl. factors for B %5u %5u %5u %5u\n",