Added option of slab or rr partitioning in jr/ir.

Details: - Updated existing macrokernel function names and definitions to explicitly use slab assignment of micropanels to threads, then created duplicate versions of macrokernels that explicitly use round-robin assignment instead of slab. NOTE: As in ac18949, trsm_r macrokernels were not substantially updated in this commit because they are currently disabled in bli_trsm_front.c. - Updated existing packing function (in blk_packm_blk_var1.c) to explicitly use slab partitioning, and then duplicated for round-robin. - Updated control tree initialization to use the appropriate macrokernel and packm function pointers depending on which method (slab or rr) was enabled at configure-time. - Updated configure script to accept new --thread-part-jrir=[slab|rr] option (-m [slab|rr] for short), which allows the user to explicitly request either slab or round-robin assignment (partitioning) of micropanels to threads. - Updated sandbox/ref99 according to above changes. - Minor updates to build/add-copyright.py.
2026-04-24 17:48:50 +00:00 · 2018-10-07 20:30:32 -05:00
parent 98e01ea04b
commit c92762ecdc
48 changed files with 7477 additions and 1394 deletions
--- a/sandbox/ref99/cntl/blx_gemm_cntl.c
+++ b/sandbox/ref99/cntl/blx_gemm_cntl.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -54,7 +55,28 @@ cntl_t* blx_gemmbp_cntl_create
       pack_t schema_b
     )
 {
-	void* macro_kernel_p = blx_gemm_ker_var2;
+	void* macro_kernel_fp;
+	void* packa_fp;
+	void* packb_fp;
+
+#ifdef BLIS_ENABLE_JRIR_SLAB
+
+	// Use the function pointers to the macrokernels that use slab
+	// assignment of micropanels to threads in the jr and ir loops.
+	macro_kernel_fp = blx_gemm_ker_var2sl;
+
+	packa_fp = bli_packm_blk_var1sl;
+	packb_fp = bli_packm_blk_var1sl;
+
+#else // BLIS_ENABLE_JRIR_RR
+
+	// Use the function pointers to the macrokernels that use round-robin
+	// assignment of micropanels to threads in the jr and ir loops.
+	macro_kernel_fp = bli_gemm_ker_var2rr;
+
+	packa_fp = bli_packm_blk_var1rr;
+	packb_fp = bli_packm_blk_var1rr;
+#endif

 	// Create two nodes for the macro-kernel.
 	cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node
@@ -69,7 +91,7 @@ cntl_t* blx_gemmbp_cntl_create
 	(
 	  family,
 	  BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
-	  macro_kernel_p,
+	  macro_kernel_fp,
 	  gemm_cntl_bu_ke
 	);

@@ -77,7 +99,7 @@ cntl_t* blx_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node
 	(
 	  blx_gemm_packa,  // pack the left-hand operand
-	  bli_packm_blk_var1,
+	  packa_fp,
 	  BLIS_MR,
 	  BLIS_KR,
 	  FALSE,   // do NOT invert diagonal
@@ -101,7 +123,7 @@ cntl_t* blx_gemmbp_cntl_create
 	cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node
 	(
 	  blx_gemm_packb,  // pack the right-hand operand
-	  bli_packm_blk_var1,
+	  packb_fp,
 	  BLIS_KR,
 	  BLIS_NR,
 	  FALSE,   // do NOT invert diagonal
--- a/sandbox/ref99/vars/blx_gemm_ker_var2rr.c
+++ b/sandbox/ref99/vars/blx_gemm_ker_var2rr.c
@@ -59,14 +59,14 @@ typedef void (*gemm_fp)
 // Function pointer array for datatype-specific functions.
 static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
 {
-    PASTECH2(blx_,s,gemm_ker_var2),
-    PASTECH2(blx_,c,gemm_ker_var2),
-    PASTECH2(blx_,d,gemm_ker_var2),
-    PASTECH2(blx_,z,gemm_ker_var2)
+    PASTECH2(blx_,s,gemm_ker_var2rr),
+    PASTECH2(blx_,c,gemm_ker_var2rr),
+    PASTECH2(blx_,d,gemm_ker_var2rr),
+    PASTECH2(blx_,z,gemm_ker_var2rr)
 };


-void blx_gemm_ker_var2
+void blx_gemm_ker_var2rr
     (
       obj_t*  a,
       obj_t*  b,
@@ -272,8 +272,8 @@ void PASTECH2(blx_,ch,varname) \
 	dim_t jr_inc,   ir_inc; \
 \
 	/* Determine the thread range and increment for each thrinfo_t node. */ \
-	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
-	bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+	bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
 \
 	/* Loop over the n dimension (NR columns at a time). */ \
 	for ( j = jr_start; j < jr_end; j += jr_inc ) \
@@ -302,11 +302,11 @@ void PASTECH2(blx_,ch,varname) \
 \
 			/* Compute the addresses of the next panels of A and B. */ \
 			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
-			if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
+			if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \
 			{ \
 				a2 = a_cast; \
 				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
-				if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
+				if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \
 					b2 = b_cast; \
 			} \
 \
@@ -363,11 +363,11 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c,
 }

 #if 0
-GENTFUNC( float,    s, gemm_ker_var2 )
-GENTFUNC( double,   d, gemm_ker_var2 )
-GENTFUNC( scomplex, c, gemm_ker_var2 )
-GENTFUNC( dcomplex, z, gemm_ker_var2 )
+GENTFUNC( float,    s, gemm_ker_var2rr )
+GENTFUNC( double,   d, gemm_ker_var2rr )
+GENTFUNC( scomplex, c, gemm_ker_var2rr )
+GENTFUNC( dcomplex, z, gemm_ker_var2rr )
 #else
-INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
+INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr )
 #endif

--- a/sandbox/ref99/vars/blx_gemm_ker_var2sl.c
+++ b/sandbox/ref99/vars/blx_gemm_ker_var2sl.c
@@ -0,0 +1,373 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "blix.h"
+
+// Function pointer type for datatype-specific functions.
+typedef void (*gemm_fp)
+     (
+       pack_t  schema_a,
+       pack_t  schema_b,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       void*   alpha,
+       void*   a, inc_t cs_a, inc_t is_a,
+                  dim_t pd_a, inc_t ps_a,
+       void*   b, inc_t rs_b, inc_t is_b,
+                  dim_t pd_b, inc_t ps_b,
+       void*   beta,
+       void*   c, inc_t rs_c, inc_t cs_c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       thrinfo_t* thread
+     );
+
+// Function pointer array for datatype-specific functions.
+static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
+{
+    PASTECH2(blx_,s,gemm_ker_var2sl),
+    PASTECH2(blx_,c,gemm_ker_var2sl),
+    PASTECH2(blx_,d,gemm_ker_var2sl),
+    PASTECH2(blx_,z,gemm_ker_var2sl)
+};
+
+
+void blx_gemm_ker_var2sl
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+	num_t     dt_exec   = bli_obj_exec_dt( c );
+
+	pack_t    schema_a  = bli_obj_pack_schema( a );
+	pack_t    schema_b  = bli_obj_pack_schema( b );
+
+	dim_t     m         = bli_obj_length( c );
+	dim_t     n         = bli_obj_width( c );
+	dim_t     k         = bli_obj_width( a );
+
+	void*     buf_a     = bli_obj_buffer_at_off( a );
+	inc_t     cs_a      = bli_obj_col_stride( a );
+	inc_t     is_a      = bli_obj_imag_stride( a );
+	dim_t     pd_a      = bli_obj_panel_dim( a );
+	inc_t     ps_a      = bli_obj_panel_stride( a );
+
+	void*     buf_b     = bli_obj_buffer_at_off( b );
+	inc_t     rs_b      = bli_obj_row_stride( b );
+	inc_t     is_b      = bli_obj_imag_stride( b );
+	dim_t     pd_b      = bli_obj_panel_dim( b );
+	inc_t     ps_b      = bli_obj_panel_stride( b );
+
+	void*     buf_c     = bli_obj_buffer_at_off( c );
+	inc_t     rs_c      = bli_obj_row_stride( c );
+	inc_t     cs_c      = bli_obj_col_stride( c );
+
+	obj_t     scalar_a;
+	obj_t     scalar_b;
+
+	void*     buf_alpha;
+	void*     buf_beta;
+
+	gemm_fp   f;
+
+	// Detach and multiply the scalars attached to A and B.
+	bli_obj_scalar_detach( a, &scalar_a );
+	bli_obj_scalar_detach( b, &scalar_b );
+	bli_mulsc( &scalar_a, &scalar_b );
+
+	// Grab the addresses of the internal scalar buffers for the scalar
+	// merged above and the scalar attached to C.
+	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
+	buf_beta  = bli_obj_internal_scalar_buffer( c );
+
+	// Index into the type combination array to extract the correct
+	// function pointer.
+	f = ftypes[dt_exec];
+
+	// Invoke the function.
+	f( schema_a,
+	   schema_b,
+	   m,
+	   n,
+	   k,
+	   buf_alpha,
+	   buf_a, cs_a, is_a,
+	          pd_a, ps_a,
+	   buf_b, rs_b, is_b,
+	          pd_b, ps_b,
+	   buf_beta,
+	   buf_c, rs_c, cs_c,
+	   cntx,
+	   rntm,
+	   thread );
+}
+
+
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, varname ) \
+\
+void PASTECH2(blx_,ch,varname) \
+     ( \
+       pack_t  schema_a, \
+       pack_t  schema_b, \
+       dim_t   m, \
+       dim_t   n, \
+       dim_t   k, \
+       void*   alpha, \
+       void*   a, inc_t cs_a, inc_t is_a, \
+                  dim_t pd_a, inc_t ps_a, \
+       void*   b, inc_t rs_b, inc_t is_b, \
+                  dim_t pd_b, inc_t ps_b, \
+       void*   beta, \
+       void*   c, inc_t rs_c, inc_t cs_c, \
+       cntx_t* cntx, \
+       rntm_t* rntm, \
+       thrinfo_t* thread  \
+     ) \
+{ \
+	const num_t     dt         = PASTEMAC(ch,type); \
+\
+	/* Alias some constants to simpler names. */ \
+	const dim_t     MR         = pd_a; \
+	const dim_t     NR         = pd_b; \
+	/*const dim_t     PACKMR     = cs_a;*/ \
+	/*const dim_t     PACKNR     = rs_b;*/ \
+\
+	/* Query the context for the micro-kernel address and cast it to its
+	   function pointer type. */ \
+	PASTECH(ch,gemm_ukr_ft) \
+	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
+\
+	/* Temporary C buffer for edge cases. Note that the strides of this
+	   temporary buffer are set so that they match the storage of the
+	   original C matrix. For example, if C is column-stored, ct will be
+	   column-stored as well. */ \
+	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
+	                    / sizeof( ctype ) ] \
+	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
+	const bool_t    col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
+	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
+\
+	ctype* restrict zero       = PASTEMAC(ch,0); \
+	ctype* restrict a_cast     = a; \
+	ctype* restrict b_cast     = b; \
+	ctype* restrict c_cast     = c; \
+	ctype* restrict alpha_cast = alpha; \
+	ctype* restrict beta_cast  = beta; \
+	ctype* restrict b1; \
+	ctype* restrict c1; \
+\
+	dim_t           m_iter, m_left; \
+	dim_t           n_iter, n_left; \
+	dim_t           i, j; \
+	dim_t           m_cur; \
+	dim_t           n_cur; \
+	inc_t           rstep_a; \
+	inc_t           cstep_b; \
+	inc_t           rstep_c, cstep_c; \
+	auxinfo_t       aux; \
+\
+	/*
+	   Assumptions/assertions:
+	     rs_a == 1
+	     cs_a == PACKMR
+	     pd_a == MR
+	     ps_a == stride to next micro-panel of A
+	     rs_b == PACKNR
+	     cs_b == 1
+	     pd_b == NR
+	     ps_b == stride to next micro-panel of B
+	     rs_c == (no assumptions)
+	     cs_c == (no assumptions)
+	*/ \
+\
+	/* If any dimension is zero, return immediately. */ \
+	if ( bli_zero_dim3( m, n, k ) ) return; \
+\
+	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
+	PASTEMAC(ch,set0s_mxn)( MR, NR, \
+	                        ct, rs_ct, cs_ct ); \
+\
+	/* Compute number of primary and leftover components of the m and n
+	   dimensions. */ \
+	n_iter = n / NR; \
+	n_left = n % NR; \
+\
+	m_iter = m / MR; \
+	m_left = m % MR; \
+\
+	if ( n_left ) ++n_iter; \
+	if ( m_left ) ++m_iter; \
+\
+	/* Determine some increments used to step through A, B, and C. */ \
+	rstep_a = ps_a; \
+\
+	cstep_b = ps_b; \
+\
+	rstep_c = rs_c * MR; \
+	cstep_c = cs_c * NR; \
+\
+	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_schema_a( schema_a, &aux ); \
+	bli_auxinfo_set_schema_b( schema_b, &aux ); \
+\
+	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
+	bli_auxinfo_set_is_a( is_a, &aux ); \
+	bli_auxinfo_set_is_b( is_b, &aux ); \
+\
+	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+	   loop around the microkernel. Here we query the thrinfo_t node for the
+	   1st (ir) loop around the microkernel. */ \
+	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
+\
+	/* Query the number of threads and thread ids for each loop. */ \
+	dim_t jr_nt  = bli_thread_n_way( thread ); \
+	dim_t jr_tid = bli_thread_work_id( thread ); \
+	dim_t ir_nt  = bli_thread_n_way( caucus ); \
+	dim_t ir_tid = bli_thread_work_id( caucus ); \
+\
+	dim_t jr_start, jr_end; \
+	dim_t ir_start, ir_end; \
+	dim_t jr_inc,   ir_inc; \
+\
+	/* Determine the thread range and increment for each thrinfo_t node. */ \
+	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
+	bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
+\
+	/* Loop over the n dimension (NR columns at a time). */ \
+	for ( j = jr_start; j < jr_end; j += jr_inc ) \
+	{ \
+		ctype* restrict a1; \
+		ctype* restrict c11; \
+		ctype* restrict b2; \
+\
+		b1 = b_cast + j * cstep_b; \
+		c1 = c_cast + j * cstep_c; \
+\
+		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
+\
+		/* Initialize our next panel of B to be the current panel of B. */ \
+		b2 = b1; \
+\
+		/* Loop over the m dimension (MR rows at a time). */ \
+		for ( i = ir_start; i < ir_end; i += ir_inc ) \
+		{ \
+			ctype* restrict a2; \
+\
+			a1  = a_cast + i * rstep_a; \
+			c11 = c1     + i * rstep_c; \
+\
+			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
+\
+			/* Compute the addresses of the next panels of A and B. */ \
+			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
+			if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
+			{ \
+				a2 = a_cast; \
+				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
+				if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
+					b2 = b_cast; \
+			} \
+\
+			/* Save addresses of next panels of A and B to the auxinfo_t
+			   object. */ \
+			bli_auxinfo_set_next_a( a2, &aux ); \
+			bli_auxinfo_set_next_b( b2, &aux ); \
+\
+			/* Handle interior and edge cases separately. */ \
+			if ( m_cur == MR && n_cur == NR ) \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  beta_cast, \
+				  c11, rs_c, cs_c, \
+				  &aux, \
+				  cntx  \
+				); \
+			} \
+			else \
+			{ \
+				/* Invoke the gemm micro-kernel. */ \
+				gemm_ukr \
+				( \
+				  k, \
+				  alpha_cast, \
+				  a1, \
+				  b1, \
+				  zero, \
+				  ct, rs_ct, cs_ct, \
+				  &aux, \
+				  cntx  \
+				); \
+\
+				/* Scale the bottom edge of C and add the result from above. */ \
+				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
+				                        ct,  rs_ct, cs_ct, \
+				                        beta_cast, \
+				                        c11, rs_c,  cs_c ); \
+			} \
+		} \
+	} \
+\
+/*
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
+PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
+*/ \
+}
+
+#if 0
+GENTFUNC( float,    s, gemm_ker_var2sl )
+GENTFUNC( double,   d, gemm_ker_var2sl )
+GENTFUNC( scomplex, c, gemm_ker_var2sl )
+GENTFUNC( dcomplex, z, gemm_ker_var2sl )
+#else
+INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )
+#endif
+
--- a/sandbox/ref99/vars/blx_gemm_var.h
+++ b/sandbox/ref99/vars/blx_gemm_var.h
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -57,7 +58,8 @@ GENPROT( gemm_blk_var3 )
 GENPROT( gemm_packa )
 GENPROT( gemm_packb )

-GENPROT( gemm_ker_var2 )
+GENPROT( gemm_ker_var2sl )
+GENPROT( gemm_ker_var2rr )

 //
 // Prototype BLAS-like interfaces with void pointer operands.
@@ -85,5 +87,6 @@ void PASTECH2(blx_,ch,varname) \
       thrinfo_t* thread  \
     );

-INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
+INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
+INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )