Reorganized thread auto-factorization logic.

Details: - Reorganized logic of bli_thread_partition_2x2() so that the primary guts were factored out into "fast" and "slow" variants. Then added logic to the "fast" variant that allows for closer-to-optimal thread factorizations in some situations where there is at least one factor of 2. - Changed BLIS_THREAD_RATIO_M from 2 to 1 in bli_kernel_macro_defs.h and added comments to that file describing BLIS_THREAD_RATIO_? and BLIS_THREAD_MAX_?R. - In bli_family_zen.h and bli_family_zen2.h, preprocessed out several macros not used in vanilla BLIS and removed the unused macro BLIS_ENABLE_ZEN_BLOCK_SIZES from the former file. - Disabled AMD's small matrix handling entry points in bli_syrk_front.c and bli_trsm_front.c. (These branches of small matrix handling have not been reviewed by vanilla BLIS developers.) - Added commented-out printf() calls to bli_rntm.c. - Whitespace changes to bli_thread.c.
2026-04-20 07:38:53 +00:00 · 2020-12-01 19:51:27 +00:00
parent 64856ea5a6
commit 11dfc176a3
8 changed files with 416 additions and 105 deletions
--- a/config/zen/bli_family_zen.h
+++ b/config/zen/bli_family_zen.h
@@ -33,20 +33,18 @@

 */

-//#ifndef BLIS_FAMILY_H
-//#define BLIS_FAMILY_H
-
 // By default, it is effective to parallelize the outer loops.
 // Setting these macros to 1 will force JR and IR inner loops
 // to be not paralleized.
 #define BLIS_THREAD_MAX_IR      1
 #define BLIS_THREAD_MAX_JR      1

-#define BLIS_ENABLE_ZEN_BLOCK_SIZES
+
+// Vanilla BLIS disables AMD's small matrix handling by default.
+#if 0
 #define BLIS_ENABLE_SMALL_MATRIX
 #define BLIS_ENABLE_SMALL_MATRIX_TRSM

-
 // This will select the threshold below which small matrix code will be called.
 #define BLIS_SMALL_MATRIX_THRES        700
 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160
@@ -64,6 +62,8 @@
 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90

 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
+#endif
+

 #if 0
 // Allow the sup implementation to combine some small edge case iterations in
@@ -76,6 +76,3 @@
 #define BLIS_ENABLE_SUP_NR_EXT 0
 #endif

-
-//#endif
-
--- a/config/zen2/bli_family_zen2.h
+++ b/config/zen2/bli_family_zen2.h
@@ -33,20 +33,17 @@

 */

-#ifndef BLI_FAMILY_ZEN2_
-#define BLI_FAMILY_ZEN2_
-
 // By default, it is effective to parallelize the outer loops.
 // Setting these macros to 1 will force JR and IR inner loops
 // to be not paralleized.
 #define BLIS_THREAD_MAX_IR      1
 #define BLIS_THREAD_MAX_JR      1

-
+// Vanilla BLIS disables AMD's small matrix handling by default.
+#if 0
 #define BLIS_ENABLE_SMALL_MATRIX
 #define BLIS_ENABLE_SMALL_MATRIX_TRSM

-
 // This will select the threshold below which small matrix code will be called.
 #define BLIS_SMALL_MATRIX_THRES        700
 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160
@@ -85,6 +82,5 @@
 // When running HPL with pure MPI without DGEMM threading (Single-threaded
 // BLIS), defining this macro as 1 yields better performance.
 #define AOCL_BLIS_MULTIINSTANCE   0
-
 #endif

--- a/frame/3/syrk/bli_syrk_front.c
+++ b/frame/3/syrk/bli_syrk_front.c
@@ -61,10 +61,12 @@ void bli_syrk_front
 	bli_obj_alias_to( a, &at_local );
 	bli_obj_induce_trans( &at_local );

+#if 0
 #ifdef BLIS_ENABLE_SMALL_MATRIX
 	gint_t status = bli_syrk_small( alpha, &a_local, &at_local, beta, &c_local,
 	                                cntx, cntl );
 	if ( status == BLIS_SUCCESS ) return;
+#endif
 #endif

 	// Check parameters.
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -52,9 +52,11 @@ void bli_trsm_front
 	obj_t   b_local;
 	obj_t   c_local;

+#if 0
 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
 	gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl );
 	if ( status == BLIS_SUCCESS ) return;
+#endif
 #endif

 	// Check parameters.
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -236,9 +236,13 @@ void bli_rntm_set_ways_from_rntm

 		pc = 1;

+		//printf( "m n = %d %d  BLIS_THREAD_RATIO_M _N = %d %d\n", (int)m, (int)n, (int)BLIS_THREAD_RATIO_M, (int)BLIS_THREAD_RATIO_N );
+
 		bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
 		                              n*BLIS_THREAD_RATIO_N, &ic, &jc );

+		//printf( "jc ic = %d %d\n", (int)jc, (int)ic );
+
 		for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- )
 		{
 			if ( ic % ir == 0 ) { ic /= ir; break; }
--- a/frame/include/bli_kernel_macro_defs.h
+++ b/frame/include/bli_kernel_macro_defs.h
@@ -40,14 +40,21 @@

 // -- Conventional (large code path) values --

+// These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n
+// dimensions for the purposes of factorizing the total number of threads into
+// ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these
+// macros are used.
 #ifndef BLIS_THREAD_RATIO_M
-#define BLIS_THREAD_RATIO_M     2
+#define BLIS_THREAD_RATIO_M     1
 #endif

 #ifndef BLIS_THREAD_RATIO_N
 #define BLIS_THREAD_RATIO_N     1
 #endif

+// These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of
+// parallelism allowed when performing automatic factorization. See bli_rntm.c
+// to see how these macros are used.
 #ifndef BLIS_THREAD_MAX_IR
 #define BLIS_THREAD_MAX_IR      1
 #endif
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -108,7 +108,7 @@ void bli_thread_range_sub
 	//         13     >0    f        1    3      4     3     3     3+
 	//         14     >0    f        2    2      4     4     3     3+
 	//         15     >0    f        3    1      4     4     4     3+
-	//         15     =0    f        3    1      4     4     4     3 
+	//         15     =0    f        3    1      4     4     4     3
 	//
 	//         12     =0    t        4    0      3     3     3     3
 	//         12     >0    t        4    0      3+    3     3     3
@@ -968,76 +968,76 @@ siz_t bli_thread_range_weighted_b2t

 void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors )
 {
-    factors->n = n;
-    factors->sqrt_n = ( dim_t )sqrt( ( double )n );
-    factors->f = 2;
+	factors->n = n;
+	factors->sqrt_n = ( dim_t )sqrt( ( double )n );
+	factors->f = 2;
 }

 dim_t bli_next_prime_factor( bli_prime_factors_t* factors )
 {
-    // Return the prime factorization of the original number n one-by-one.
-    // Return 1 after all factors have been exhausted.
+	// Return the prime factorization of the original number n one-by-one.
+	// Return 1 after all factors have been exhausted.

-    // Looping over possible factors in increasing order assures we will
-    // only return prime factors (a la the Sieve of Eratosthenes).
-    while ( factors->f <= factors->sqrt_n )
-    {
-        // Special cases for factors 2-7 handle all numbers not divisible by 11
-        // or another larger prime. The slower loop version is used after that.
-        // If you use a number of threads with large prime factors you get
-        // what you deserve.
-        if ( factors->f == 2 )
-        {
-            if ( factors->n % 2 == 0 )
-            {
-                factors->n /= 2;
-                return 2;
-            }
-            factors->f = 3;
-        }
-        else if ( factors->f == 3 )
-        {
-            if ( factors->n % 3 == 0 )
-            {
-                factors->n /= 3;
-                return 3;
-            }
-            factors->f = 5;
-        }
-        else if ( factors->f == 5 )
-        {
-            if ( factors->n % 5 == 0 )
-            {
-                factors->n /= 5;
-                return 5;
-            }
-            factors->f = 7;
-        }
-        else if ( factors->f == 7 )
-        {
-            if ( factors->n % 7 == 0 )
-            {
-                factors->n /= 7;
-                return 7;
-            }
-            factors->f = 11;
-        }
-        else
-        {
-            if ( factors->n % factors->f == 0 )
-            {
-                factors->n /= factors->f;
-                return factors->f;
-            }
-            factors->f++;
-        }
-    }
+	// Looping over possible factors in increasing order assures we will
+	// only return prime factors (a la the Sieve of Eratosthenes).
+	while ( factors->f <= factors->sqrt_n )
+	{
+		// Special cases for factors 2-7 handle all numbers not divisible by 11
+		// or another larger prime. The slower loop version is used after that.
+		// If you use a number of threads with large prime factors you get
+		// what you deserve.
+		if ( factors->f == 2 )
+		{
+			if ( factors->n % 2 == 0 )
+			{
+				factors->n /= 2;
+				return 2;
+			}
+			factors->f = 3;
+		}
+		else if ( factors->f == 3 )
+		{
+			if ( factors->n % 3 == 0 )
+			{
+				factors->n /= 3;
+				return 3;
+			}
+			factors->f = 5;
+		}
+		else if ( factors->f == 5 )
+		{
+			if ( factors->n % 5 == 0 )
+			{
+				factors->n /= 5;
+				return 5;
+			}
+			factors->f = 7;
+		}
+		else if ( factors->f == 7 )
+		{
+			if ( factors->n % 7 == 0 )
+			{
+				factors->n /= 7;
+				return 7;
+			}
+			factors->f = 11;
+		}
+		else
+		{
+			if ( factors->n % factors->f == 0 )
+			{
+				factors->n /= factors->f;
+				return factors->f;
+			}
+			factors->f++;
+		}
+	}

-    // To get here we must be out of prime factors, leaving only n (if it is
-    // prime) or an endless string of 1s.
-    dim_t tmp = factors->n;
-    factors->n = 1;
-    return tmp;
+	// To get here we must be out of prime factors, leaving only n (if it is
+	// prime) or an endless string of 1s.
+	dim_t tmp = factors->n;
+	factors->n = 1;
+	return tmp;
 }

 bool bli_is_prime( dim_t n )
@@ -1052,10 +1052,6 @@ bool bli_is_prime( dim_t n )
 	else          return FALSE;
 }

-#if 0
-#include "limits.h"
-#endif
-
 void bli_thread_partition_2x2
     (
       dim_t           n_thread,
@@ -1065,6 +1061,240 @@ void bli_thread_partition_2x2
       dim_t* restrict nt2
     )
 {
+	// Partition a number of threads into two factors nt1 and nt2 such that
+	// nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a
+	// slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|).
+
+	// Return early small prime numbers of threads.
+	if ( n_thread < 4 )
+	{
+		*nt1 = ( work1 >= work2 ? n_thread : 1 );
+		*nt2 = ( work1 <  work2 ? n_thread : 1 );
+
+		return;
+	}
+
+#if 1
+	bli_thread_partition_2x2_fast( n_thread, work1, work2, nt1, nt2 );
+#else
+	bli_thread_partition_2x2_slow( n_thread, work1, work2, nt1, nt2 );
+#endif
+}
+
+//#define PRINT_FACTORS
+
+void bli_thread_partition_2x2_fast
+     (
+       dim_t           n_thread,
+       dim_t           work1,
+       dim_t           work2,
+       dim_t* restrict nt1,
+       dim_t* restrict nt2
+     )
+{
+	// Compute with these local variables until the end of the function, at
+	// which time we will save the values back to nt1 and nt2.
+	dim_t tn1 = 1;
+	dim_t tn2 = 1;
+
+	// Both algorithms need the prime factorization of n_thread.
+	bli_prime_factors_t factors;
+	bli_prime_factorization( n_thread, &factors );
+
+	// Fast algorithm: assign prime factors in increasing order to whichever
+	// partition has more work to do. The work is divided by the number of
+	// threads assigned at each iteration. This algorithm is sub-optimal in
+	// some cases. We attempt to mitigate the cases that involve at least one
+	// factor of 2. For example, in the partitioning of 12 with equal work
+	// this algorithm tentatively finds 6x2. This factorization involves a
+	// factor of 2 that can be reallocated, allowing us to convert it to the
+	// optimal solution of 4x3. But some cases cannot be corrected this way
+	// because they do not contain a factor of 2. For example, this algorithm
+	// factors 105 (with equal work) into 21x5 whereas 7x15 would be optimal.
+
+	#ifdef PRINT_FACTORS
+	printf( "w1 w2 = %d %d (initial)\n", (int)work1, (int)work2 );
+	#endif
+
+	dim_t f;
+	while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
+	{
+		#ifdef PRINT_FACTORS
+		printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d ... f = %d\n",
+		        (int)work1, (int)work2, (int)tn1, (int)tn2, (int)f );
+		#endif
+
+		if ( work1 > work2 ) { work1 /= f; tn1 *= f; }
+		else                 { work2 /= f; tn2 *= f; }
+	}
+
+	#ifdef PRINT_FACTORS
+	printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d\n",
+	        (int)work1, (int)work2, (int)tn1, (int)tn2 );
+	#endif
+
+	// Sometimes the last factor applied is prime. For example, on a square
+	// matrix, we tentatively arrive (from the logic above) at:
+	// - a 2x6 factorization when given 12 ways of parallelism
+	// - a 2x10 factorization when given 20 ways of parallelism
+	// - a 2x14 factorization when given 28 ways of parallelism
+	// These factorizations are suboptimal under the assumption that we want
+	// the parallelism to be as balanced as possible. Below, we make a final
+	// attempt at rebalancing nt1 and nt2 by checking to see if the gap between
+	// work1 and work2 is narrower if we reallocate a factor of 2.
+	if ( work1 > work2 )
+	{
+		// Example: nt = 12
+		//          w1 w2 (initial)   = 3600 3600; nt1 nt2 =  1 1
+		//          w1 w2 (tentative) = 1800  600; nt1 nt2 =  2 6
+		//          w1 w2 (ideal)     =  900 1200; nt1 nt2 =  4 3
+		if ( tn2 % 2 == 0 )
+		{
+			dim_t diff     =          work1   - work2;
+			dim_t diff_mod = bli_abs( work1/2 - work2*2 );
+
+			if ( diff_mod < diff ) { tn1 *= 2; tn2 /= 2; }
+		}
+	}
+	else if ( work1 < work2 )
+	{
+		// Example: nt = 40
+		//          w1 w2 (initial)   = 3600 3600; nt1 nt2 =  1 1
+		//          w1 w2 (tentative) =  360  900; nt1 nt2 = 10 4
+		//          w1 w2 (ideal)     =  720  450; nt1 nt2 =  5 8
+		if ( tn1 % 2 == 0 )
+		{
+			dim_t diff     =          work2   - work1;
+			dim_t diff_mod = bli_abs( work2/2 - work1*2 );
+
+			if ( diff_mod < diff ) { tn1 /= 2; tn2 *= 2; }
+		}
+	}
+
+	#ifdef PRINT_FACTORS
+	printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d (final)\n",
+	        (int)work1, (int)work2, (int)tn1, (int)tn2 );
+	#endif
+
+	// Save the final result.
+	*nt1 = tn1;
+	*nt2 = tn2;
+}
+
+#include "limits.h"
+
+void bli_thread_partition_2x2_slow
+     (
+       dim_t           n_thread,
+       dim_t           work1,
+       dim_t           work2,
+       dim_t* restrict nt1,
+       dim_t* restrict nt2
+     )
+{
+	// Slow algorithm: exhaustively constructs all factor pairs of n_thread and
+	// chooses the best one.
+
+	// Compute with these local variables until the end of the function, at
+	// which time we will save the values back to nt1 and nt2.
+	dim_t tn1 = 1;
+	dim_t tn2 = 1;
+
+	// Both algorithms need the prime factorization of n_thread.
+	bli_prime_factors_t factors;
+	bli_prime_factorization( n_thread, &factors );
+
+	// Eight prime factors handles n_thread up to 223092870.
+	dim_t fact[8];
+	dim_t mult[8];
+
+	// There is always at least one prime factor, so use if for initialization.
+	dim_t nfact = 1;
+	fact[0] = bli_next_prime_factor( &factors );
+	mult[0] = 1;
+
+	// Collect the remaining prime factors, accounting for multiplicity of
+	// repeated factors.
+	dim_t f;
+	while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
+	{
+		if ( f == fact[nfact-1] )
+		{
+			mult[nfact-1]++;
+		}
+		else
+		{
+			nfact++;
+			fact[nfact-1] = f;
+			mult[nfact-1] = 1;
+		}
+	}
+
+	// Now loop over all factor pairs. A single factor pair is denoted by how
+	// many of each prime factor are included in the first factor (ntaken).
+	dim_t ntake[8] = {0};
+	dim_t min_diff = INT_MAX;
+
+	// Loop over how many prime factors to assign to the first factor in the
+	// pair, for each prime factor. The total number of iterations is
+	// \Prod_{i=0}^{nfact-1} mult[i].
+	bool done = FALSE;
+	while ( !done )
+	{
+		dim_t x = 1;
+		dim_t y = 1;
+
+		// Form the factors by integer exponentiation and accumulation.
+		for ( dim_t i = 0 ; i < nfact ; i++ )
+		{
+			x *= bli_ipow( fact[i], ntake[i] );
+			y *= bli_ipow( fact[i], mult[i]-ntake[i] );
+		}
+
+		// Check if this factor pair is optimal by checking
+		// |nt1*work2 - nt2*work1|.
+		dim_t diff = llabs( x*work2 - y*work1 );
+		if ( diff < min_diff )
+		{
+			min_diff = diff;
+			tn1 = x;
+			tn2 = y;
+		}
+
+		// Go to the next factor pair by doing an "odometer loop".
+		for ( dim_t i = 0 ; i < nfact ; i++ )
+		{
+			if ( ++ntake[i] > mult[i] )
+			{
+				ntake[i] = 0;
+				if ( i == nfact-1 ) done = TRUE;
+				else continue;
+			}
+			break;
+		}
+	}
+
+	// Save the final result.
+	*nt1 = tn1;
+	*nt2 = tn2;
+}
+
+#if 0
+void bli_thread_partition_2x2_orig
+     (
+       dim_t           n_thread,
+       dim_t           work1,
+       dim_t           work2,
+       dim_t* restrict nt1,
+       dim_t* restrict nt2
+     )
+{
+	// Copy nt1 and nt2 to local variables and then compute with those local
+	// variables until the end of the function, at which time we will save the
+	// values back to nt1 and nt2.
+	dim_t tn1; // = *nt1;
+	dim_t tn2; // = *nt2;
+
    // Partition a number of threads into two factors nt1 and nt2 such that
    // nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a
    // slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|).
@@ -1072,43 +1302,94 @@ void bli_thread_partition_2x2
    // Return early small prime numbers of threads.
    if ( n_thread < 4 )
    {
-        *nt1 = ( work1 >= work2 ? n_thread : 1 );
-        *nt2 = ( work1 <  work2 ? n_thread : 1 );
+        tn1 = ( work1 >= work2 ? n_thread : 1 );
+        tn2 = ( work1 <  work2 ? n_thread : 1 );

 		return;
    }

-    *nt1 = 1;
-    *nt2 = 1;
+    tn1 = 1;
+    tn2 = 1;

    // Both algorithms need the prime factorization of n_thread.
    bli_prime_factors_t factors;
    bli_prime_factorization( n_thread, &factors );

-    #if 1
+#if 1

    // Fast algorithm: assign prime factors in increasing order to whichever
    // partition has more work to do. The work is divided by the number of
-    // threads assigned at each iteration. This algorithm is sub-optimal,
-    // for example in the partitioning of 12 with equal work (optimal solution
-    // is 4x3, this algorithm finds 6x2).
+    // threads assigned at each iteration. This algorithm is sub-optimal in
+	// some cases. We attempt to mitigate the cases that involve at least one
+	// factor of 2. For example, in the partitioning of 12 with equal work
+	// this algorithm tentatively finds 6x2. This factorization involves a
+	// factor of 2 that can be reallocated, allowing us to convert it to the
+	// optimal solution of 4x3. But some cases cannot be corrected this way
+	// because they do not contain a factor of 2. For example, this algorithm
+	// factors 105 (with equal work) into 21x5 whereas 7x15 would be optimal.
+
+	//printf( "w1 w2 = %d %d (initial)\n", (int)work1, (int)work2 );

    dim_t f;
    while ( ( f = bli_next_prime_factor( &factors ) ) > 1 )
    {
+		//printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d ... f = %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2, (int)f );
+
        if ( work1 > work2 )
        {
            work1 /= f;
-            *nt1 *= f;
+            tn1 *= f;
        }
        else
        {
            work2 /= f;
-            *nt2 *= f;
+            tn2 *= f;
        }
    }

-    #else
+	//printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2 );
+
+	// Sometimes the last factor applied is prime. For example, on a square
+	// matrix, we tentatively arrive (from the logic above) at:
+	// - a 2x6 factorization when given 12 ways of parallelism
+	// - a 2x10 factorization when given 20 ways of parallelism
+	// - a 2x14 factorization when given 28 ways of parallelism
+	// These factorizations are suboptimal under the assumption that we want
+	// the parallelism to be as balanced as possible. Below, we make a final
+	// attempt at rebalancing nt1 and nt2 by checking to see if the gap between
+	// work1 and work2 is narrower if we reallocate a factor of 2.
+	if ( work1 > work2 )
+	{
+		// Example: nt = 12
+		//          w1 w2 (initial)   = 3600 3600; nt1 nt2 =  1 1
+		//          w1 w2 (tentative) = 1800  600; nt1 nt2 =  2 6
+		//          w1 w2 (ideal)     =  900 1200; nt1 nt2 =  4 3
+		if ( tn2 % 2 == 0 )
+		{
+			dim_t diff     =          work1   - work2;
+			dim_t diff_mod = bli_abs( work1/2 - work2*2 );
+
+			if ( diff_mod < diff ) { tn1 *= 2; tn2 /= 2; }
+		}
+	}
+	else if ( work1 < work2 )
+	{
+		// Example: nt = 40
+		//          w1 w2 (initial)   = 3600 3600; nt1 nt2 =  1 1
+		//          w1 w2 (tentative) =  360  900; nt1 nt2 = 10 4
+		//          w1 w2 (ideal)     =  720  450; nt1 nt2 =  5 8
+		if ( tn1 % 2 == 0 )
+		{
+			dim_t diff     =          work2   - work1;
+			dim_t diff_mod = bli_abs( work2/2 - work1*2 );
+
+			if ( diff_mod < diff ) { tn1 /= 2; tn2 *= 2; }
+		}
+	}
+
+	//printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d (final)\n", (int)work1, (int)work2, (int)tn1, (int)tn2 );
+
+#else

    // Slow algorithm: exhaustively constructs all factor pairs of n_thread and
    // chooses the best one.
@@ -1166,8 +1447,8 @@ void bli_thread_partition_2x2
        if ( diff < min_diff )
        {
            min_diff = diff;
-            *nt1 = x;
-            *nt2 = y;
+            tn1 = x;
+            tn2 = y;
        }

        // Go to the next factor pair by doing an "odometer loop".
@@ -1183,8 +1464,14 @@ void bli_thread_partition_2x2
        }
    }

-    #endif
+#endif
+
+
+	// Save the final result.
+	*nt1 = tn1;
+	*nt2 = tn2;
 }
+#endif

 // -----------------------------------------------------------------------------

@@ -1206,15 +1493,15 @@ dim_t bli_lcm( dim_t x, dim_t y)

 dim_t bli_ipow( dim_t base, dim_t power )
 {
-    dim_t p = 1;
+	dim_t p = 1;

-    for ( dim_t mask = 0x1 ; mask <= power ; mask <<= 1 )
-    {
-        if ( power & mask ) p *= base;
-        base *= base;
-    }
+	for ( dim_t mask = 0x1 ; mask <= power ; mask <<= 1 )
+	{
+		if ( power & mask ) p *= base;
+		base *= base;
+	}

-    return p;
+	return p;
 }

 // -----------------------------------------------------------------------------
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -174,6 +174,22 @@ void bli_thread_partition_2x2
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );
+void bli_thread_partition_2x2_slow
+     (
+       dim_t           n_thread,
+       dim_t           work1,
+       dim_t           work2,
+       dim_t* restrict nt1,
+       dim_t* restrict nt2
+     );
+void bli_thread_partition_2x2_fast
+     (
+       dim_t           n_thread,
+       dim_t           work1,
+       dim_t           work2,
+       dim_t* restrict nt1,
+       dim_t* restrict nt2
+     );

 // -----------------------------------------------------------------------------