diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index f46915474..535cc4710 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -44,7 +44,7 @@ void bli_gemm_blk_var1f( obj_t* a, obj_t b_pack_s; obj_t a1_pack_s, c1_pack_s; - obj_t a1, c1; + obj_t a1, c1; obj_t* a1_pack = NULL; obj_t* b_pack = NULL; obj_t* c1_pack = NULL; @@ -83,9 +83,9 @@ void bli_gemm_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; - bli_get_range( thread, 0, m_trans, - bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), - &start, &end ); + bli_get_range_t2b( thread, 0, m_trans, + bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) @@ -130,7 +130,7 @@ void bli_gemm_blk_var1f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); - + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index b13f21778..f122f9a71 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -42,7 +42,7 @@ void bli_gemm_blk_var2f( obj_t* a, { obj_t a_pack_s; obj_t b1_pack_s, c1_pack_s; - + obj_t b1, c1; obj_t* a_pack = NULL; obj_t* b1_pack = NULL; @@ -82,9 +82,9 @@ void bli_gemm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range( thread, 0, n_trans, - bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ), - &start, &end ); + bli_get_range_l2r( thread, 0, n_trans, + bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) @@ -129,7 +129,7 @@ void bli_gemm_blk_var2f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); - + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). diff --git a/frame/3/gemm/ind/bli_gemm_blk_var4f.c b/frame/3/gemm/ind/bli_gemm_blk_var4f.c index 97065c634..d57c8c82f 100644 --- a/frame/3/gemm/ind/bli_gemm_blk_var4f.c +++ b/frame/3/gemm/ind/bli_gemm_blk_var4f.c @@ -52,7 +52,7 @@ void bli_gemm_blk_var4f( obj_t* a, obj_t b_pack_s; obj_t a1_pack_s, c1_pack_s; - obj_t a1, c1; + obj_t a1, c1; obj_t* a1_pack = NULL; obj_t* b_pack = NULL; obj_t* c1_pack = NULL; @@ -91,9 +91,9 @@ void bli_gemm_blk_var4f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *a ); dim_t start, end; - bli_get_range( thread, 0, m_trans, - bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), - &start, &end ); + bli_get_range_t2b( thread, 0, m_trans, + bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), + &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) @@ -140,7 +140,7 @@ void bli_gemm_blk_var4f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); - + thread_ibarrier( thread ); // Only apply beta within the first of three subproblems. 
@@ -167,7 +167,7 @@ void bli_gemm_blk_var4f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); - + thread_ibarrier( thread ); @@ -191,7 +191,7 @@ void bli_gemm_blk_var4f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); - + thread_ibarrier( thread ); diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index fb87d3102..b982de34d 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -52,7 +52,7 @@ void bli_herk_blk_var1f( obj_t* a, dim_t b_alg; dim_t m_trans; - if( thread_am_ochief( thread ) ) { + if( thread_am_ochief( thread ) ) { // Initialize object for packing A'. bli_obj_init_pack( &ah_pack_s ); bli_packm_init( ah, &ah_pack_s, @@ -61,9 +61,9 @@ void bli_herk_blk_var1f( obj_t* a, // Scale C by beta (if instructed). // Since scalm doesn't support multithreading yet, must be done by chief thread (ew) bli_scalm_int( &BLIS_ONE, - c, + c, cntl_sub_scalm( cntl ) ); - } + } ah_pack = thread_obroadcast( thread, &ah_pack_s ); // Initialize pack objects that are passed into packm_init() for A and C. @@ -82,9 +82,9 @@ void bli_herk_blk_var1f( obj_t* a, // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( *c ); dim_t start, end; - bli_get_range_weighted( thread, 0, m_trans, - bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), - bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted_t2b( thread, 0, m_trans, + bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), + bli_obj_root_uplo( *c ), &start, &end ); // Partition along the m dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 2b5f3f35f..b89842c40 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -90,9 +90,9 @@ void bli_herk_blk_var2f( obj_t* a, dim_t start, end; // Needs to be replaced with a weighted range because triangle - bli_get_range_weighted( thread, 0, n_trans, - bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), - bli_obj_is_lower( *c ), &start, &end ); + bli_get_range_weighted_l2r( thread, 0, n_trans, + bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), + bli_obj_root_uplo( *c ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index eda24e3fa..8095e0d52 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -94,9 +94,9 @@ void bli_trmm_blk_var1f( obj_t* a, bli_obj_width_after_trans( *a ); dim_t start, end; - bli_get_range_weighted( thread, offA, m_trans, - bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), - bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted_t2b( thread, offA, m_trans, + bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ), + bli_obj_root_uplo( *a ), &start, &end ); // Partition along the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 6758e1a32..33b79434b 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -82,9 +82,9 @@ void bli_trmm_blk_var2b( obj_t* a, // Query dimension in partitioning direction. 
n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range_weighted( thread, 0, n_trans, - bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ), - bli_obj_is_upper( *c ), &start, &end ); + bli_get_range_weighted_r2l( thread, 0, n_trans, + bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ), + bli_obj_root_uplo( *b ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index 3b6d780df..92f1ec0d2 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -82,9 +82,9 @@ void bli_trmm_blk_var2f( obj_t* a, // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; - bli_get_range_weighted( thread, 0, n_trans, - bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ), - bli_obj_is_lower( *c ), &start, &end ); + bli_get_range_weighted_l2r( thread, 0, n_trans, + bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ), + bli_obj_root_uplo( *b ), &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index f48d1481a..9cb3f9091 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -129,8 +129,9 @@ void bli_trmm3_front( side_t side, bli_obj_set_as_root( b_local ); bli_obj_set_as_root( c_local ); - - trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); + // Notice that, unlike trmm_r, there is no dependency in the jc loop + // for trmm3_r, so we can pass in FALSE for jc_dependency. + trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE ); dim_t n_threads = thread_num_threads( infos[0] ); // Invoke the internal back-end. 
diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index ca79804a1..a79a3033a 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -83,10 +83,10 @@ void bli_trsm_blk_var1b( obj_t* a, dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); - bli_get_range( thread, offA, m_trans, - //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ), - bli_info_get_default_mc( BLIS_TRSM, dt ), - &start, &end ); + bli_get_range_b2t( thread, offA, m_trans, + //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ), + bli_info_get_default_mc( BLIS_TRSM, dt ), + &start, &end ); // Partition along the remaining portion of the m dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index fe4ecb310..b24626332 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -82,10 +82,10 @@ void bli_trsm_blk_var1f( obj_t* a, dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); - bli_get_range( thread, offA, m_trans, - //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ), - bli_info_get_default_mc( BLIS_TRSM, dt ), - &start, &end ); + bli_get_range_t2b( thread, offA, m_trans, + //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ), + bli_info_get_default_mc( BLIS_TRSM, dt ), + &start, &end ); // Partition along the remaining portion of the m dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 82b2ea4c8..b53c5884f 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -84,12 +84,12 @@ void bli_trsm_blk_var2b( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); - bli_get_range( thread, 0, n_trans, - //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), - // bli_info_get_default_mr( BLIS_TRSM, dt ) ), - bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ), - bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ), - &start, &end ); + bli_get_range_r2l( thread, 0, n_trans, + //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), + // bli_info_get_default_mr( BLIS_TRSM, dt ) ), + bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ), + bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ), + &start, &end ); // Partition along the n dimension. for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 05833d2da..a770ec59d 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -84,12 +84,12 @@ void bli_trsm_blk_var2f( obj_t* a, n_trans = bli_obj_width_after_trans( *b ); dim_t start, end; num_t dt = bli_obj_execution_datatype( *a ); - bli_get_range( thread, 0, n_trans, - //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), - // bli_info_get_default_mr( BLIS_TRSM, dt ) ), - bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ), - bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ), - &start, &end ); + bli_get_range_l2r( thread, 0, n_trans, + //bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), + // bli_info_get_default_mr( BLIS_TRSM, dt ) ), + bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ), + bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ), + &start, &end ); // Partition along the n dimension. 
for ( i = start; i < end; i += b_alg ) diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index bb4c2eece..63b89a484 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -130,10 +130,10 @@ void bli_trsm_blk_var3b( obj_t* a, // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. thread_ibarrier( thread ); - if ( i == 0 && thread_am_ichief( thread ) ) { + if ( i == 0 && thread_am_ichief( thread ) ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( c_pack ); + bli_obj_scalar_reset( c_pack ); } } diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index e6c43387b..afa9c5259 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -130,10 +130,10 @@ void bli_trsm_blk_var3f( obj_t* a, // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. 
thread_ibarrier( thread ); - if ( i == 0 && thread_am_ichief( thread ) ) { + if ( i == 0 && thread_am_ichief( thread ) ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( c_pack ); + bli_obj_scalar_reset( c_pack ); } } diff --git a/frame/base/bli_threading.c b/frame/base/bli_threading.c index f6f0a241c..b0a882325 100644 --- a/frame/base/bli_threading.c +++ b/frame/base/bli_threading.c @@ -142,65 +142,319 @@ void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_s } // Code for work assignments -void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) +void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t handle_edge_low, dim_t* start, dim_t* end ) { - thrinfo_t* thread = (thrinfo_t*) thr; - dim_t n_way = thread->n_way; - dim_t work_id = thread->work_id; + thrinfo_t* thread = ( thrinfo_t* )thr; + dim_t n_way = thread->n_way; + dim_t work_id = thread->work_id; - dim_t size = all_end - all_start; - dim_t n_pt = size / n_way; - n_pt = (n_pt * n_way < size) ? n_pt + 1 : n_pt; - n_pt = (n_pt % block_factor == 0) ? n_pt : n_pt + block_factor - (n_pt % block_factor); - *start = work_id * n_pt + all_start; - *end = bli_min( *start + n_pt, size + all_start ); + dim_t size = all_end - all_start; + + dim_t n_bf_whole = size / block_factor; + dim_t n_bf_left = size % block_factor; + + dim_t n_bf_lo = n_bf_whole / n_way; + dim_t n_bf_hi = n_bf_whole / n_way; + + // In this function, we partition the space between all_start and + // all_end into n_way partitions, each a multiple of block_factor + // with the exception of the one partition that receives the + // "edge" case (if applicable). + // + // Here are examples of various thread partitionings, in units of + // the block_factor, when n_way = 4. (A '+' indicates the thread + // that receives the leftover edge case (ie: n_bf_left extra + // rows/columns in its sub-range).
+ // (all_start ... all_end) + // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3 + // 12 =0 f 0 4 3 3 3 3 + // 12 >0 f 0 4 3 3 3 3+ + // 13 >0 f 1 3 4 3 3 3+ + // 14 >0 f 2 2 4 4 3 3+ + // 15 >0 f 3 1 4 4 4 3+ + // 15 =0 f 3 1 4 4 4 3 + // + // 12 =0 t 4 0 3 3 3 3 + // 12 >0 t 4 0 3+ 3 3 3 + // 13 >0 t 3 1 3+ 3 3 4 + // 14 >0 t 2 2 3+ 3 4 4 + // 15 >0 t 1 3 3+ 4 4 4 + // 15 =0 t 1 3 3 4 4 4 + + // As indicated by the table above, load is balanced as equally + // as possible, even in the presence of an edge case. + + // First, we must differentiate between cases where the leftover + // "edge" case (n_bf_left) should be allocated to a thread partition + // at the low end of the index range or the high end. + + if ( handle_edge_low == FALSE ) + { + // Notice that if all threads receive the same number of + // block_factors, those threads are considered "high" and + // the "low" thread group is empty. + dim_t n_th_lo = n_bf_whole % n_way; + //dim_t n_th_hi = n_way - n_th_lo; + + // If some partitions must have more block_factors than others + // assign the slightly larger partitions to lower index threads. + if ( n_th_lo != 0 ) n_bf_lo += 1; + + // Compute the actual widths (in units of rows/columns) of + // individual threads in the low and high groups. + dim_t size_lo = n_bf_lo * block_factor; + dim_t size_hi = n_bf_hi * block_factor; + + // Precompute the starting indices of the low and high groups. + dim_t lo_start = all_start; + dim_t hi_start = all_start + n_th_lo * size_lo; + + // Compute the start and end of individual threads' ranges + // as a function of their work_ids and also the group to which + // they belong (low or high). 
+ if ( work_id < n_th_lo ) + { + *start = lo_start + (work_id ) * size_lo; + *end = lo_start + (work_id+1) * size_lo; + } + else // if ( n_th_lo <= work_id ) + { + *start = hi_start + (work_id-n_th_lo ) * size_hi; + *end = hi_start + (work_id-n_th_lo+1) * size_hi; + + // Since the edge case is being allocated to the high + // end of the index range, we have to advance the last + // thread's end. + if ( work_id == n_way - 1 ) *end += n_bf_left; + } + } + else // if ( handle_edge_low == TRUE ) + { + // Notice that if all threads receive the same number of + // block_factors, those threads are considered "low" and + // the "high" thread group is empty. + dim_t n_th_hi = n_bf_whole % n_way; + dim_t n_th_lo = n_way - n_th_hi; + + // If some partitions must have more block_factors than others + // assign the slightly larger partitions to higher index threads. + if ( n_th_hi != 0 ) n_bf_hi += 1; + + // Compute the actual widths (in units of rows/columns) of + // individual threads in the low and high groups. + dim_t size_lo = n_bf_lo * block_factor; + dim_t size_hi = n_bf_hi * block_factor; + + // Precompute the starting indices of the low and high groups. + dim_t lo_start = all_start; + dim_t hi_start = all_start + n_th_lo * size_lo + + n_bf_left; + + // Compute the start and end of individual threads' ranges + // as a function of their work_ids and also the group to which + // they belong (low or high). + if ( work_id < n_th_lo ) + { + *start = lo_start + (work_id ) * size_lo; + *end = lo_start + (work_id+1) * size_lo; + + // Since the edge case is being allocated to the low + // end of the index range, we have to advance the + // starts/ends accordingly. 
+ if ( work_id == 0 ) *end += n_bf_left; + else { *start += n_bf_left; + *end += n_bf_left; } + } + else // if ( n_th_lo <= work_id ) + { + *start = hi_start + (work_id-n_th_lo ) * size_hi; + *end = hi_start + (work_id-n_th_lo+1) * size_hi; + } + } } -void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end) +void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) { - thrinfo_t* thread = (thrinfo_t*) thr; - dim_t n_way = thread->n_way; - dim_t work_id = thread->work_id; - dim_t size = all_end - all_start; + bli_get_range( thr, all_start, all_end, block_factor, + FALSE, start, end ); +} - *start = 0; - *end = all_end - all_start; - double num = size*size / (double) n_way; +void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) +{ + bli_get_range( thr, all_start, all_end, block_factor, + TRUE, start, end ); +} - if( forward ) { - dim_t curr_caucus = n_way - 1; - dim_t len = 0; - while(1){ - dim_t width = ceil(sqrt( len*len + num )) - len; // The width of the current caucus - width = (width % block_factor == 0) ? width : width + block_factor - (width % block_factor); - if( curr_caucus == work_id ) { - *start = bli_max( 0 , *end - width ) + all_start; - *end = *end + all_start; - return; - } - else{ - *end -= width; - len += width; - curr_caucus--; - } - } - } - else{ - while(1){ - dim_t width = ceil(sqrt(*start * *start + num)) - *start; - width = (width % block_factor == 0) ? 
width : width + block_factor - (width % block_factor); +void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) +{ + bli_get_range( thr, all_start, all_end, block_factor, + FALSE, start, end ); +} - if( work_id == 0 ) { - *start = *start + all_start; - *end = bli_min( *start + width, all_end ); - return; - } - else{ - *start = *start + width; - } - work_id--; - } - } +void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ) +{ + bli_get_range( thr, all_start, all_end, block_factor, + TRUE, start, end ); +} + +void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, bool_t handle_edge_low, dim_t* start, dim_t* end ) +{ + thrinfo_t* thread = ( thrinfo_t* )thr; + dim_t n_way = thread->n_way; + dim_t work_id = thread->work_id; + dim_t size = all_end - all_start; + dim_t width; + dim_t block_fac_leftover = size % block_factor; + dim_t i; + double num; + + *start = 0; + *end = all_end - all_start; + num = size * size / ( double )n_way; + + if ( bli_is_lower( uplo ) ) + { + dim_t cur_caucus = n_way - 1; + dim_t len = 0; + + // This loop computes subpartitions backwards, from the high end + // of the index range to the low end. If the low end is assumed + // to be on the left and the high end the right, this assignment + // of widths is appropriate for n dimension partitioning of a + // lower triangular matrix. + for ( i = 0; TRUE; ++i ) + { + width = ceil( sqrt( len*len + num ) ) - len; + + // If we need to allocate the edge case (assuming it exists) + // to the high thread subpartition, adjust width so that it + // contains the exact amount of leftover edge dimension so that + // all remaining subpartitions can be multiples of block_factor. 
+ // If the edge case is to be allocated to the low subpartition, + // or if there is no edge case, it is implicitly allocated to + // the low subpartition by virtue of the fact that all other + // subpartitions already assigned will be multiples of + // block_factor. + if ( i == 0 && !handle_edge_low ) + { + if ( width % block_factor != block_fac_leftover ) + width += block_fac_leftover - ( width % block_factor ); + } + else + { + if ( width % block_factor != 0 ) + width += block_factor - ( width % block_factor ); + } + + if ( cur_caucus == work_id ) + { + *start = bli_max( 0, *end - width ) + all_start; + *end = *end + all_start; + return; + } + else + { + *end -= width; + len += width; + cur_caucus--; + } + } + } + else // if ( bli_is_upper( uplo ) ) + { + // This loop computes subpartitions forwards, from the low end + // of the index range to the high end. If the low end is assumed + // to be on the left and the high end the right, this assignment + // of widths is appropriate for n dimension partitioning of an + // upper triangular matrix. 
+ for ( i = 0; TRUE; ++i ) + { + width = ceil( sqrt( *start * *start + num ) ) - *start; + + if ( i == 0 && handle_edge_low ) + { + if ( width % block_factor != block_fac_leftover ) + width += block_fac_leftover - ( width % block_factor ); + } + else + { + if ( width % block_factor != 0 ) + width += block_factor - ( width % block_factor ); + } + + if ( work_id == 0 ) + { + *start = *start + all_start; + *end = bli_min( *start + width, all_end ); + return; + } + else + { + *start = *start + width; + work_id--; + } + } + } +} + +void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end ) +{ + if ( bli_is_upper_or_lower( uplo ) ) + { + bli_get_range_weighted( thr, all_start, all_end, block_factor, + uplo, FALSE, start, end ); + } + else // if dense or zeros + { + bli_get_range_l2r( thr, all_start, all_end, block_factor, + start, end ); + } +} + +void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end ) +{ + if ( bli_is_upper_or_lower( uplo ) ) + { +//printf( "bli_get_range_weighted_r2l: is upper or lower\n" ); + bli_toggle_uplo( uplo ); + bli_get_range_weighted( thr, all_start, all_end, block_factor, + uplo, TRUE, start, end ); + } + else // if dense or zeros + { +//printf( "bli_get_range_weighted_r2l: is dense or zeros\n" ); + bli_get_range_r2l( thr, all_start, all_end, block_factor, + start, end ); + } +} + +void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end ) +{ + if ( bli_is_upper_or_lower( uplo ) ) + { + bli_toggle_uplo( uplo ); + bli_get_range_weighted( thr, all_start, all_end, block_factor, + uplo, FALSE, start, end ); + } + else // if dense or zeros + { + bli_get_range_t2b( thr, all_start, all_end, block_factor, + start, end ); + } +} + +void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t 
block_factor, uplo_t uplo, dim_t* start, dim_t* end ) +{ + if ( bli_is_upper_or_lower( uplo ) ) + { + bli_get_range_weighted( thr, all_start, all_end, block_factor, + uplo, TRUE, start, end ); + } + else // if dense or zeros + { + bli_get_range_b2t( thr, all_start, all_end, block_factor, + start, end ); + } } diff --git a/frame/base/bli_threading.h b/frame/base/bli_threading.h index ce4b21c55..e8aaa7bb6 100644 --- a/frame/base/bli_threading.h +++ b/frame/base/bli_threading.h @@ -127,8 +127,40 @@ typedef struct thrinfo_s thrinfo_t; #define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id ) #define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id ) -void bli_get_range( void* thread, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end ); -void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t forward, dim_t* start, dim_t* end); +void bli_get_range( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, + bool_t handle_edge_low, + dim_t* start, dim_t* end ); +void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, + dim_t* start, dim_t* end ); +void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, + dim_t* start, dim_t* end ); +void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, + dim_t* start, dim_t* end ); +void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, + dim_t* start, dim_t* end ); + +void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, uplo_t uplo, + bool_t handle_edge_low, + dim_t* start, dim_t* end ); +void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, uplo_t uplo, + dim_t* start, dim_t* end ); +void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, uplo_t uplo, + 
dim_t* start, dim_t* end ); +void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, uplo_t uplo, + dim_t* start, dim_t* end ); +void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end, + dim_t block_factor, uplo_t uplo, + dim_t* start, dim_t* end ); + thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id, dim_t n_way, dim_t work_id ); diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 1de889ed5..10bb01f5c 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -378,12 +378,12 @@ } \ } -#define bli_obj_apply_trans( trans, obj )\ +#define bli_obj_apply_trans( trans, obj ) \ { \ (obj).info = ( (obj).info ^ (trans) ); \ } -#define bli_obj_apply_conj( conjval, obj )\ +#define bli_obj_apply_conj( conjval, obj ) \ { \ (obj).info = ( (obj).info ^ (conjval) ); \ } @@ -395,21 +395,25 @@ \ ((obj).root) +#define bli_obj_root_uplo( obj ) \ +\ + bli_obj_uplo( *bli_obj_root( obj ) ) + #define bli_obj_root_is_general( obj ) \ \ - bli_obj_is_general( *bli_obj_root( obj ) ) \ + bli_obj_is_general( *bli_obj_root( obj ) ) #define bli_obj_root_is_hermitian( obj ) \ \ - bli_obj_is_hermitian( *bli_obj_root( obj ) ) \ + bli_obj_is_hermitian( *bli_obj_root( obj ) ) #define bli_obj_root_is_symmetric( obj ) \ \ - bli_obj_is_symmetric( *bli_obj_root( obj ) ) \ + bli_obj_is_symmetric( *bli_obj_root( obj ) ) #define bli_obj_root_is_triangular( obj ) \ \ - bli_obj_is_triangular( *bli_obj_root( obj ) ) \ + bli_obj_is_triangular( *bli_obj_root( obj ) ) #define bli_obj_root_is_herm_or_symm( obj ) \ \ @@ -418,11 +422,11 @@ #define bli_obj_root_is_upper( obj ) \ \ - bli_obj_is_upper( *bli_obj_root( obj ) ) \ + bli_obj_is_upper( *bli_obj_root( obj ) ) #define bli_obj_root_is_lower( obj ) \ \ - bli_obj_is_lower( *bli_obj_root( obj ) ) \ + bli_obj_is_lower( *bli_obj_root( obj ) ) // Root matrix modification