Replaced the register blocksize hack with a query of the register blocksize to determine parallelism granularity

This commit is contained in:
Tyler Smith
2014-04-30 12:28:00 -05:00
parent f4fdfe8fc5
commit 456df03721
13 changed files with 38 additions and 19 deletions

View File

@@ -83,7 +83,9 @@ void bli_gemm_blk_var1f( obj_t* a,
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *a );
dim_t start, end;
bli_get_range( thread, 0, m_trans, BLIS_DEFAULT_MR_D, &start, &end );
bli_get_range( thread, 0, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
&start, &end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -82,7 +82,9 @@ void bli_gemm_blk_var2f( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range( thread, 0, n_trans, BLIS_DEFAULT_NR_D, &start, &end );
bli_get_range( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
&start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -82,7 +82,9 @@ void bli_herk_blk_var1f( obj_t* a,
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *c );
dim_t start, end;
bli_get_range_weighted( thread, 0, m_trans, BLIS_DEFAULT_MR_D, bli_obj_is_upper( *c ), &start, &end );
bli_get_range_weighted( thread, 0, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -90,8 +90,9 @@ void bli_herk_blk_var2f( obj_t* a,
dim_t start, end;
// Needs to be replaced with a weighted range because triangle
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_lower( *c ), &start, &end );
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
bli_obj_is_lower( *c ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -94,7 +94,10 @@ void bli_trmm_blk_var1f( obj_t* a,
bli_obj_width_after_trans( *a );
dim_t start, end;
bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end );
bli_get_range_weighted( thread, offA, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )
{

View File

@@ -82,8 +82,9 @@ void bli_trmm_blk_var2b( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_upper( *c ), &start, &end );
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -82,8 +82,9 @@ void bli_trmm_blk_var2f( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, bli_obj_is_lower( *c ), &start, &end );
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
bli_obj_is_lower( *c ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -190,7 +190,7 @@ void PASTEMAC(ch,varname)( \
dim_t off_b1121; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
/*inc_t cstep_b; */\
inc_t rstep_c, cstep_c; \
inc_t ss_b; \
auxinfo_t aux; \
@@ -271,7 +271,7 @@ void PASTEMAC(ch,varname)( \
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
/*cstep_b = ps_b; */\
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \

View File

@@ -190,7 +190,7 @@ void PASTEMAC(ch,varname)( \
dim_t off_b0111; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
/*inc_t cstep_b; */\
inc_t rstep_c, cstep_c; \
inc_t ss_b; \
auxinfo_t aux; \
@@ -272,7 +272,7 @@ void PASTEMAC(ch,varname)( \
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
/*cstep_b = ps_b; */\
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \

View File

@@ -82,7 +82,9 @@ void bli_trsm_blk_var1b( obj_t* a,
bli_obj_width_after_trans( *a );
dim_t start, end;
bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end );
bli_get_range_weighted( thread, offA, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -81,7 +81,9 @@ void bli_trsm_blk_var1f( obj_t* a,
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
dim_t start, end;
bli_get_range( thread, offA, m_trans, BLIS_DEFAULT_MR_D, &start, &end );
bli_get_range( thread, offA, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
&start, &end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -83,7 +83,9 @@ void bli_trsm_blk_var2b( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, 0, &start, &end );
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )

View File

@@ -83,8 +83,9 @@ void bli_trsm_blk_var2f( obj_t* a,
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
//bli_get_range( thread, 0, n_trans, 8, &start, &end );
bli_get_range_weighted( thread, 0, n_trans, BLIS_DEFAULT_NR_D, 1, &start, &end );
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
bli_obj_is_lower( *c ), &start, &end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )