Load balance thread ranges for arbitrary diagonals.

Details:
- Expanded/updated interface for bli_get_range_weighted() and
  bli_get_range() so that the direction of movement is specified in the
  function name (e.g. bli_get_range_l2r(), bli_get_range_weighted_t2b())
  and also so that the object being partitioned is passed instead of an
  uplo parameter. Updated invocations in level-3 blocked variants, as
  appropriate.
- (Re)implemented bli_get_range_*() and bli_get_range_weighted_*() to
  carefully take into account the location of the diagonal when computing
  ranges so that the area of each subpartition (which, in all present
  level-3 operations, is proportional to the amount of computation
  engendered) is as equal as possible.
- Added calls to a new class of routines to all non-gemm level-3 blocked
  variants:
    bli_<oper>_prune_unref_mparts_[mnk]()
  where <oper> is herk, trmm, or trsm and [mnk] is chosen based on which
  dimension is being partitioned. These routines call a more basic
  routine, bli_prune_unref_mparts(), to prune unreferenced/unstored
  regions from matrices and simultaneously adjust other matrices which
  share the same dimension accordingly.
- Simplified herk_blk_var2f, trmm_blk_var1f/b as a result of the new
  pruning routines.
- Fixed incorrect blocking factors passed into bli_get_range_*() in
  bli_trsm_blk_var[12][fb].c
- Added a new test driver in test/thread_ranges that can exercise the new
  bli_get_range_*() and bli_get_range_weighted_*() under a range of
  conditions.
- Reimplemented m and n fields of obj_t as elements in a "dim"
  array field so that dimensions could be queried via index constant
  (e.g. BLIS_M, BLIS_N). Adjusted/added query and modification
  macros accordingly.
- Defined mdim_t type to enumerate BLIS_M and BLIS_N indexing values.
- Added bli_round() macro, which calls C math library function round(),
  and bli_round_to_mult(), which rounds a value to the nearest multiple
  of some other value.
- Added miscellaneous pruning- and mdim_t-related macros.
- Renamed bli_obj_row_offset(), bli_obj_col_offset() macros to
  bli_obj_row_off(), bli_obj_col_off().
This commit is contained in:
Field G. Van Zee
2015-09-24 12:14:03 -05:00
parent 4dd9dd3e1d
commit e2e9d64a63
39 changed files with 1875 additions and 377 deletions

View File

@@ -51,7 +51,6 @@ void bli_gemm_blk_var1f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t m_trans;
if( thread_am_ochief( thread ) ) {
// Initialize object for packing B.
@@ -80,21 +79,19 @@ void bli_gemm_blk_var1f( obj_t* a,
cntl_sub_packm_b( cntl ),
gemm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *a );
dim_t start, end;
bli_get_range_t2b( thread, 0, m_trans,
dim_t my_start, my_end;
bli_get_range_t2b( thread, a,
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
&start, &end );
&my_start, &my_end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
// NOTE: Use of a (for execution datatype) is intentional!
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, end, a,
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.

View File

@@ -50,7 +50,6 @@ void bli_gemm_blk_var2f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t n_trans;
if( thread_am_ochief( thread ) ) {
@@ -79,21 +78,19 @@ void bli_gemm_blk_var2f( obj_t* a,
cntl_sub_packm_a( cntl ),
gemm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range_l2r( thread, 0, n_trans,
dim_t my_start, my_end;
bli_get_range_l2r( thread, b,
bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ),
&start, &end );
&my_start, &my_end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
// NOTE: Use of b (for execution datatype) is intentional!
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, end, b,
b_alg = bli_determine_blocksize_f( i, my_end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.

View File

@@ -59,7 +59,6 @@ void bli_gemm_blk_var4f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t m_trans;
if( thread_am_ochief( thread ) ) {
// Initialize object for packing B.
@@ -88,21 +87,19 @@ void bli_gemm_blk_var4f( obj_t* a,
cntl_sub_packm_b( cntl ),
gemm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *a );
dim_t start, end;
bli_get_range_t2b( thread, 0, m_trans,
dim_t my_start, my_end;
bli_get_range_t2b( thread, a,
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
&start, &end );
&my_start, &my_end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
// NOTE: Use of a (for execution datatype) is intentional!
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, end, a,
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.

View File

@@ -35,6 +35,7 @@
#include "bli_herk_check.h"
#include "bli_herk_front.h"
#include "bli_herk_int.h"
#include "bli_herk_prune.h"
#include "bli_herk_blk_var1f.h"

View File

@@ -50,7 +50,9 @@ void bli_herk_blk_var1f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t m_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_herk_prune_unref_mparts_m( a, ah, c );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A'.
@@ -79,18 +81,16 @@ void bli_herk_blk_var1f( obj_t* a,
cntl_sub_packm_b( cntl ),
herk_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *c );
dim_t start, end;
bli_get_range_weighted_t2b( thread, 0, m_trans,
dim_t my_start, my_end;
bli_get_range_weighted_t2b( thread, c,
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
bli_obj_root_uplo( *c ), &start, &end );
&my_start, &my_end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, end, a,
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.

View File

@@ -41,23 +41,18 @@ void bli_herk_blk_var2f( obj_t* a,
herk_thrinfo_t* thread )
{
obj_t a_pack_s;
obj_t ah1_pack_s, c1S_pack_s;
obj_t ah1_pack_s, c1_pack_s;
obj_t ah1, c1, c1S;
obj_t aS_pack;
obj_t ah1, c1;
obj_t* a_pack;
obj_t* ah1_pack;
obj_t* c1S_pack;
obj_t* c1_pack;
dim_t i;
dim_t b_alg;
dim_t n_trans;
subpart_t stored_part;
// The upper and lower variants are identical, except for which
// merged subpartition is acquired in the loop body.
if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B;
else stored_part = BLIS_SUBPART1T;
// Prune any zero region that exists along the partitioning dimension.
bli_herk_prune_unref_mparts_n( a, ah, c );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
@@ -75,30 +70,26 @@ void bli_herk_blk_var2f( obj_t* a,
// Initialize pack objects for C and A' that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &ah1_pack_s );
bli_obj_init_pack( &c1S_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
c1S_pack = thread_ibroadcast( thread, &c1S_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
herk_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *c );
dim_t start, end;
// Needs to be replaced with a weighted range because triangle
bli_get_range_weighted_l2r( thread, 0, n_trans,
dim_t my_start, my_end;
bli_get_range_weighted_l2r( thread, c,
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
bli_obj_root_uplo( *c ), &start, &end );
&my_start, &my_end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, end, a,
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1' and C1.
@@ -107,18 +98,11 @@ void bli_herk_blk_var2f( obj_t* a,
bli_acquire_mpart_l2r( BLIS_SUBPART1,
i, b_alg, c, &c1 );
// Partition off the stored region of C1 and the corresponding region
// of A_pack.
bli_acquire_mpart_t2b( stored_part,
i, b_alg, &c1, &c1S );
bli_acquire_mpart_t2b( stored_part,
i, b_alg, a_pack, &aS_pack );
// Initialize objects for packing A1' and C1.
if( thread_am_ichief( thread ) ) {
bli_packm_init( &ah1, ah1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1S, c1S_pack,
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread ) ;
@@ -129,23 +113,23 @@ void bli_herk_blk_var2f( obj_t* a,
herk_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1S, c1S_pack,
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
herk_thread_sub_ipackm( thread ) ) ;
// Perform herk subproblem.
bli_herk_int( &BLIS_ONE,
&aS_pack,
a_pack,
ah1_pack,
&BLIS_ONE,
c1S_pack,
c1_pack,
cntl_sub_gemm( cntl ),
herk_thread_sub_herk( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1S_pack, &c1S,
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),
herk_thread_sub_ipackm( thread ) );
}
@@ -157,7 +141,7 @@ void bli_herk_blk_var2f( obj_t* a,
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_release( ah1_pack, cntl_sub_packm_b( cntl ) );
bli_packm_release( c1S_pack, cntl_sub_packm_c( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -52,6 +52,9 @@ void bli_herk_blk_var3f( obj_t* a,
dim_t b_alg;
dim_t k_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_herk_prune_unref_mparts_k( a, ah, c );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing C.
bli_obj_init_pack( &c_pack_s );

View File

@@ -0,0 +1,64 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// herk, m dimension: remove from C any unreferenced region that would be
// met while partitioning along m, and adjust A along its m dimension so
// the two operands stay conformal.
// NOTE: ah is not referenced by this routine.
void bli_herk_prune_unref_mparts_m( obj_t* a, obj_t* ah, obj_t* c )
{
	// C is the matrix being pruned; A is the matrix adjusted to match.
	bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M );
}
// herk, n dimension: remove from C any unreferenced region that would be
// met while partitioning along n, and adjust Ah along its n dimension so
// the two operands stay conformal.
// NOTE: a is not referenced by this routine.
void bli_herk_prune_unref_mparts_n( obj_t* a, obj_t* ah, obj_t* c )
{
	// C is the matrix being pruned; Ah is the matrix adjusted to match.
	bli_prune_unref_mparts( c, BLIS_N, ah, BLIS_N );
}
// herk, k dimension: intentionally a no-op. None of the arguments are
// read or modified.
void bli_herk_prune_unref_mparts_k( obj_t* a,
obj_t* ah,
obj_t* c )
{
// As long as A and Ah are general in structure, no pruning should be
// needed for the k dimension.
}

View File

@@ -0,0 +1,38 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the herk pruning routines. The _m, _n, and _k suffixes
// name the dimension along which the caller is about to partition.
void bli_herk_prune_unref_mparts_m( obj_t* a, obj_t* ah, obj_t* c );
void bli_herk_prune_unref_mparts_n( obj_t* a, obj_t* ah, obj_t* c );
void bli_herk_prune_unref_mparts_k( obj_t* a, obj_t* ah, obj_t* c );

View File

@@ -36,6 +36,7 @@
#include "bli_trmm_check.h"
#include "bli_trmm_front.h"
#include "bli_trmm_int.h"
#include "bli_trmm_prune.h"
#include "bli_trmm_blk_var1f.h"

View File

@@ -50,8 +50,9 @@ void bli_trmm_blk_var1f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offA;
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_m( a, b, c );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing B.
@@ -81,28 +82,28 @@ void bli_trmm_blk_var1f( obj_t* a,
trmm_thread_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
m_trans = bli_obj_length_after_trans( *a );
offA = 0;
//m_trans = bli_obj_length_after_trans( *a );
//offA = 0;
// If A is lower triangular, we have to adjust where the non-zero part of
// A begins. If A is upper triangular, we have to adjust the length of
// the non-zero part. If A is general/dense, then we keep the defaults.
if ( bli_obj_is_lower( *a ) )
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
else if ( bli_obj_is_upper( *a ) )
m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
bli_obj_width_after_trans( *a );
//if ( bli_obj_is_lower( *a ) )
// offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
//else if ( bli_obj_is_upper( *a ) )
// m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
// bli_obj_width_after_trans( *a );
dim_t start, end;
bli_get_range_weighted_t2b( thread, offA, m_trans,
dim_t my_start, my_end;
bli_get_range_weighted_t2b( thread, a,
bli_blksz_get_mult_for_obj( a, cntl_blocksize( cntl ) ),
bli_obj_root_uplo( *a ), &start, &end );
&my_start, &my_end );
// Partition along the m dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, end, a,
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.

View File

@@ -50,8 +50,9 @@ void bli_trmm_blk_var2b( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_n( a, b, c );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
@@ -79,18 +80,16 @@ void bli_trmm_blk_var2b( obj_t* a,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range_weighted_r2l( thread, 0, n_trans,
dim_t my_start, my_end;
bli_get_range_weighted_r2l( thread, b,
bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ),
bli_obj_root_uplo( *b ), &start, &end );
&my_start, &my_end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, end, b,
b_alg = bli_determine_blocksize_b( i, my_end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.

View File

@@ -50,8 +50,9 @@ void bli_trmm_blk_var2f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_n( a, b, c );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
@@ -79,18 +80,16 @@ void bli_trmm_blk_var2f( obj_t* a,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
bli_get_range_weighted_l2r( thread, 0, n_trans,
dim_t my_start, my_end;
bli_get_range_weighted_l2r( thread, b,
bli_blksz_get_mult_for_obj( b, cntl_blocksize( cntl ) ),
bli_obj_root_uplo( *b ), &start, &end );
&my_start, &my_end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, end, b,
b_alg = bli_determine_blocksize_f( i, my_end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.

View File

@@ -52,6 +52,9 @@ void bli_trmm_blk_var3b( obj_t* a,
dim_t b_alg;
dim_t k_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_k( a, b, c );
if( thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );

View File

@@ -52,6 +52,9 @@ void bli_trmm_blk_var3f( obj_t* a,
dim_t b_alg;
dim_t k_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_k( a, b, c );
if( thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );

View File

@@ -0,0 +1,71 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// trmm, m dimension: remove from A any unreferenced region that would be
// met while partitioning along m, and adjust C along its m dimension so
// the two operands stay conformal.
// NOTE: b is not referenced by this routine.
void bli_trmm_prune_unref_mparts_m( obj_t* a, obj_t* b, obj_t* c )
{
	// A is the matrix being pruned; C is the matrix adjusted to match.
	bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M );
}
// trmm, n dimension: remove from B any unreferenced region that would be
// met while partitioning along n, and adjust C along its n dimension so
// the two operands stay conformal.
// NOTE: a is not referenced by this routine.
void bli_trmm_prune_unref_mparts_n( obj_t* a, obj_t* b, obj_t* c )
{
	// B is the matrix being pruned; C is the matrix adjusted to match.
	bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N );
}
// trmm, k dimension: the k dimension is indexed as BLIS_N on A and as
// BLIS_M on B, so pruning is attempted from each operand in turn, with
// the other operand adjusted to match.
// NOTE: c is not referenced by this routine.
void bli_trmm_prune_unref_mparts_k( obj_t* a, obj_t* b, obj_t* c )
{
	// Pass 1: prune any unreferenced region of A along its n dimension
	// and adjust B along its m dimension accordingly.
	bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M );

	// Pass 2: prune any unreferenced region of B along its m dimension
	// and adjust A along its n dimension accordingly.
	bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N );
}

View File

@@ -0,0 +1,38 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the trmm pruning routines. The _m, _n, and _k suffixes
// name the dimension along which the caller is about to partition.
void bli_trmm_prune_unref_mparts_m( obj_t* a, obj_t* b, obj_t* c );
void bli_trmm_prune_unref_mparts_n( obj_t* a, obj_t* b, obj_t* c );
void bli_trmm_prune_unref_mparts_k( obj_t* a, obj_t* b, obj_t* c );

View File

@@ -37,6 +37,7 @@
#include "bli_trsm_check.h"
#include "bli_trsm_front.h"
#include "bli_trsm_int.h"
#include "bli_trsm_prune.h"
#include "bli_gemmtrsm_ukernel.h"
#include "bli_trsm_ukernel.h"

View File

@@ -49,8 +49,9 @@ void bli_trsm_blk_var1b( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offA;
// Prune any zero region that exists along the partitioning dimension.
bli_trsm_prune_unref_mparts_m( a, b, c );
// Initialize object for packing B.
if( thread_am_ochief( thread ) ) {
@@ -71,28 +72,19 @@ void bli_trsm_blk_var1b( obj_t* a,
cntl_sub_packm_b( cntl ),
trsm_thread_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
m_trans = bli_obj_length_after_trans( *a );
offA = 0;
// If A is upper triangular, we have to adjust where the non-zero part of
// A begins.
if ( bli_obj_is_upper( *a ) )
offA = m_trans - bli_abs( bli_obj_diag_offset_after_trans( *a ) ) -
bli_obj_width_after_trans( *a );
dim_t start, end;
dim_t my_start, my_end;
num_t dt = bli_obj_execution_datatype( *a );
bli_get_range_b2t( thread, offA, m_trans,
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
bli_info_get_default_mc( BLIS_TRSM, dt ),
&start, &end );
dim_t bf = ( bli_obj_root_is_triangular( *a ) ?
bli_info_get_default_mr( BLIS_TRSM, dt ) :
bli_info_get_default_nr( BLIS_TRSM, dt ) );
bli_get_range_b2t( thread, a, bf,
&my_start, &my_end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, end, a,
b_alg = bli_determine_blocksize_b( i, my_end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.

View File

@@ -49,8 +49,9 @@ void bli_trsm_blk_var1f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offA;
// Prune any zero region that exists along the partitioning dimension.
bli_trsm_prune_unref_mparts_m( a, b, c );
// Initialize object for packing B.
if( thread_am_ochief( thread ) ) {
@@ -71,27 +72,19 @@ void bli_trsm_blk_var1f( obj_t* a,
cntl_sub_packm_b( cntl ),
trsm_thread_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
m_trans = bli_obj_length_after_trans( *a );
offA = 0;
// If A is lower triangular, we have to adjust where the non-zero part of
// A begins.
if ( bli_obj_is_lower( *a ) )
offA = bli_abs( bli_obj_diag_offset_after_trans( *a ) );
dim_t start, end;
dim_t my_start, my_end;
num_t dt = bli_obj_execution_datatype( *a );
bli_get_range_t2b( thread, offA, m_trans,
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ), bli_info_get_default_mr( BLIS_TRSM, dt ) ),
bli_info_get_default_mc( BLIS_TRSM, dt ),
&start, &end );
dim_t bf = ( bli_obj_root_is_triangular( *a ) ?
bli_info_get_default_mr( BLIS_TRSM, dt ) :
bli_info_get_default_nr( BLIS_TRSM, dt ) );
bli_get_range_t2b( thread, a, bf,
&my_start, &my_end );
// Partition along the remaining portion of the m dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, end, a,
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.

View File

@@ -50,7 +50,9 @@ void bli_trsm_blk_var2b( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trsm_prune_unref_mparts_n( a, b, c );
// Initialize pack objects for A that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
@@ -78,24 +80,21 @@ void bli_trsm_blk_var2b( obj_t* a,
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
trsm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
dim_t my_start, my_end;
num_t dt = bli_obj_execution_datatype( *a );
bli_get_range_r2l( thread, 0, n_trans,
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
// bli_info_get_default_mr( BLIS_TRSM, dt ) ),
bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
&start, &end );
dim_t bf = ( bli_obj_root_is_triangular( *b ) ?
bli_info_get_default_mr( BLIS_TRSM, dt ) :
bli_info_get_default_nr( BLIS_TRSM, dt ) );
bli_get_range_r2l( thread, b, bf,
&my_start, &my_end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, end, b,
b_alg = bli_determine_blocksize_b( i, my_end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.

View File

@@ -50,7 +50,9 @@ void bli_trsm_blk_var2f( obj_t* a,
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trsm_prune_unref_mparts_n( a, b, c );
// Initialize pack objects for A that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
@@ -78,24 +80,21 @@ void bli_trsm_blk_var2f( obj_t* a,
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
trsm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
dim_t start, end;
dim_t my_start, my_end;
num_t dt = bli_obj_execution_datatype( *a );
bli_get_range_l2r( thread, 0, n_trans,
//bli_lcm( bli_info_get_default_nr( BLIS_TRSM, dt ),
// bli_info_get_default_mr( BLIS_TRSM, dt ) ),
bli_lcm( bli_blksz_get_nr( dt, cntl_blocksize( cntl ) ),
bli_blksz_get_mr( dt, cntl_blocksize( cntl ) ) ),
&start, &end );
dim_t bf = ( bli_obj_root_is_triangular( *b ) ?
bli_info_get_default_mr( BLIS_TRSM, dt ) :
bli_info_get_default_nr( BLIS_TRSM, dt ) );
bli_get_range_l2r( thread, b, bf,
&my_start, &my_end );
// Partition along the n dimension.
for ( i = start; i < end; i += b_alg )
for ( i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, end, b,
b_alg = bli_determine_blocksize_f( i, my_end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.

View File

@@ -52,6 +52,9 @@ void bli_trsm_blk_var3b( obj_t* a,
dim_t b_alg;
dim_t k_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trsm_prune_unref_mparts_k( a, b, c );
// Initialize pack objects for C that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &c_pack_s );

View File

@@ -52,6 +52,9 @@ void bli_trsm_blk_var3f( obj_t* a,
dim_t b_alg;
dim_t k_trans;
// Prune any zero region that exists along the partitioning dimension.
bli_trsm_prune_unref_mparts_k( a, b, c );
// Initialize pack objects for C that are passed into packm_init().
if( thread_am_ochief( thread ) ) {
bli_obj_init_pack( &c_pack_s );

View File

@@ -0,0 +1,71 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// trsm pruning for partitioning along the m dimension. A is treated as the
// primary operand: whatever is trimmed from its m dimension is mirrored in
// the m dimension of C. B is accepted for interface uniformity with the
// _n and _k variants but is not touched here.
void bli_trsm_prune_unref_mparts_m( obj_t* a, obj_t* b, obj_t* c )
{
	bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M );
}
// trsm pruning for partitioning along the n dimension. B is treated as the
// primary operand: whatever is trimmed from its n dimension is mirrored in
// the n dimension of C. A is accepted for interface uniformity with the
// _m and _k variants but is not touched here.
void bli_trsm_prune_unref_mparts_n( obj_t* a, obj_t* b, obj_t* c )
{
	bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N );
}
// trsm pruning for partitioning along the k dimension, which is the
// n dimension of A and the m dimension of B. Two passes are made, in this
// order: first A acts as the primary operand (B is adjusted to match), then
// B acts as the primary operand (A is adjusted to match). C is accepted for
// interface uniformity with the _m and _n variants but is not touched here.
void bli_trsm_prune_unref_mparts_k( obj_t* a, obj_t* b, obj_t* c )
{
	// Pass 1: trim along A's n dimension; keep B's m dimension conformal.
	bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M );

	// Pass 2: trim along B's m dimension; keep A's n dimension conformal.
	bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N );
}

View File

@@ -0,0 +1,38 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Pruning helpers for the trsm blocked variants. Each takes the three trsm
// operands and forwards to bli_prune_unref_mparts() for the pair of operands
// that share the dimension being partitioned:
//   _m: primary A (m dim), secondary C (m dim).
//   _n: primary B (n dim), secondary C (n dim).
//   _k: primary A (n dim) / secondary B (m dim), then primary B (m dim) /
//       secondary A (n dim).
void bli_trsm_prune_unref_mparts_m( obj_t* a, obj_t* b, obj_t* c );
void bli_trsm_prune_unref_mparts_n( obj_t* a, obj_t* b, obj_t* c );
void bli_trsm_prune_unref_mparts_k( obj_t* a, obj_t* b, obj_t* c );

View File

@@ -509,8 +509,8 @@ void bli_obj_print( char* label, obj_t* obj )
( unsigned long int )bli_obj_width( *obj ) );
fprintf( file, "\n" );
fprintf( file, " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_offset( *obj ),
( unsigned long int )bli_obj_col_offset( *obj ) );
fprintf( file, " offm, offn %lu, %lu\n", ( unsigned long int )bli_obj_row_off( *obj ),
( unsigned long int )bli_obj_col_off( *obj ) );
fprintf( file, " diagoff %ld\n", ( signed long int )bli_obj_diag_offset( *obj ) );
fprintf( file, "\n" );

View File

@@ -592,6 +592,16 @@ void bli_acquire_mpart_tl2br( subpart_t requested_part,
requested_part != BLIS_SUBPART11 &&
requested_part != BLIS_SUBPART22 )
{
// FGVZ: Fix me. This needs to be cleaned up. Either subpartitions that
// do not intersect the diagonal should inherit their root object's
// uplo field, or they should not. Right now, they DO inherit the
// uplo (because they are not set to BLIS_DENSE when the diagonal
// does not intersect). But the whole point of being able to query
// the root object's properties (e.g. uplo field) was so that we
// COULD mark such subpartitions as dense, to make it easier for
// certain subproblems on those subpartitions--subproblems that
// are agnostic to where the subpartition came from.
// NOTE: This comment may be out-of-date since we now distinguish
// between uplo properties for the current and root objects...
// Note that we cannot mark the subpartition object as general/dense

135
frame/base/bli_prune.c Normal file
View File

@@ -0,0 +1,135 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Prunes the unreferenced (unstored or zero) region from the primary object
// p along dimension mdim_p, and applies the resulting dimension to the
// secondary object s along dimension mdim_s so that the two stay conformal.
//
//   p      - primary object; its structure and uplo decide what is pruned.
//            Its diagonal offset, dimension mdim_p, and (if not packed)
//            offset along mdim_p may be modified.
//   mdim_p - dimension (BLIS_M or BLIS_N) of p being partitioned.
//   s      - secondary object; its dimension mdim_s and (if not packed)
//            offset along mdim_s are updated to match p.
//   mdim_s - dimension of s that is shared with mdim_p of p.
//
// Returns without modifying anything if p is general, or if p is stored
// as dense.
void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
obj_t* s, mdim_t mdim_s )
{
// If the primary object is general, it has no structure, and
// therefore, no unreferenced parts.
if ( bli_obj_is_general( *p ) ) return;
// If the primary object is BLIS_ZEROS, set the dimensions so that the
// matrix is empty. This is not strictly needed but rather a minor
// optimization, as it would prevent threads that would otherwise get
// subproblems on BLIS_ZEROS operands from calling the macro-kernel,
// because bli_get_range*() would return empty ranges, which would
// prevent the variant's for loop from executing any iterations.
// NOTE: this should only ever execute if the primary object is
// triangular because that is the only structure type with subpartitions
// that can be marked as BLIS_ZEROS.
if ( bli_obj_is_triangular( *p ) &&
bli_obj_is_zeros( *p ) ) { bli_obj_set_dim( mdim_p, 0, *p );
bli_obj_set_dim( mdim_s, 0, *s );
return; }
// If the primary object is hermitian, symmetric, or triangular, we
// assume that the unstored region will be unreferenced (otherwise,
// the caller should not be invoking this function on that object).
//if ( bli_obj_is_herm_or_symm( *p ) ||
// bli_obj_is_triangular( *p ) )
{
// Work on local copies of p's diagonal offset and dimensions; they are
// written back to the objects only at the end of this block.
doff_t diagoff_p = bli_obj_diag_offset( *p );
dim_t m = bli_obj_length( *p );
dim_t n = bli_obj_width( *p );
uplo_t uplo = bli_obj_uplo( *p );
// off_inc receives the offset increment reported by the pruning macro;
// q is the (possibly reduced) size of the partitioned dimension.
dim_t off_inc = 0;
dim_t q;
// Support implicit transposition on p and s.
// NOTE(review): the local uplo is handed to bli_reflect_about_diag()
// here but is never read afterwards; the branches below query the uplo
// of *p directly. Presumably callers never pass a transposed structured
// p -- confirm that this is intended.
if ( bli_obj_has_trans( *p ) )
{
bli_reflect_about_diag( diagoff_p, uplo, m, n );
bli_toggle_dim( mdim_p );
}
if ( bli_obj_has_trans( *s ) )
{
bli_toggle_dim( mdim_s );
}
// Prune away any zero region of the matrix depending on the
// dimension of the primary object being partitioned and the
// triangle in which it is stored.
if ( bli_obj_is_lower( *p ) )
{
if ( bli_is_m_dim( mdim_p ) )
{ bli_prune_unstored_region_top_l( diagoff_p, m, n, off_inc ); }
else // if ( bli_is_n_dim( mdim_p ) )
{ bli_prune_unstored_region_right_l( diagoff_p, m, n, off_inc ); }
}
else if ( bli_obj_is_upper( *p ) )
{
if ( bli_is_m_dim( mdim_p ) )
{ bli_prune_unstored_region_bottom_u( diagoff_p, m, n, off_inc ); }
else // if ( bli_is_n_dim( mdim_p ) )
{ bli_prune_unstored_region_left_u( diagoff_p, m, n, off_inc ); }
}
else if ( bli_obj_is_dense( *p ) )
{
// Hermitian, symmetric, and triangular matrices are almost
// never dense, but if one were found to be dense, it would
// have no unreferenced regions to prune.
return;
}
else // if ( bli_obj_is_zeros( *p ) )
{
// Sanity check. Hermitian/symmetric matrices should never have
// zero subpartitions.
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
// Select the (potentially modified) dimension along which we are
// partitioning.
if ( bli_is_m_dim( mdim_p ) ) q = m;
else /* if ( bli_is_n_dim( mdim_p ) ) */ q = n;
// Update the affected objects in case anything changed. Notice that
// it is okay to update the dimension and diagonal offset fields of
// packed primary objects, as long as we do so in tandem with the
// secondary object to maintain conformality. This just means that
// the "ignore-able" zero region is skipped over here, rather than
// within the macro-kernel.
bli_obj_set_diag_offset( diagoff_p, *p );
bli_obj_set_dim( mdim_p, q, *p );
bli_obj_set_dim( mdim_s, q, *s );
// Only update the affected offset fields if the object in question
// is NOT a packed object. Otherwise, bli_obj_buffer_at_off() will
// compute the wrong address within the macro-kernel object wrapper.
if ( !bli_obj_is_packed( *p ) ) { bli_obj_inc_off( mdim_p, off_inc, *p ); }
if ( !bli_obj_is_packed( *s ) ) { bli_obj_inc_off( mdim_s, off_inc, *s ); }
}
}

36
frame/base/bli_prune.h Normal file
View File

@@ -0,0 +1,36 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prunes the unreferenced region of the primary object p along dimension
// mdim_p and adjusts the secondary object s along dimension mdim_s to match.
// Both objects may be modified in place.
void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
obj_t* s, mdim_t mdim_s );

View File

@@ -157,16 +157,19 @@ void* bli_broadcast_structure( thread_comm_t* communicator, dim_t id, void* to_s
}
// Code for work assignments
void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, bool_t handle_edge_low, dim_t* start, dim_t* end )
void bli_get_range( void* thr, dim_t n, dim_t bf, bool_t handle_edge_low, dim_t* start, dim_t* end )
{
thrinfo_t* thread = ( thrinfo_t* )thr;
dim_t n_way = thread->n_way;
dim_t work_id = thread->work_id;
dim_t all_start = 0;
dim_t all_end = n;
dim_t size = all_end - all_start;
dim_t n_bf_whole = size / block_factor;
dim_t n_bf_left = size % block_factor;
dim_t n_bf_whole = size / bf;
dim_t n_bf_left = size % bf;
dim_t n_bf_lo = n_bf_whole / n_way;
dim_t n_bf_hi = n_bf_whole / n_way;
@@ -217,8 +220,8 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
// Compute the actual widths (in units of rows/columns) of
// individual threads in the low and high groups.
dim_t size_lo = n_bf_lo * block_factor;
dim_t size_hi = n_bf_hi * block_factor;
dim_t size_lo = n_bf_lo * bf;
dim_t size_hi = n_bf_hi * bf;
// Precompute the starting indices of the low and high groups.
dim_t lo_start = all_start;
@@ -257,8 +260,8 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
// Compute the actual widths (in units of rows/columns) of
// individual threads in the low and high groups.
dim_t size_lo = n_bf_lo * block_factor;
dim_t size_hi = n_bf_hi * block_factor;
dim_t size_lo = n_bf_lo * bf;
dim_t size_hi = n_bf_hi * bf;
// Precompute the starting indices of the low and high groups.
dim_t lo_start = all_start;
@@ -288,188 +291,514 @@ void bli_get_range( void* thr, dim_t all_start, dim_t all_end, dim_t block_facto
}
}
void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
siz_t bli_get_range_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
bli_get_range( thr, all_start, all_end, block_factor,
dim_t m = bli_obj_length_after_trans( *a );
dim_t n = bli_obj_width_after_trans( *a );
bli_get_range( thr, n, bf,
FALSE, start, end );
return m * ( *end - *start );
}
void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
siz_t bli_get_range_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
bli_get_range( thr, all_start, all_end, block_factor,
dim_t m = bli_obj_length_after_trans( *a );
dim_t n = bli_obj_width_after_trans( *a );
bli_get_range( thr, n, bf,
TRUE, start, end );
return m * ( *end - *start );
}
void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
siz_t bli_get_range_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
bli_get_range( thr, all_start, all_end, block_factor,
dim_t m = bli_obj_length_after_trans( *a );
dim_t n = bli_obj_width_after_trans( *a );
bli_get_range( thr, m, bf,
FALSE, start, end );
return n * ( *end - *start );
}
void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, dim_t* start, dim_t* end )
siz_t bli_get_range_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
bli_get_range( thr, all_start, all_end, block_factor,
dim_t m = bli_obj_length_after_trans( *a );
dim_t n = bli_obj_width_after_trans( *a );
bli_get_range( thr, m, bf,
TRUE, start, end );
return n * ( *end - *start );
}
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, bool_t handle_edge_low, dim_t* start, dim_t* end )
dim_t bli_get_range_width_l( doff_t diagoff_j,
dim_t m,
dim_t n_j,
dim_t j,
dim_t n_way,
dim_t bf,
dim_t bf_left,
double area_per_thr,
bool_t handle_edge_low )
{
dim_t width;
// In this function, we assume that we are somewhere in the process of
// partitioning an m x n lower-stored region (with arbitrary diagonal
// offset) n_ways along the n dimension (into column panels). The value
// j identifies the left-to-right subpartition index (from 0 to n_way-1)
// of the subpartition whose width we are about to compute using the
// area per thread determined by the caller. n_j is the number of
// columns in the remaining region of the matrix being partitioned,
// and diagoff_j is that region's diagonal offset.
// If this is the last subpartition, the width is simply equal to n_j.
// Note that this statement handles cases where the "edge case" (if
// one exists) is assigned to the high end of the index range (ie:
// handle_edge_low == FALSE).
if ( j == n_way - 1 ) return n_j;
// At this point, we know there are at least two subpartitions left.
// We also know that IF the submatrix contains a completely dense
// rectangular submatrix, it will occur BEFORE the triangular (or
// trapezoidal) part.
// Here, we implement a somewhat minor load balancing optimization
// that ends up getting employed only for relatively small matrices.
// First, recall that all subpartition widths will be some multiple
// of the blocking factor bf, except perhaps either the first or last
// subpartition, which will receive the edge case, if it exists.
// Also recall that j represents the current thread (or thread group,
// or "caucus") for which we are computing a subpartition width.
// If n_j is sufficiently small that we can only allocate bf columns
// to each of the remaining threads, then we set the width to bf. We
// do not allow the subpartition width to be less than bf, so, under
// some conditions, if n_j is small enough, some of the reamining
// threads may not get any work. For the purposes of this lower bound
// on work (ie: width >= bf), we allow the edge case to count as a
// "full" set of bf columns.
{
dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 );
if ( n_j_bf <= n_way - j )
{
if ( j == 0 && handle_edge_low )
width = ( bf_left > 0 ? bf_left : bf );
else
width = bf;
// Make sure that the width does not exceed n_j. This would
// occur if and when n_j_bf < n_way - j; that is, when the
// matrix being partitioned is sufficiently small relative to
// n_way such that there is not even enough work for every
// (remaining) thread to get bf (or bf_left) columns. The
// net effect of this safeguard is that some threads may get
// assigned empty ranges (ie: no work), which of course must
// happen in some situations.
if ( width > n_j ) width = n_j;
return width;
}
}
// This block computes the width assuming that we are entirely within
// a dense rectangle that precedes the triangular (or trapezoidal)
// part.
{
// First compute the width of the current panel under the
// assumption that the diagonal offset would not intersect.
width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m );
// Adjust the width, if necessary. Specifically, we may need
// to allocate the edge case to the first subpartition, if
// requested; otherwise, we just need to ensure that the
// subpartition is a multiple of the blocking factor.
if ( j == 0 && handle_edge_low )
{
if ( width % bf != bf_left ) width += bf_left - ( width % bf );
}
else // if interior case
{
// Round up to the next multiple of the blocking factor.
//if ( width % bf != 0 ) width += bf - ( width % bf );
// Round to the nearest multiple of the blocking factor.
if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
}
}
// We need to recompute width if the panel, according to the width
// as currently computed, would intersect the diagonal.
if ( diagoff_j < width )
{
dim_t offm_inc, offn_inc;
// Prune away the unstored region above the diagonal, if it exists.
// Note that the entire region was pruned initially, so we know that
// we don't need to try to prune the right side. (Also, we discard
// the offset deltas since we don't need to actually index into the
// subpartition.)
bli_prune_unstored_region_top_l( diagoff_j, m, n_j, offm_inc );
//bli_prune_unstored_region_right_l( diagoff_j, m, n_j, offn_inc );
// We don't need offm_inc, offn_inc here. These statements should
// prevent compiler warnings.
( void )offm_inc;
( void )offn_inc;
// Solve a quadratic equation to find the width of the current (jth)
// subpartition given the m dimension, diagonal offset, and area.
// NOTE: We know that the +/- in the quadratic formula must be a +
// here because we know that the desired solution (the subpartition
// width) will be smaller than (m + diagoff), not larger. If you
// don't believe me, draw a picture!
const double a = -0.5;
const double b = ( double )m + ( double )diagoff_j + 0.5;
const double c = -0.5 * ( ( double )diagoff_j *
( ( double )diagoff_j + 1.0 )
) - area_per_thr;
const double x = ( -b + sqrt( b * b - 4.0 * a * c ) ) / ( 2.0 * a );
// Use the rounded solution as our width, but make sure it didn't
// round to zero.
width = ( dim_t )bli_round( x );
if ( width == 0 ) width = 1;
// Adjust the width, if necessary.
if ( j == 0 && handle_edge_low )
{
if ( width % bf != bf_left ) width += bf_left - ( width % bf );
}
else // if interior case
{
// Round up to the next multiple of the blocking factor.
//if ( width % bf != 0 ) width += bf - ( width % bf );
// Round to the nearest multiple of the blocking factor.
if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
}
}
// Make sure that the width, after being adjusted, does not cause the
// subpartition to exceed n_j.
if ( width > n_j ) width = n_j;
return width;
}
// Returns the stored area of an m x n lower-stored trapezoid with the given
// diagonal offset: the area of the bounding rectangle (after pruning) minus
// the area of the empty triangle above the diagonal, if the diagonal
// intersects the region.
siz_t bli_find_area_trap_l( dim_t m, dim_t n, doff_t diagoff )
{
	// Offset deltas reported by the pruning macros; not needed here.
	dim_t  row_delta = 0;
	dim_t  col_delta = 0;

	// Area of the unstored triangle; stays zero when the diagonal does not
	// intersect the pruned region, in which case the region is a rectangle.
	double empty_tri = 0.0;

	// Drop any rectangle lying above the point where the diagonal meets
	// the left edge.
	bli_prune_unstored_region_top_l( diagoff, m, n, row_delta );

	// Drop any rectangle lying to the right of the point where the
	// diagonal meets the bottom edge. Callers are expected to have done
	// this already; it is repeated here as a safeguard.
	bli_prune_unstored_region_right_l( diagoff, m, n, col_delta );

	( void )row_delta;
	( void )col_delta;

	if ( bli_intersects_diag_n( diagoff, m, n ) )
	{
		const double side = ( double )( n - diagoff - 1 );

		empty_tri = side * ( side + 1.0 ) / 2.0;
	}

	return ( siz_t )( ( double )m * ( double )n - empty_tri );
}
// Computes the calling thread's area-weighted column range for an m x n
// lower- or upper-stored region whose diagonal offset is diagoff.
//
//   thr             - thread info (thrinfo_t*); supplies n_way and work_id.
//   diagoff         - diagonal offset of the region.
//   uplo            - storage of the region (lower or upper).
//   m, n            - dimensions of the region.
//   bf              - blocking factor.
//   handle_edge_low - whether the edge case (n % bf) goes to the low end of
//                     the index range.
//   j_start_thr,
//   j_end_thr       - outputs: the thread's range [*j_start_thr, *j_end_thr).
//
// Returns the area of the thread's subpartition. In the lower-stored case
// the outputs are written only when the loop reaches the thread's work_id;
// if work_id is outside [0, n_way) they are left untouched and 0 is
// returned.
siz_t bli_get_range_weighted( void*  thr,
                              doff_t diagoff,
                              uplo_t uplo,
                              dim_t  m,
                              dim_t  n,
                              dim_t  bf,
                              bool_t handle_edge_low,
                              dim_t* j_start_thr,
                              dim_t* j_end_thr )
{
	thrinfo_t* thread  = ( thrinfo_t* )thr;

	dim_t      n_way   = thread->n_way;
	dim_t      my_id   = thread->work_id;

	dim_t      bf_left = n % bf;

	dim_t      j;

	dim_t      off_j;
	doff_t     diagoff_j;
	dim_t      n_left;

	dim_t      width_j;

	dim_t      offm_inc = 0;
	dim_t      offn_inc = 0;

	double     tri_dim, tri_area;
	double     area_total, area_per_thr;

	siz_t      area = 0;

	// In this function, we assume that the caller has already determined
	// that (a) the diagonal intersects the submatrix, and (b) the submatrix
	// is either lower- or upper-stored.

	if ( bli_is_lower( uplo ) )
	{
		// Prune away the unstored region above the diagonal, if it exists,
		// and then to the right of where the diagonal intersects the bottom,
		// if it exists. (Also, we discard the offset deltas since we don't
		// need to actually index into the subpartition.)
		bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc );
		bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc );

		// We don't need offm_inc, offn_inc here. These statements should
		// prevent compiler warnings.
		( void )offm_inc;
		( void )offn_inc;

		// Now that pruning has taken place, we know that diagoff >= 0.

		// Compute the total area of the submatrix, accounting for the
		// location of the diagonal, and divide it by the number of ways
		// of parallelism.
		tri_dim      = ( double )( n - diagoff - 1 );
		tri_area     = tri_dim * ( tri_dim + 1.0 ) / 2.0;
		area_total   = ( double )m * ( double )n - tri_area;
		area_per_thr = area_total / ( double )n_way;

		// Initialize some variables prior to the loop: the offset to the
		// current subpartition, the remainder of the n dimension, and
		// the diagonal offset of the current subpartition.
		off_j     = 0;
		diagoff_j = diagoff;
		n_left    = n;

		// Iterate over the subpartition indices corresponding to each
		// thread/caucus participating in the n_way parallelism.
		for ( j = 0; j < n_way; ++j )
		{
			// Compute the width of the jth subpartition, taking the
			// current diagonal offset into account, if needed.
			width_j = bli_get_range_width_l( diagoff_j, m, n_left,
			                                 j, n_way,
			                                 bf, bf_left,
			                                 area_per_thr,
			                                 handle_edge_low );

			// If the current thread belongs to caucus j, this is its
			// subpartition. So we compute the implied index range and
			// end our search.
			if ( j == my_id )
			{
				*j_start_thr = off_j;
				*j_end_thr   = off_j + width_j;

				area = bli_find_area_trap_l( m, width_j, diagoff_j );

				break;
			}

			// Shift the current subpartition's starting and diagonal offsets,
			// as well as the remainder of the n dimension, according to the
			// computed width, and then iterate to the next subpartition.
			off_j     += width_j;
			diagoff_j -= width_j;
			n_left    -= width_j;
		}
	}
	else // if ( bli_is_upper( uplo ) )
	{
		// Express the upper-stored case in terms of the lower-stored case.

		// First, we convert the upper-stored trapezoid to an equivalent
		// lower-stored trapezoid by rotating it 180 degrees.
		bli_rotate180_trapezoid( diagoff, uplo );

		// Now that the trapezoid is "flipped" in the n dimension, negate
		// the bool that encodes whether to handle the edge case at the
		// low (or high) end of the index range.
		bli_toggle_bool( handle_edge_low );

		// Compute the appropriate range for the rotated trapezoid.
		area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
		                               handle_edge_low,
		                               j_start_thr, j_end_thr );

		// Reverse the indexing basis for the subpartition ranges so that
		// the indices, relative to left-to-right iteration through the
		// unrotated upper-stored trapezoid, map to the correct columns
		// (relative to the diagonal). This amounts to subtracting the
		// range from n.
		bli_reverse_index_direction( *j_start_thr, *j_end_thr, n );
	}

	return area;
}
// Assigns the calling thread an area-weighted range in the n dimension of
// a, where the total range spans 0 to n-1 with 0 at the left end and n-1
// at the right end.
//
//   thr        - thread info (thrinfo_t*).
//   a          - object being partitioned.
//   bf         - blocking factor.
//   start, end - outputs: the thread's range [*start, *end).
//
// If a does not intersect its diagonal, or is not upper- or lower-stored,
// the unweighted bli_get_range_l2r() is used instead. Returns the area of
// the assigned subpartition.
siz_t bli_get_range_weighted_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
	siz_t area;

	if ( bli_obj_intersects_diag( *a ) &&
	     bli_obj_is_upper_or_lower( *a ) )
	{
		doff_t diagoff = bli_obj_diag_offset( *a );
		uplo_t uplo    = bli_obj_uplo( *a );
		dim_t  m       = bli_obj_length( *a );
		dim_t  n       = bli_obj_width( *a );

		// Support implicit transposition.
		if ( bli_obj_has_trans( *a ) )
		{
			bli_reflect_about_diag( diagoff, uplo, m, n );
		}

		area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
		                               FALSE, start, end );
	}
	else // if dense or zeros
	{
		area = bli_get_range_l2r( thr, a, bf,
		                          start, end );
	}

	return area;
}
void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
siz_t bli_get_range_weighted_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
if ( bli_is_upper_or_lower( uplo ) )
siz_t area;
// This function assigns area-weighted ranges in the n dimension
// where the total range spans 0 to n-1 with 0 at the right end and
// n-1 at the left end.
if ( bli_obj_intersects_diag( *a ) &&
bli_obj_is_upper_or_lower( *a ) )
{
//printf( "bli_get_range_weighted_r2l: is upper or lower\n" );
bli_toggle_uplo( uplo );
bli_get_range_weighted( thr, all_start, all_end, block_factor,
uplo, TRUE, start, end );
doff_t diagoff = bli_obj_diag_offset( *a );
uplo_t uplo = bli_obj_uplo( *a );
dim_t m = bli_obj_length( *a );
dim_t n = bli_obj_width( *a );
// Support implicit transposition.
if ( bli_obj_has_trans( *a ) )
{
bli_reflect_about_diag( diagoff, uplo, m, n );
}
bli_rotate180_trapezoid( diagoff, uplo );
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
TRUE, start, end );
}
else // if dense or zeros
{
//printf( "bli_get_range_weighted_r2l: is dense or zeros\n" );
bli_get_range_r2l( thr, all_start, all_end, block_factor,
start, end );
area = bli_get_range_r2l( thr, a, bf,
start, end );
}
return area;
}
void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
siz_t bli_get_range_weighted_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
if ( bli_is_upper_or_lower( uplo ) )
siz_t area;
// This function assigns area-weighted ranges in the m dimension
// where the total range spans 0 to m-1 with 0 at the top end and
// m-1 at the bottom end.
if ( bli_obj_intersects_diag( *a ) &&
bli_obj_is_upper_or_lower( *a ) )
{
bli_toggle_uplo( uplo );
bli_get_range_weighted( thr, all_start, all_end, block_factor,
uplo, FALSE, start, end );
doff_t diagoff = bli_obj_diag_offset( *a );
uplo_t uplo = bli_obj_uplo( *a );
dim_t m = bli_obj_length( *a );
dim_t n = bli_obj_width( *a );
// Support implicit transposition.
if ( bli_obj_has_trans( *a ) )
{
bli_reflect_about_diag( diagoff, uplo, m, n );
}
bli_reflect_about_diag( diagoff, uplo, m, n );
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
FALSE, start, end );
}
else // if dense or zeros
{
bli_get_range_t2b( thr, all_start, all_end, block_factor,
start, end );
area = bli_get_range_t2b( thr, a, bf,
start, end );
}
return area;
}
// Computes the sub-range [*start,*end) of the m dimension of matrix object a
// for the thread described by thr, moving from bottom to top, and returns the
// area value reported by the underlying range routine.
//   thr:   thread info handle, forwarded unchanged to the range routines.
//   a:     the matrix object being partitioned.
//   bf:    blocking factor, forwarded unchanged to the range routines.
//   start: output; start of this thread's range.
//   end:   output; end of this thread's range.
void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end, dim_t block_factor, uplo_t uplo, dim_t* start, dim_t* end )
siz_t bli_get_range_weighted_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end )
{
if ( bli_is_upper_or_lower( uplo ) )
siz_t area;
// This function assigns area-weighted ranges in the m dimension
// where the total range spans 0 to m-1 with 0 at the bottom end and
// m-1 at the top end.
// Area weighting is only used when the matrix is upper- or lower-stored
// and the diagonal actually intersects it.
if ( bli_obj_intersects_diag( *a ) &&
bli_obj_is_upper_or_lower( *a ) )
{
bli_get_range_weighted( thr, all_start, all_end, block_factor,
uplo, TRUE, start, end );
doff_t diagoff = bli_obj_diag_offset( *a );
uplo_t uplo = bli_obj_uplo( *a );
dim_t m = bli_obj_length( *a );
dim_t n = bli_obj_width( *a );
// Support implicit transposition.
if ( bli_obj_has_trans( *a ) )
{
bli_reflect_about_diag( diagoff, uplo, m, n );
}
// Reflect (again, unconditionally): swaps m and n, negates diagoff and
// toggles uplo. NOTE(review): presumably this recasts the m-dimension
// partitioning as the column-wise problem that bli_get_range_weighted()
// solves -- confirm against its implementation.
bli_reflect_about_diag( diagoff, uplo, m, n );
// NOTE: bli_rotate180_trapezoid() also reads the local variables m and n
// declared above, in addition to its two explicit arguments.
bli_rotate180_trapezoid( diagoff, uplo );
// handle_edge_low is TRUE for the bottom-to-top direction.
area = bli_get_range_weighted( thr, diagoff, uplo, m, n, bf,
TRUE, start, end );
}
else // if dense or zeros
{
bli_get_range_b2t( thr, all_start, all_end, block_factor,
start, end );
// No diagonal to account for: defer to the unweighted b2t routine.
area = bli_get_range_b2t( thr, a, bf,
start, end );
}
return area;
}

View File

@@ -101,13 +101,25 @@ void bli_barrier( thread_comm_t* communicator, dim_t thread_id );
struct thrinfo_s
{
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
dim_t ocomm_id; //Our thread id within that thread comm
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
dim_t icomm_id; //Our thread id within that thread comm
// The thread communicator for the other threads sharing the same work
// at this level.
thread_comm_t* ocomm;
dim_t n_way; //Number of distinct used to parallelize the loop
dim_t work_id; //What we're working on
// Our thread id within the ocomm thread communicator.
dim_t ocomm_id;
// The thread communicator for the other threads sharing the same work
// at this level.
thread_comm_t* icomm;
// Our thread id within the icomm thread communicator.
dim_t icomm_id;
// The number of distinct threads used to parallelize the loop.
dim_t n_way;
// What we're working on.
dim_t work_id;
};
typedef struct thrinfo_s thrinfo_t;
@@ -128,39 +140,37 @@ typedef struct thrinfo_s thrinfo_t;
#define thread_obarrier( thread ) bli_barrier( thread->ocomm, thread->ocomm_id )
#define thread_ibarrier( thread ) bli_barrier( thread->icomm, thread->icomm_id )
void bli_get_range( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor,
bool_t handle_edge_low,
void bli_get_range( void* thr, dim_t n, dim_t bf, bool_t handle_edge_low,
dim_t* start, dim_t* end );
void bli_get_range_l2r( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor,
dim_t* start, dim_t* end );
void bli_get_range_r2l( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor,
dim_t* start, dim_t* end );
void bli_get_range_t2b( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor,
dim_t* start, dim_t* end );
void bli_get_range_b2t( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor,
dim_t* start, dim_t* end );
siz_t bli_get_range_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
siz_t bli_get_range_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
siz_t bli_get_range_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
siz_t bli_get_range_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
void bli_get_range_weighted( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor, uplo_t uplo,
bool_t handle_edge_low,
dim_t* start, dim_t* end );
void bli_get_range_weighted_l2r( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor, uplo_t uplo,
dim_t* start, dim_t* end );
void bli_get_range_weighted_r2l( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor, uplo_t uplo,
dim_t* start, dim_t* end );
void bli_get_range_weighted_t2b( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor, uplo_t uplo,
dim_t* start, dim_t* end );
void bli_get_range_weighted_b2t( void* thr, dim_t all_start, dim_t all_end,
dim_t block_factor, uplo_t uplo,
dim_t* start, dim_t* end );
dim_t bli_get_range_width_l( doff_t diagoff_j,
dim_t m,
dim_t n_j,
dim_t j,
dim_t n_way,
dim_t bf,
dim_t bf_left,
double area_per_thr,
bool_t handle_edge_low );
siz_t bli_find_area_trap_l( dim_t m, dim_t n, doff_t diagoff );
siz_t bli_get_range_weighted( void* thr,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* j_start_thr,
dim_t* j_end_thr );
siz_t bli_get_range_weighted_l2r( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
siz_t bli_get_range_weighted_r2l( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
siz_t bli_get_range_weighted_t2b( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
siz_t bli_get_range_weighted_b2t( void* thr, obj_t* a, dim_t bf, dim_t* start, dim_t* end );
thrinfo_t* bli_create_thread_info( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,

View File

@@ -156,8 +156,8 @@
#define bli_obj_is_upper_or_lower( obj ) \
\
( ( (obj).info & BLIS_UPLO_BITS ) == BLIS_BITVAL_UPPER || \
( (obj).info & BLIS_UPLO_BITS ) == BLIS_BITVAL_LOWER )
( bli_obj_is_upper( obj ) || \
bli_obj_is_lower( obj ) )
#define bli_obj_is_dense( obj ) \
\
@@ -441,11 +441,15 @@
#define bli_obj_length( obj ) \
\
((obj).m)
( (obj).dim[BLIS_M] )
#define bli_obj_width( obj ) \
\
((obj).n)
( (obj).dim[BLIS_N] )
#define bli_obj_dim( mdim, obj ) \
\
( (obj).dim[mdim] )
#define bli_obj_min_dim( obj ) \
\
@@ -579,23 +583,38 @@ bli_obj_width_stored( obj )
// Dimension modification
#define bli_obj_set_length( dim_m, obj ) \
{ \
(obj).dim[BLIS_M] = dim_m; \
}
#define bli_obj_set_width( dim_n, obj ) \
{ \
(obj).dim[BLIS_N] = dim_n; \
}
#define bli_obj_set_dim( mdim, dim_val, obj ) \
{ \
(obj).dim[mdim] = dim_val; \
}
#define bli_obj_set_dims( dim_m, dim_n, obj ) \
{ \
(obj).m = dim_m; \
(obj).n = dim_n; \
bli_obj_set_length( dim_m, obj ); \
bli_obj_set_width( dim_n, obj ); \
}
#define bli_obj_set_dims_with_trans( trans, dim_m, dim_n, obj ) \
{ \
if ( bli_does_notrans( trans ) ) \
{ \
(obj).m = dim_m; \
(obj).n = dim_n; \
bli_obj_set_length( dim_m, obj ); \
bli_obj_set_width( dim_n, obj ); \
} \
else \
{ \
(obj).m = dim_n; \
(obj).n = dim_m; \
bli_obj_set_length( dim_n, obj ); \
bli_obj_set_width( dim_m, obj ); \
} \
}
@@ -604,15 +623,15 @@ bli_obj_width_stored( obj )
#define bli_obj_row_stride( obj ) \
\
((obj).rs)
( (obj).rs )
#define bli_obj_col_stride( obj ) \
\
((obj).cs)
( (obj).cs )
#define bli_obj_imag_stride( obj ) \
\
((obj).is)
( (obj).is )
#define bli_obj_row_stride_mag( obj ) \
\
@@ -671,41 +690,60 @@ bli_obj_width_stored( obj )
// Offset query
#define bli_obj_row_offset( obj ) \
#define bli_obj_row_off( obj ) \
\
( (obj).offm )
( (obj).off[BLIS_M] )
#define bli_obj_col_offset( obj ) \
#define bli_obj_col_off( obj ) \
\
( (obj).offn )
( (obj).off[BLIS_N] )
#define bli_obj_off( mdim, obj ) \
\
( (obj).off[mdim] )
// Offset modification
#define bli_obj_set_off( mdim, offset, obj ) \
{ \
(obj).off[mdim] = offset; \
}
#define bli_obj_set_offs( offset_m, offset_n, obj ) \
{ \
(obj).offm = offset_m; \
(obj).offn = offset_n; \
bli_obj_set_off( BLIS_M, offset_m, obj ); \
bli_obj_set_off( BLIS_N, offset_n, obj ); \
}
#define bli_obj_inc_off( mdim, offset, obj ) \
{ \
(obj).off[mdim] += offset; \
}
#define bli_obj_inc_offm( offset, obj ) \
{ \
bli_obj_inc_off( BLIS_M, offset, obj ); \
}
#define bli_obj_inc_offn( offset, obj ) \
{ \
bli_obj_inc_off( BLIS_N, offset, obj ); \
}
#define bli_obj_inc_offs( offset_m, offset_n, obj ) \
{ \
(obj).offm += offset_m; \
(obj).offn += offset_n; \
bli_obj_inc_off( BLIS_M, offset_m, obj ); \
bli_obj_inc_off( BLIS_N, offset_n, obj ); \
}
#define bli_obj_dec_offs( offset_m, offset_n, obj ) \
{ \
(obj).offm -= offset_m; \
(obj).offn -= offset_n; \
}
// Diagonal offset query
#define bli_obj_diag_offset( obj ) \
\
((obj).diag_off)
( (obj).diag_off )
#define bli_obj_diag_offset_after_trans( obj ) \
\
@@ -762,7 +800,7 @@ bli_obj_width_stored( obj )
#define bli_obj_buffer( obj ) \
\
(obj).buffer
( (obj).buffer )
// Buffer address modification
@@ -776,7 +814,7 @@ bli_obj_width_stored( obj )
#define bli_obj_internal_scalar_buffer( obj ) \
\
&((obj).scalar)
&( (obj).scalar )
// Bufferless scalar field modification
@@ -794,7 +832,7 @@ bli_obj_width_stored( obj )
#define bli_obj_elem_size( obj ) \
\
(obj).elem_size \
( (obj).elem_size )
// Element size modification
@@ -851,19 +889,19 @@ bli_obj_width_stored( obj )
#define bli_obj_panel_length( obj ) \
\
((obj).m_panel)
( (obj).m_panel )
#define bli_obj_panel_width( obj ) \
\
((obj).n_panel)
( (obj).n_panel )
#define bli_obj_panel_dim( obj ) \
\
((obj).pd)
( (obj).pd )
#define bli_obj_panel_stride( obj ) \
\
((obj).ps)
( (obj).ps )
// Packed panel info modification
@@ -969,15 +1007,19 @@ bli_obj_width_stored( obj )
#define bli_obj_buffer_for_const( dt, obj ) \
\
( void* )( \
( ( char* )( (obj).buffer ) ) + dt * BLIS_CONSTANT_SLOT_SIZE \
( ( char* )( bli_obj_buffer( obj ) ) ) + \
( dim_t )( dt * BLIS_CONSTANT_SLOT_SIZE ) \
)
#define bli_obj_buffer_at_off( obj ) \
\
( void* )( \
( ( char* )( (obj).buffer ) ) + ( dim_t )(obj).elem_size * \
( (obj).offn * (obj).cs + \
(obj).offm * (obj).rs ) \
( ( char* )( bli_obj_buffer ( obj ) ) + \
( dim_t )( bli_obj_elem_size( obj ) ) * \
( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + \
bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) \
) \
) \
)
#define bli_obj_buffer_for_1x1( dt, obj ) \
@@ -1015,8 +1057,8 @@ bli_obj_width_stored( obj )
dim_t n = bli_obj_width( obj ); \
inc_t rs = bli_obj_row_stride( obj ); \
inc_t cs = bli_obj_col_stride( obj ); \
dim_t offm = bli_obj_row_offset( obj ); \
dim_t offn = bli_obj_col_offset( obj ); \
dim_t offm = bli_obj_row_off( obj ); \
dim_t offn = bli_obj_col_off( obj ); \
doff_t diag_off = bli_obj_diag_offset( obj ); \
\
bli_obj_set_dims( n, m, obj ); \
@@ -1047,8 +1089,8 @@ bli_obj_width_stored( obj )
{ \
dim_t m = bli_obj_length( obj ); \
dim_t n = bli_obj_width( obj ); \
dim_t offm = bli_obj_row_offset( obj ); \
dim_t offn = bli_obj_col_offset( obj ); \
dim_t offm = bli_obj_row_off( obj ); \
dim_t offn = bli_obj_col_off( obj ); \
doff_t diag_off = bli_obj_diag_offset( obj ); \
\
bli_obj_set_dims( n, m, obj ); \

View File

@@ -144,7 +144,8 @@
#define bli_is_upper_or_lower( uplo ) \
\
( bli_is_upper( uplo ) || bli_is_lower( uplo ) )
( bli_is_upper( uplo ) || \
bli_is_lower( uplo ) )
#define bli_is_dense( uplo ) \
\
@@ -470,6 +471,106 @@
( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) )
// pruning-related
// Prunes the rows above the point where the diagonal meets the left edge
// (the _l suffix presumably denotes a lower-stored matrix). When diagoff is
// negative, m is reduced by |diagoff|, offm_inc is set to |diagoff| and
// diagoff is reset to 0; otherwise only offm_inc is written (as 0). n is not
// modified. diagoff, m and offm_inc are assigned to, so they must be lvalues.
#define bli_prune_unstored_region_top_l( diagoff, m, n, offm_inc ) \
{ \
offm_inc = 0; \
\
/* If the diagonal intersects the left side of the matrix,
ignore the area above that intersection. */ \
if ( diagoff < 0 ) \
{ \
m = m + diagoff; \
offm_inc = - diagoff; \
diagoff = 0; \
} \
}
// Prunes the columns to the right of the point where the diagonal meets the
// bottom edge: n is clipped to diagoff + m when it exceeds that value.
// offn_inc is always set to 0; diagoff and m are not modified. n and offn_inc
// are assigned to, so they must be lvalues.
#define bli_prune_unstored_region_right_l( diagoff, m, n, offn_inc ) \
{ \
offn_inc = 0; \
\
/* If the diagonal intersects the bottom side of the matrix,
ignore the area to the right of that intersection. */ \
if ( n > diagoff + m ) \
{ \
n = diagoff + m; \
} \
}
// Prunes the columns to the left of the point where the diagonal meets the
// top edge (the _u suffix presumably denotes an upper-stored matrix). When
// diagoff is positive, n is reduced by diagoff, offn_inc is set to diagoff and
// diagoff is reset to 0; otherwise only offn_inc is written (as 0). m is not
// modified. diagoff, n and offn_inc are assigned to, so they must be lvalues.
#define bli_prune_unstored_region_left_u( diagoff, m, n, offn_inc ) \
{ \
offn_inc = 0; \
\
/* If the diagonal intersects the top side of the matrix,
ignore the area to the left of that intersection. */ \
if ( diagoff > 0 ) \
{ \
n = n - diagoff; \
offn_inc = + diagoff; \
diagoff = 0; \
} \
}
// Prunes the rows below the point where the diagonal meets the right edge:
// m is clipped to n - diagoff when it exceeds that value. offm_inc is always
// set to 0; diagoff and n are not modified. m and offm_inc are assigned to,
// so they must be lvalues.
#define bli_prune_unstored_region_bottom_u( diagoff, m, n, offm_inc ) \
{ \
offm_inc = 0; \
\
/* If the diagonal intersects the right side of the matrix,
ignore the area below that intersection. */ \
if ( m > -diagoff + n ) \
{ \
m = -diagoff + n; \
} \
}
// thread range-related
// Updates diagoff and uplo to describe the trapezoid after a 180-degree
// rotation: diagoff becomes n - diagoff - m and uplo is toggled.
// NOTE: in addition to its two arguments, this macro reads variables named
// m and n from the scope in which it is expanded; the caller must have both
// in scope, holding the dimensions that correspond to diagoff.
#define bli_rotate180_trapezoid( diagoff, uplo ) \
{ \
diagoff = n - diagoff - m; \
bli_toggle_uplo( uplo ); \
}
// Mirrors the range (start, end) within a dimension of size n: the new start
// is n - (old end) and the new end is n - (old start). start and end are
// overwritten, so they must be lvalues. The temporaries start2 and end2 are
// declared inside the macro's block, so arguments with those names would be
// shadowed.
#define bli_reverse_index_direction( start, end, n ) \
{ \
dim_t start2 = n - start; \
dim_t end2 = n - end; \
start = end2; \
end = start2; \
}
// Reflects a matrix description about its diagonal by applying, in order,
// bli_swap_dims() to m and n, bli_negate_diag_offset() to diagoff and
// bli_toggle_uplo() to uplo. Callers pass local variables and use their
// values afterwards, i.e. the arguments are updated in place.
#define bli_reflect_about_diag( diagoff, uplo, m, n ) \
{ \
bli_swap_dims( m, n ); \
bli_negate_diag_offset( diagoff ); \
bli_toggle_uplo( uplo ); \
}
// mdim_t-related
// Evaluates to nonzero if mdim (an mdim_t value) equals BLIS_M, and to zero
// otherwise. The argument is parenthesized so that a compound expression
// (e.g. a conditional expression) is compared as a whole.
#define bli_is_m_dim( mdim ) \
\
	( ( mdim ) == BLIS_M )
// Evaluates to nonzero if mdim (an mdim_t value) equals BLIS_N, and to zero
// otherwise. The argument is parenthesized so that a compound expression is
// compared as a whole rather than being split by operator precedence.
#define bli_is_n_dim( mdim ) \
\
	( ( mdim ) == BLIS_N )
// Evaluates to the other dimension index: BLIS_N if mdim equals BLIS_M, and
// BLIS_M otherwise. The argument is parenthesized so that a compound
// expression is compared as a whole before the selection is made.
#define bli_dim_toggled( mdim ) \
\
	( ( mdim ) == BLIS_M ? BLIS_N : BLIS_M )
// Replaces mdim in place with the other dimension index, as computed by
// bli_dim_toggled(). mdim is assigned to, so it must be an lvalue.
#define bli_toggle_dim( mdim ) \
{ \
mdim = bli_dim_toggled( mdim ); \
}
// index-related
#define bli_is_edge_f( i1, iter, left ) \

View File

@@ -243,6 +243,22 @@
bli_fmax( bli_fabs( a ), \
bli_fabs( b ) )
// round
// Thin wrapper around the C math library function round(); the expansion
// requires a declaration of round() (from <math.h>) to be in scope.
#define bli_round( val ) \
\
( round( val ) )
// round_to_mult
// Rounds val to the nearest multiple of mult, with ties rounding up:
//   ( ( val + mult/2 ) / mult ) * mult
// val is first converted to guint_t, and the result has type guint_t. Both
// arguments are fully parenthesized so that compound expressions such as
// a + b are converted, divided and multiplied as a whole. mult must be
// nonzero. Note that mult is evaluated more than once, so it should not have
// side effects.
#define bli_round_to_mult( val, mult ) \
\
	( ( guint_t )( ( ( ( guint_t )( val ) + \
	                   ( guint_t )( mult ) / 2 \
	                 ) / ( mult ) \
	               ) * ( mult ) \
	             ) )
// isnan, isinf
#define bli_isinf( a ) isinf( a )

View File

@@ -591,10 +591,8 @@ typedef struct obj_s
// Basic fields
struct obj_s* root;
dim_t offm;
dim_t offn;
dim_t m;
dim_t n;
dim_t off[2];
dim_t dim[2];
doff_t diag_off;
objbits_t info;
@@ -626,10 +624,10 @@ typedef struct obj_s
{ \
(b).root = (a).root; \
\
(b).offm = (a).offm; \
(b).offn = (a).offn; \
(b).m = (a).m; \
(b).n = (a).n; \
(b).off[0] = (a).off[0]; \
(b).off[1] = (a).off[1]; \
(b).dim[0] = (a).dim[0]; \
(b).dim[1] = (a).dim[1]; \
(b).diag_off = (a).diag_off; \
\
(b).info = (a).info; \
@@ -669,8 +667,8 @@ typedef struct obj_s
{ \
(b).root = (a).root; \
\
(b).offm = (a).offm; \
(b).offn = (a).offn; \
(b).off[0] = (a).off[0]; \
(b).off[1] = (a).off[1]; \
/* Avoid copying m since it will be overwritten. */ \
/* Avoid copying n since it will be overwritten. */ \
(b).diag_off = (a).diag_off; \
@@ -727,6 +725,15 @@ typedef enum
} subpart_t;
// -- Matrix dimension type --
// Index constants for per-dimension array fields of obj_t: the accessor
// macros read the object's length from dim[BLIS_M] and its width from
// dim[BLIS_N], and likewise the row and column offsets from off[BLIS_M] and
// off[BLIS_N].
typedef enum
{
BLIS_M = 0,
BLIS_N = 1
} mdim_t;
// -- Machine parameter types --
typedef enum

View File

@@ -113,6 +113,7 @@ extern "C" {
#include "bli_pool.h"
#include "bli_mem.h"
#include "bli_part.h"
#include "bli_prune.h"
#include "bli_query.h"
#include "bli_blocksize.h"
#include "bli_func.h"

203
test/thread_ranges/Makefile Normal file
View File

@@ -0,0 +1,203 @@
#!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas at Austin nor the names
# of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
#
# Makefile
#
# Field G. Van Zee
#
# Makefile for standalone BLIS test drivers.
#
#
# --- Makefile PHONY target definitions ----------------------------------------
#
.PHONY: all \
test-ranges \
clean cleanx
#
# --- Makefile initialization --------------------------------------------------
#
# Define the name of the configuration file.
CONFIG_MK_FILE := config.mk
# Define the name of the file containing build and architecture-specific
# makefile definitions.
MAKE_DEFS_FILE := make_defs.mk
# Locations of important files.
ROOT_PATH := ../..
CONFIG_DIR := config
#
# --- Include makefile configuration file --------------------------------------
#
# Construct the path to the makefile configuration file that was generated by
# the configure script.
CONFIG_MK_PATH := $(ROOT_PATH)/$(CONFIG_MK_FILE)
# Include the configuration file.
-include $(CONFIG_MK_PATH)
# Detect whether we actually got the configuration file. If we didn't, then
# it is likely that the user has not yet generated it (via configure).
ifeq ($(strip $(CONFIG_MK_INCLUDED)),yes)
CONFIG_MK_PRESENT := yes
else
CONFIG_MK_PRESENT := no
endif
# Now we have access to CONFIG_NAME, which tells us which sub-directory of the
# config directory to use as our configuration.
CONFIG_PATH := $(ROOT_PATH)/$(CONFIG_DIR)/$(CONFIG_NAME)
#
# --- Include makefile definitions file ----------------------------------------
#
# Construct the path to the makefile definitions file residing inside of
# the configuration sub-directory.
MAKE_DEFS_MK_PATH := $(CONFIG_PATH)/$(MAKE_DEFS_FILE)
# Include the makefile definitions file.
-include $(MAKE_DEFS_MK_PATH)
# Detect whether we actually got the make definitions file. If we didn't, then
# it is likely that the configuration is invalid (or incomplete).
ifeq ($(strip $(MAKE_DEFS_MK_INCLUDED)),yes)
MAKE_DEFS_MK_PRESENT := yes
else
MAKE_DEFS_MK_PRESENT := no
endif
#
# --- BLAS and LAPACK implementations ------------------------------------------
#
# BLIS library and header path. This is simply wherever it was installed.
BLIS_LIB_PATH := $(INSTALL_PREFIX)/lib
BLIS_INC_PATH := $(INSTALL_PREFIX)/include/blis
# BLIS library.
BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
#
# --- General build definitions ------------------------------------------------
#
TEST_SRC_PATH := .
TEST_OBJ_PATH := .
# Gather all local object files.
TEST_OBJS := $(patsubst $(TEST_SRC_PATH)/%.c, \
$(TEST_OBJ_PATH)/%.o, \
$(wildcard $(TEST_SRC_PATH)/*.c))
# Override CFLAGS from make_defs.mk here, if desired.
#CFLAGS := -g -O2 -march=native
# Add installed and local header paths to CFLAGS
CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) #-I$(ACML_INC_PATH)
LINKER := $(CC)
LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64
LDFLAGS += -lgfortran -lm -lpthread -fopenmp
# Datatype
DT_S := -DDT=BLIS_FLOAT
DT_D := -DDT=BLIS_DOUBLE
DT_C := -DDT=BLIS_SCOMPLEX
DT_Z := -DDT=BLIS_DCOMPLEX
# Problem size specification
PDEF_MT := -DP_BEGIN=400 \
-DP_END=8000 \
-DP_INC=400
#
# --- Targets/rules ------------------------------------------------------------
#
all: test-ranges
test-ranges: \
test_ranges.x
# --Object file rules --
$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
$(CC) $(CFLAGS) -c $< -o $@
# Test driver objects: also pass the problem size range and datatype macros.
test_%.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) -c $< -o $@
# -- Executable file rules --
# NOTE: For the BLAS test drivers, we place the BLAS libraries before BLIS
# on the link command line in case BLIS was configured with the BLAS
# compatibility layer. This prevents BLIS from inadvertently getting called
# for the BLAS routines we are trying to test with.
test_ranges.x: test_ranges.o $(BLIS_LIB)
$(LINKER) $< $(BLIS_LIB) $(LDFLAGS) -o $@
# -- Clean rules --
clean: cleanx
cleanx:
- $(RM_F) *.o *.x

View File

@@ -0,0 +1,313 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define PRINT
// Test driver that simulates the ranges assigned to threads by the
// bli_get_range_weighted_*() routines.
//
// For each problem size p in [p_begin, p_max] (stepping by p_inc) an m x n
// triangular object is created with the requested uplo and diagonal offset,
// and for each of the n_way simulated threads the corresponding range routine
// is called. One row is printed per problem size and one column per thread,
// showing either the width and area of the subpartition (out = 'w') or its
// [start,end) range (any other value of out).
//
// Exactly twelve command line arguments are expected; otherwise a usage
// message is printed and the program exits with status 1. Returns 0 on
// success.
int main( int argc, char** argv )
{
	bli_init();

#if 0
	obj_t a, b, c;
	obj_t aa, bb, cc;
	dim_t m, n, k;
	num_t dt;
	uplo_t uploa, uplob, uploc;

	{
		dt = BLIS_DOUBLE;

		m = 6;
		k = 6;
		n = 6;

		bli_obj_create( dt, m, k, 0, 0, &a );
		bli_obj_create( dt, k, n, 0, 0, &b );
		bli_obj_create( dt, m, n, 0, 0, &c );

		uploa = BLIS_UPPER;
		uploa = BLIS_LOWER;
		bli_obj_set_struc( BLIS_TRIANGULAR, a );
		bli_obj_set_uplo( uploa, a );
		bli_obj_set_diag_offset( -2, a );

		uplob = BLIS_UPPER;
		uplob = BLIS_LOWER;
		bli_obj_set_struc( BLIS_TRIANGULAR, b );
		bli_obj_set_uplo( uplob, b );
		bli_obj_set_diag_offset( -2, b );

		uploc = BLIS_UPPER;
		//uploc = BLIS_LOWER;
		//uploc = BLIS_ZEROS;
		//uploc = BLIS_DENSE;
		bli_obj_set_struc( BLIS_HERMITIAN, c );
		//bli_obj_set_struc( BLIS_TRIANGULAR, c );
		bli_obj_set_uplo( uploc, c );
		bli_obj_set_diag_offset( 1, c );

		bli_obj_alias_to( a, aa ); (void)aa;
		bli_obj_alias_to( b, bb ); (void)bb;
		bli_obj_alias_to( c, cc ); (void)cc;

		bli_randm( &a );
		bli_randm( &b );
		bli_randm( &c );
		//bli_mkherm( &a );
		//bli_mktrim( &a );

		bli_prune_unref_mparts( &cc, BLIS_M,
		                        &aa, BLIS_N );

		bli_printm( "c orig", &c, "%4.1f", "" );
		bli_printm( "c alias", &cc, "%4.1f", "" );
		bli_printm( "a orig", &a, "%4.1f", "" );
		bli_printm( "a alias", &aa, "%4.1f", "" );
		//bli_obj_print( "a struct", &a );
	}
#endif

	// Values parsed from the command line.
	dim_t p_begin, p_max, p_inc;
	gint_t m_input, n_input;
	char uploa_ch;
	doff_t diagoffa;
	dim_t bf;
	dim_t n_way;
	char part_dim_ch;
	bool_t go_fwd;
	char out_ch;

	// go_fwd is parsed through a temporary whose type matches the "%lu"
	// conversion, since the underlying type of bool_t is not guaranteed to
	// be unsigned long. Defaults to forward if the argument does not parse.
	unsigned long go_fwd_input = 1;

	obj_t a;
	thrinfo_t thrinfo;
	dim_t m, n;
	uplo_t uploa;
	bool_t part_m_dim, part_n_dim;
	bool_t go_bwd;
	dim_t p;
	num_t dt;
	dim_t start, end;
	dim_t width;
	siz_t area;
	gint_t t_begin, t_stop, t_inc;
	dim_t t;

	if ( argc == 13 )
	{
		sscanf( argv[1], "%lu", &p_begin );
		sscanf( argv[2], "%lu", &p_max );
		sscanf( argv[3], "%lu", &p_inc );
		sscanf( argv[4], "%ld", &m_input );
		sscanf( argv[5], "%ld", &n_input );
		sscanf( argv[6], "%c", &uploa_ch );
		sscanf( argv[7], "%ld", &diagoffa );
		sscanf( argv[8], "%lu", &bf );
		sscanf( argv[9], "%lu", &n_way );
		sscanf( argv[10], "%c", &part_dim_ch );
		sscanf( argv[11], "%lu", &go_fwd_input );
		sscanf( argv[12], "%c", &out_ch );

		go_fwd = ( go_fwd_input != 0 ? TRUE : FALSE );
	}
	else
	{
		printf( "\n" );
		printf( " %s\n", argv[0] );
		printf( "\n" );
		printf( " Simulate the dimension ranges assigned to threads when\n" );
		printf( " partitioning a matrix for parallelism in BLIS.\n" );
		printf( "\n" );
		printf( " Usage:\n" );
		printf( "\n" );
		printf( " %s p_beg p_max p_inc m n uplo doff bf n_way part_dim go_fwd out\n", argv[0] );
		printf( "\n" );
		printf( " p_beg: the first problem size p to test.\n" );
		printf( " p_max: the maximum problem size p to test.\n" );
		printf( " p_inc: the increase in problem size p between tests.\n" );
		printf( " m: the m dimension:\n" );
		printf( " n: the n dimension:\n" );
		printf( " if m,n = -1: bind m,n to problem size p.\n" );
		printf( " if m,n = 0: bind m,n to p_max.\n" );
		printf( " if m,n > 0: hold m,n = c constant for all p.\n" );
		printf( " uplo: the uplo field of the matrix being partitioned:\n" );
		printf( " 'l': lower-stored (BLIS_LOWER)\n" );
		printf( " 'u': upper-stored (BLIS_UPPER)\n" );
		printf( " 'd': densely-stored (BLIS_DENSE)\n" );
		printf( " doff: the diagonal offset of the matrix being partitioned.\n" );
		printf( " bf: the simulated blocking factor. all thread ranges must\n" );
		printf( " be a multiple of bf, except for the range that contains\n" );
		printf( " the edge case (if one exists). the blocking factor\n" );
		printf( " would typically correspond to a register blocksize.\n" );
		printf( " n_way: the number of ways of parallelism for which we are\n" );
		printf( " partitioning (i.e.: the number of threads, or thread\n" );
		printf( " groups).\n" );
		printf( " part_dim: the dimension to partition:\n" );
		printf( " 'm': partition the m dimension.\n" );
		printf( " 'n': partition the n dimension.\n" );
		printf( " go_fwd: the direction to partition:\n" );
		// Partitioning the n dimension moves left/right and partitioning the
		// m dimension moves top/bottom (see the dispatch in the loop below).
		printf( " '1': forward, e.g. left-to-right (part_dim = 'n') or\n" );
		printf( " top-to-bottom (part_dim = 'm')\n" );
		printf( " '0': backward, e.g. right-to-left (part_dim = 'n') or\n" );
		printf( " bottom-to-top (part_dim = 'm')\n" );
		printf( " NOTE: reversing the direction does not change the\n" );
		printf( " subpartitions' widths, but it does change which end of\n" );
		printf( " the index range receives the edge case, if it exists.\n" );
		printf( " out: the type of output per thread-column:\n" );
		printf( " 'w': the width (and area) of the thread's subpartition\n" );
		printf( " 'r': the actual ranges of the thread's subpartition\n" );
		printf( " where the start and end points of each range are\n" );
		printf( " inclusive and exclusive, respectively.\n" );
		printf( "\n" );

		exit(1);
	}

	// A problem size increment below 1 would keep the problem size loop
	// below from ever reaching p_max.
	if ( p_inc < 1 )
	{
		printf( "%s: p_inc must be at least 1.\n", argv[0] );
		exit(1);
	}

	// An input of 0 binds the dimension to the maximum problem size.
	if ( m_input == 0 ) m_input = p_max;
	if ( n_input == 0 ) n_input = p_max;

	// Anything other than 'm' selects the n dimension.
	if ( part_dim_ch == 'm' ) { part_m_dim = TRUE; part_n_dim = FALSE; }
	else { part_m_dim = FALSE; part_n_dim = TRUE; }

	go_bwd = !go_fwd;

	// Anything other than 'l' or 'u' selects dense storage.
	if ( uploa_ch == 'l' ) uploa = BLIS_LOWER;
	else if ( uploa_ch == 'u' ) uploa = BLIS_UPPER;
	else uploa = BLIS_DENSE;

	// Choose the order in which thread columns are printed: ascending or
	// descending thread id, depending on uplo and the partitioned dimension.
	if ( part_n_dim )
	{
		if ( bli_is_upper( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; }
		else /* if lower or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; }
	}
	else // if ( part_m_dim )
	{
		if ( bli_is_lower( uploa ) ) { t_begin = n_way-1; t_stop = -1; t_inc = -1; }
		else /* if upper or dense */ { t_begin = 0; t_stop = n_way; t_inc = 1; }
	}

	// Print the summary of the chosen parameters.
	printf( "\n" );
	printf( " part: %3s doff: %3ld bf: %3ld output: %s\n",
	        ( part_n_dim ? ( go_fwd ? "l2r" : "r2l" )
	                     : ( go_fwd ? "t2b" : "b2t" ) ),
	        diagoffa, bf,
	        ( out_ch == 'w' ? "width(area)" : "ranges" ) );
	printf( " uplo: %3c nt: %3ld\n", uploa_ch, n_way );
	printf( "\n" );

	// Print the orientation labels above the first and last thread columns.
	printf( " " );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		if ( part_n_dim )
		{
			if ( t == t_begin ) printf( "left... " );
			else if ( t == t_stop-t_inc ) printf( " ...right" );
			else printf( " " );
		}
		else // if ( part_m_dim )
		{
			if ( t == t_begin ) printf( "top... " );
			else if ( t == t_stop-t_inc ) printf( " ...bottom" );
			else printf( " " );
		}
	}
	printf( "\n" );

	// Print the column headings.
	printf( "%4c x %4c ", 'm', 'n' );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		printf( "%9s %lu ", "thread", t );
	}
	printf( "\n" );

	// Print the separator line.
	printf( "-------------" );
	for ( t = t_begin; t != t_stop; t += t_inc )
	{
		printf( "-------------" );
	}
	printf( "\n" );

	for ( p = p_begin; p <= p_max; p += p_inc )
	{
		// A negative input binds the dimension to the current problem size.
		if ( m_input < 0 ) m = ( dim_t )p;
		else m = ( dim_t )m_input;
		if ( n_input < 0 ) n = ( dim_t )p;
		else n = ( dim_t )n_input;

		dt = BLIS_DOUBLE;

		bli_obj_create( dt, m, n, 0, 0, &a );

		bli_obj_set_struc( BLIS_TRIANGULAR, a );
		bli_obj_set_uplo( uploa, a );
		bli_obj_set_diag_offset( diagoffa, a );

		bli_randm( &a );

		printf( "%4lu x %4lu ", m, n );

		for ( t = t_begin; t != t_stop; t += t_inc )
		{
			// Only the n_way and work_id fields of thrinfo are set here.
			thrinfo.n_way = n_way;
			thrinfo.work_id = t;

			if ( part_n_dim && go_fwd )
				area = bli_get_range_weighted_l2r( &thrinfo, &a, bf, &start, &end );
			else if ( part_n_dim && go_bwd )
				area = bli_get_range_weighted_r2l( &thrinfo, &a, bf, &start, &end );
			else if ( part_m_dim && go_fwd )
				area = bli_get_range_weighted_t2b( &thrinfo, &a, bf, &start, &end );
			else // ( part_m_dim && go_bwd )
				area = bli_get_range_weighted_b2t( &thrinfo, &a, bf, &start, &end );

			width = end - start;

			if ( out_ch == 'w' ) printf( "%4lu(%6lu) ", width, area );
			else printf( "[%4lu,%4lu) ", start, end );
		}

		printf( "\n" );

		bli_obj_free( &a );
	}

	bli_finalize();

	return 0;
}