mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Made framework changes to initialize specific cache block sizes for TRSM.
Details: - This commit addresses the performance optimization (single-thread and multi-thread) of DTRSM on zen2. - The new optimization employs different MC, KC and NC values for TRSM than those used by other Level-3 routines such as DGEMM. - Changed the TRSM framework code to choose these block sizes for TRSM on zen family configurations. - Added a new field called "trsm_blkszs" to the cntx structure in order to store TRSM-specific block sizes. - Implemented routines to initialize, set and query the TRSM-specific block sizes. - Defined a new macro "AOCL_BLIS_ZEN" in the configure script. This macro is automatically defined for zen family architectures. It enables us to choose different cache block sizes for TRSM instead of the common level-3 block sizes. Change-Id: Id8557b1c962a316b1edecca9cd582675eaf35fe6 Signed-off-by: Meghana Vankadari <meghana.vankadari@amd.com> AMD-Internal: [CPUPL-656]
This commit is contained in:
@@ -45,6 +45,12 @@
|
||||
// Enabled kernel sets (kernel_list)
|
||||
@kernel_list_defines@
|
||||
|
||||
//This macro is enabled only for ZEN family configurations.
|
||||
//This enables us to use different cache-blocking sizes for TRSM instead of common level-3 cache-block sizes.
|
||||
#if @enable_aocl_zen@
|
||||
#define AOCL_BLIS_ZEN
|
||||
#endif
|
||||
|
||||
#if @enable_openmp@
|
||||
#define BLIS_ENABLE_OPENMP
|
||||
#endif
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -168,6 +168,20 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for level-3 TRSM execution.
|
||||
bli_cntx_set_trsm_blkszs
|
||||
(
|
||||
5,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -137,8 +137,28 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
);
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
//Initialize TRSM blocksize objects with architecture-specific values.
|
||||
//Using different cache block sizes for TRSM instead of common level-3 block sizes.
|
||||
//Tuning is done for double-precision only.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for level-3 TRSM problems.
|
||||
bli_cntx_set_trsm_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values. s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 100, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 120, -1, -1 );
|
||||
|
||||
15
configure
vendored
15
configure
vendored
@@ -5,7 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
# Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -3016,6 +3016,18 @@ main()
|
||||
uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
|
||||
config_name_define="#define BLIS_FAMILY_${uconf}\n"
|
||||
|
||||
# Create an AOCL-specific #define.
|
||||
#This macro is enabled only for zen family configurations.
|
||||
#This enables us to use different cache block sizes for TRSM instead of common level-3 block sizes.
|
||||
# Decide whether the selected configuration belongs to the zen family, i.e.
# whether its name contains "zen". The result is recorded twice: as a
# yes/no string and as a 0/1 value that is substituted for the
# @enable_aocl_zen@ placeholder.
#
# A pattern match is used instead of a grep pipeline so that no subprocess is
# needed and the uppercased name held in 'uconf' is left untouched.
# NOTE: shell assignments must not have spaces around '='; writing
# "var = value" would try to execute a command named 'var'.
case "${config_name}" in
	*zen*)
		enable_aocl_zen='yes'
		enable_aocl_zen_01=1
		;;
	*)
		enable_aocl_zen='no'
		enable_aocl_zen_01=0
		;;
esac
|
||||
|
||||
# Create a list of #defines, one for each configuration in config_list.
|
||||
config_list_defines=""
|
||||
for conf in ${config_list}; do
|
||||
@@ -3126,6 +3138,7 @@ main()
|
||||
| perl -pe "s/\@config_name_define\@/${config_name_define}/g" \
|
||||
| perl -pe "s/\@config_list_defines\@/${config_list_defines}/g" \
|
||||
| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
|
||||
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen_01}/g" \
|
||||
| sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \
|
||||
| sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
|
||||
| sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
/*
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,7 +35,6 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
dim_t bli_l3_determine_kc
|
||||
(
|
||||
dir_t direct,
|
||||
@@ -311,7 +311,7 @@ dim_t PASTEMAC0(opname) \
|
||||
/* Extract the execution datatype and use it to query the corresponding
|
||||
blocksize and blocksize maximum values from the blksz_t object. */ \
|
||||
dt = bli_obj_exec_dt( a ); \
|
||||
bsize = bli_cntx_get_blksz( bszid, cntx ); \
|
||||
bsize = TRSM_BLKSZ_FUNC( bszid, cntx ); \
|
||||
b_alg = bli_blksz_get_def( dt, bsize ); \
|
||||
b_max = bli_blksz_get_max( dt, bsize ); \
|
||||
\
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -80,9 +80,15 @@ void bli_trsm_blk_var1
|
||||
{
|
||||
obj_t a11_1, c1_1;
|
||||
|
||||
//For zen architectures, TRSM uses different MC, KC and NC blocking sizes than other Level-3 routines.
|
||||
//Hence calling a different function to query TRSM-specific block sizes for zen family.
|
||||
#ifdef AOCL_BLIS_ZEN
|
||||
b_alg = bli_determine_blocksize_trsm( direct, i, my_end, &a11,
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
#else
|
||||
b_alg = bli_determine_blocksize( direct, i, my_end, &a11,
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
#endif
|
||||
// Acquire partitions for A1 and C1.
|
||||
bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
|
||||
i, b_alg, &a11, &a11_1 );
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -67,8 +67,15 @@ void bli_trsm_blk_var2
|
||||
for ( dim_t i = my_start; i < my_end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
//For zen family, TRSM uses different MC, KC and NC blocksizes than Level-3 routines.
|
||||
//Hence calling a different function to query TRSM-specific block sizes for zen family.
|
||||
#ifdef AOCL_BLIS_ZEN
|
||||
b_alg = bli_determine_blocksize_trsm( direct, i, my_end, b,
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
#else
|
||||
b_alg = bli_determine_blocksize( direct, i, my_end, b,
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
#endif
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -297,6 +298,78 @@ dim_t bli_determine_blocksize_b
|
||||
return b_use;
|
||||
}
|
||||
|
||||
#ifdef AOCL_BLIS_ZEN
|
||||
|
||||
dim_t bli_determine_blocksize_trsm
|
||||
(
|
||||
dir_t direct,
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* obj,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
if ( direct == BLIS_FWD )
|
||||
return bli_determine_blocksize_trsm_f( i, dim, obj, bszid, cntx );
|
||||
else
|
||||
return bli_determine_blocksize_trsm_b( i, dim, obj, bszid, cntx );
|
||||
}
|
||||
|
||||
dim_t bli_determine_blocksize_trsm_f
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* obj,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
num_t dt;
|
||||
blksz_t* bsize;
|
||||
dim_t b_alg, b_max;
|
||||
dim_t b_use;
|
||||
|
||||
// Extract the execution datatype and use it to query the corresponding
|
||||
// blocksize and blocksize maximum values from the blksz_t object.
|
||||
dt = bli_obj_exec_dt( obj );
|
||||
bsize = bli_cntx_get_trsm_blksz( bszid, cntx );
|
||||
b_alg = bli_blksz_get_def( dt, bsize );
|
||||
b_max = bli_blksz_get_max( dt, bsize );
|
||||
|
||||
b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max );
|
||||
|
||||
return b_use;
|
||||
}
|
||||
|
||||
dim_t bli_determine_blocksize_trsm_b
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* obj,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
num_t dt;
|
||||
blksz_t* bsize;
|
||||
dim_t b_alg, b_max;
|
||||
dim_t b_use;
|
||||
|
||||
// Extract the execution datatype and use it to query the corresponding
|
||||
// blocksize and blocksize maximum values from the blksz_t object.
|
||||
dt = bli_obj_exec_dt( obj );
|
||||
bsize = bli_cntx_get_trsm_blksz( bszid, cntx );
|
||||
b_alg = bli_blksz_get_def( dt, bsize );
|
||||
b_max = bli_blksz_get_max( dt, bsize );
|
||||
|
||||
b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
|
||||
|
||||
return b_use;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
dim_t bli_determine_blocksize_f_sub
|
||||
(
|
||||
dim_t i,
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -278,6 +279,38 @@ dim_t bli_determine_blocksize_b
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
#ifdef AOCL_BLIS_ZEN
|
||||
|
||||
dim_t bli_determine_blocksize_trsm
|
||||
(
|
||||
dir_t direct,
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* obj,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
dim_t bli_determine_blocksize_trsm_f
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* obj,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
dim_t bli_determine_blocksize_trsm_b
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
obj_t* obj,
|
||||
bszid_t bszid,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
dim_t bli_determine_blocksize_f_sub
|
||||
(
|
||||
dim_t i,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -1261,6 +1261,111 @@ void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#ifdef AOCL_BLIS_ZEN
|
||||
|
||||
// Stores TRSM-specific blocksizes in a context.
//
// This function should be called from the bli_cntx_init_*() function of zen
// family architectures. It should be called after bli_cntx_init_defaults()
// so that the context begins with default blocksizes across all datatypes.
//
// Variadic argument layout:
//
//   void bli_cntx_set_trsm_blkszs
//        (
//          dim_t   n_bs,
//          bszid_t bs0_id, blksz_t* blksz0,
//          bszid_t bs1_id, blksz_t* blksz1,
//          bszid_t bs2_id, blksz_t* blksz2,
//          ...
//          cntx_t* cntx
//        );
//
//   n_bs  - number of (bszid_t, blksz_t*) pairs that follow.
//   pairs - a blocksize id and the address of the blksz_t whose contents
//           are to be stored under that id.
//   cntx  - the context to update; always the final argument.
//
// The caller's blksz_t objects are only read; their contents are copied
// into the context's TRSM blocksize array.
void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... )
{
	va_list args;
	dim_t   i;

	// The context pointer is the LAST variadic argument, so the pairs have
	// to be buffered in temporary arrays until it has been read.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ) );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ) );

	// -- Begin variable argument section --

	va_start( args, n_bs );

	// Read the n_bs (id, pointer) pairs in the order they were passed.
	for ( i = 0; i < n_bs; ++i )
	{
		bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t );
		blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );

		bszids[ i ] = bs_id;
		blkszs[ i ] = blksz;
	}

	// The last argument should be the context pointer.
	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );

	va_end( args );

	// -- End variable argument section --

	// Base address of the context's TRSM blocksize array.
	blksz_t* cntx_l3_trsm_blkszs = bli_cntx_trsm_blkszs_buf( cntx );

	// Copy each buffered blksz_t into the slot of the context array that
	// is indexed by its blocksize id.
	for ( i = 0; i < n_bs; ++i )
	{
		bszid_t  bs_id = bszids[ i ];
		blksz_t* blksz = blkszs[ i ];

		blksz_t* cntx_l3_trsm_blksz = &cntx_l3_trsm_blkszs[ bs_id ];

		// NOTE(review): judging by its name, bli_blksz_copy_if_pos()
		// copies only positive entries and leaves the others as they
		// were in the context -- confirm against its definition.
		bli_blksz_copy_if_pos( blksz, cntx_l3_trsm_blksz );
	}

	// Free the temporary local arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	bli_free_intl( blkszs );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_trsm_blkszs(): " );
	#endif
	bli_free_intl( bszids );
}
|
||||
#endif
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_cntx_set_l1f_kers( dim_t n_kers, ... )
|
||||
{
|
||||
// This function can be called from the bli_cntx_init_*() function for
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -84,6 +84,10 @@ static bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
|
||||
{
|
||||
return cntx->bmults;
|
||||
}
|
||||
// Returns the address of the first element of the context's TRSM-specific
// blocksize array (the trsm_blkszs member of cntx).
static blksz_t* bli_cntx_trsm_blkszs_buf( cntx_t* cntx )
{
	blksz_t* trsm_buf = cntx->trsm_blkszs;

	return trsm_buf;
}
|
||||
static func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
|
||||
{
|
||||
return cntx->l3_vir_ukrs;
|
||||
@@ -333,6 +337,16 @@ static blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx )
|
||||
return blksz;
|
||||
}
|
||||
|
||||
// Returns the address of the TRSM-specific blksz_t that the context stores
// under blocksize id bs_id. No range check is performed on bs_id.
static blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx )
{
	// Offsetting the array base by bs_id is the same as &base[ bs_id ].
	return bli_cntx_trsm_blkszs_buf( cntx ) + bs_id;
}
|
||||
|
||||
static dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
|
||||
{
|
||||
blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx );
|
||||
@@ -735,6 +749,8 @@ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_trsm_blkszs( dim_t n_bs, ... );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, dim_t n_bs, ... );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -217,6 +218,11 @@
|
||||
#endif
|
||||
|
||||
|
||||
// TRSM_BLKSZ_FUNC names the context query used to obtain blocksizes for
// TRSM: the TRSM-specific query when AOCL_BLIS_ZEN is defined, otherwise
// the regular blocksize query.
#if defined( AOCL_BLIS_ZEN )
  #define TRSM_BLKSZ_FUNC bli_cntx_get_trsm_blksz
#else
  #define TRSM_BLKSZ_FUNC bli_cntx_get_blksz
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2018-2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -1420,6 +1420,8 @@ typedef struct cntx_s
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
bszid_t bmults[ BLIS_NUM_BLKSZS ];
|
||||
|
||||
blksz_t trsm_blkszs[ BLIS_NUM_BLKSZS ];
|
||||
|
||||
func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
|
||||
func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
|
||||
mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ];
|
||||
|
||||
Reference in New Issue
Block a user