mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Rebased amd-staging-milan-3.0 branch on master
-- Rebased on top of master commit # 6e522e5823
-- Updated merged code to remove duplicated code added by auto-merging
-- Updated merged code to rename bool_t type
-- Updated merged code to rename bli_thread_obarrier
-- Updated merged code to rename bli_thread_obroadcast
Change-Id: I39879f1ef3b42ecbe5808af3b559d88c36dbbf6c
AMD-Internal: [CPUPL-1067]
This commit is contained in:
committed by
Dipal M Zambare
parent
449ee37028
commit
392726d0e1
@@ -9683,7 +9683,7 @@ Date: Fri Feb 23 17:42:48 2018 -0600
|
||||
|
||||
CHANGELOG update (0.3.0)
|
||||
|
||||
commit 3defc7265c12cf85e9de2d7a1f243c5e090a6f9d (origin/master, origin/HEAD)
|
||||
commit 3defc7265c12cf85e9de2d7a1f243c5e090a6f9d
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Feb 23 17:38:19 2018 -0600
|
||||
|
||||
@@ -9719,7 +9719,7 @@ Date: Fri Feb 23 16:33:32 2018 -0600
|
||||
contained. To remedy this situation, we now selectively use movss to
|
||||
load any element that could be the last element in the matrix.
|
||||
|
||||
commit 5112e1859e7f8888f5555eb7bc02bd9fab9b4442 (origin/rt, rt)
|
||||
commit 5112e1859e7f8888f5555eb7bc02bd9fab9b4442 (origin/rt)
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Fri Feb 23 14:31:26 2018 -0600
|
||||
|
||||
@@ -9951,7 +9951,7 @@ Date: Thu Jan 4 20:51:35 2018 -0600
|
||||
time hardware detection (when clang is selected).
|
||||
- Added some missing (but mostly-optional) quotes to configure script.
|
||||
|
||||
commit 5a7005dd44ed3174abbe360981e367fd41c99b4b (origin/amd, amd)
|
||||
commit 5a7005dd44ed3174abbe360981e367fd41c99b4b
|
||||
Merge: 7be88705 3bc99a96
|
||||
Author: Nisanth M P <nisanth.padinharepatt@amd.com>
|
||||
Date: Wed Jan 3 12:05:12 2018 +0530
|
||||
@@ -10000,7 +10000,7 @@ Date: Sat Dec 23 15:32:03 2017 -0600
|
||||
is used by the auto-detection script to printf() the name of the
|
||||
sub-configuration corresponding to the detected hardware.
|
||||
|
||||
commit 9804adfd405056ec332bb8e13d68c7b52bd3a6c1 (origin/selfinit, selfinit)
|
||||
commit 9804adfd405056ec332bb8e13d68c7b52bd3a6c1 (origin/selfinit)
|
||||
Author: Field G. Van Zee <field@cs.utexas.edu>
|
||||
Date: Thu Dec 21 19:22:57 2017 -0600
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2019, The University of Texas at Austin
|
||||
# Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -53,9 +53,7 @@ COPTFLAGS := -O3 -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
# Flags specific to optimized kernels.
|
||||
# NOTE: The -fomit-frame-pointer option is needed for some kernels because
|
||||
# they make explicit use of the rbp register.
|
||||
CKOPTFLAGS := $(COPTFLAGS) #-fomit-frame-pointer
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
|
||||
else
|
||||
|
||||
@@ -39,229 +39,242 @@
|
||||
|
||||
void bli_cntx_init_zen( cntx_t* cntx )
|
||||
{
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
|
||||
blksz_t thresh[ BLIS_NUM_THRESH ];
|
||||
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_zen_ref( cntx );
|
||||
// Set default kernel blocksizes and functions.
|
||||
bli_cntx_init_zen_ref( cntx );
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
8,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
||||
cntx
|
||||
);
|
||||
// Update the context with optimized native gemm micro-kernels and
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
8,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
cntx
|
||||
);
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
16,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
#if 0
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
#endif
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
|
||||
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
|
||||
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||
//set
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
||||
cntx
|
||||
);
|
||||
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
|
||||
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
|
||||
//set
|
||||
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
|
||||
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
|
||||
/*
|
||||
Multi Instance performance improvement of DGEMM when bound to a CCX
|
||||
In Multi instance each thread runs a sequential DGEMM.
|
||||
|
||||
a) If BLIS is run in a multi-instance mode with
|
||||
CPU freq 2.6/2.2 Ghz
|
||||
DDR4 clock frequency 2400Mhz
|
||||
mc = 240, kc = 512, and nc = 2040
|
||||
has better performance on EPYC server, over the default block sizes.
|
||||
a) If BLIS is run in a multi-instance mode with
|
||||
CPU freq 2.6/2.2 Ghz
|
||||
DDR4 clock frequency 2400Mhz
|
||||
mc = 240, kc = 512, and nc = 2040
|
||||
has better performance on EPYC server, over the default block sizes.
|
||||
|
||||
b) If BLIS is run in Single Instance mode
|
||||
mc = 510, kc = 1024 and nc = 4080
|
||||
b) If BLIS is run in Single Instance mode
|
||||
mc = 510, kc = 1024 and nc = 4080
|
||||
*/
|
||||
|
||||
// Zen optimized level 3 cache block sizes
|
||||
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
|
||||
// Zen optimized level 3 cache block sizes
|
||||
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
|
||||
#else
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
|
||||
|
||||
#endif
|
||||
#else
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
|
||||
#endif
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 7,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
// level-1f
|
||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 7,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
// level-1f
|
||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for level-3 TRSM execution.
|
||||
bli_cntx_set_trsm_blkszs
|
||||
(
|
||||
5,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for level-3 TRSM execution.
|
||||
bli_cntx_set_trsm_blkszs
|
||||
(
|
||||
5,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 220, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 );
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
(
|
||||
3,
|
||||
BLIS_MT, &thresh[ BLIS_MT ],
|
||||
BLIS_NT, &thresh[ BLIS_NT ],
|
||||
BLIS_KT, &thresh[ BLIS_KT ],
|
||||
cntx
|
||||
);
|
||||
// Initialize the context with the sup handlers.
|
||||
bli_cntx_set_l3_sup_handlers
|
||||
(
|
||||
1,
|
||||
BLIS_GEMM, bli_gemmsup_ref,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize the context with the sup handlers.
|
||||
bli_cntx_set_l3_sup_handlers
|
||||
(
|
||||
1,
|
||||
BLIS_GEMM, bli_gemmsup_ref,
|
||||
cntx
|
||||
);
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
28,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
|
||||
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
|
||||
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
14,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
|
||||
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
cntx
|
||||
);
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
|
||||
9, 9, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
|
||||
9, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
bli_cntx_set_l3_sup_blkszs
|
||||
(
|
||||
5,
|
||||
BLIS_NC, &blkszs[ BLIS_NC ],
|
||||
BLIS_KC, &blkszs[ BLIS_KC ],
|
||||
BLIS_MC, &blkszs[ BLIS_MC ],
|
||||
BLIS_NR, &blkszs[ BLIS_NR ],
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
}
|
||||
@@ -31,12 +31,6 @@ To summarize: In order to observe multithreaded parallelism within a BLIS operat
|
||||
1. Enable multithreading at configure-time. This is discussed in the [next section](docs/Multithreading.md#enabling-multithreading).
|
||||
2. Specify multithreading at runtime. This is also discussed [later on](docs/Multithreading.md#specifying-multithreading).
|
||||
|
||||
**IMPORTANT**: Multithreading in BLIS is disabled by default. Furthermore, even when multithreading is enabled, BLIS will default to single-threaded execution at runtime. In order to both *allow* and *invoke* parallelism from within BLIS operations, you must both *enable* multithreading at configure-time and *specify* multithreading at runtime.
|
||||
|
||||
To summarize: In order to observe multithreaded parallelism within a BLIS operation, you must do *both* of the following:
|
||||
1. Enable multithreading at configure-time. This is discussed in the [next section](docs/Multithreading.md#enabling-multithreading).
|
||||
2. Specify multithreading at runtime. This is also discussed [later on](docs/Multithreading.md#specifying-multithreading).
|
||||
|
||||
# Enabling multithreading
|
||||
|
||||
BLIS disables multithreading by default. In order to allow multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.
|
||||
|
||||
@@ -175,7 +175,7 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
\
|
||||
/* If the rntm is non-NULL, it may indicate that we should forgo sup
|
||||
handling altogether. */ \
|
||||
bool_t enable_sup = TRUE; \
|
||||
bool enable_sup = TRUE; \
|
||||
if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
|
||||
\
|
||||
if ( enable_sup ) \
|
||||
|
||||
@@ -57,7 +57,7 @@ err_t bli_gemmsup_int
|
||||
#else
|
||||
#endif
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
@@ -88,25 +88,25 @@ err_t bli_gemmsup_int
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
|
||||
const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||
const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||
: is_rcc_crc_ccr_ccc );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
|
||||
const bool_t auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const bool auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
bool_t use_bp = TRUE;
|
||||
bool use_bp = TRUE;
|
||||
dim_t jc_new;
|
||||
dim_t ic_new;
|
||||
|
||||
@@ -247,3 +247,215 @@ err_t bli_gemmsup_int
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
err_t bli_gemmtsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_4, alpha, a, b, beta, c);
|
||||
|
||||
#if 0
|
||||
//bli_gemmsup_ref_var2
|
||||
//bli_gemmsup_ref_var1
|
||||
#if 0
|
||||
bli_gemmsup_ref_var1n
|
||||
#else
|
||||
#endif
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
if ( is_rrr_rrc_rcr_crr )
|
||||
{
|
||||
bli_gemmsup_ref_var2m
|
||||
(
|
||||
BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_gemmsup_ref_var2m
|
||||
(
|
||||
BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
||||
);
|
||||
}
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
#endif
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
// uses general stride.
|
||||
if ( stor_id == BLIS_XXX ) {
|
||||
AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide.");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
|
||||
const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||
: is_rcc_crc_ccr_ccc );
|
||||
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
|
||||
const bool auto_factor = bli_rntm_auto_factor( rntm );
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
bool use_bp = TRUE;
|
||||
dim_t jc_new;
|
||||
dim_t ic_new;
|
||||
|
||||
|
||||
if ( is_primary )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
|
||||
const dim_t mu = n / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
|
||||
bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
|
||||
bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
// *requires nudging of nc up to be a multiple of mr.
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
|
||||
const dim_t mu = n / MR; // the n becomes m after a transposition
|
||||
const dim_t nu = n / NR; // the m becomes n after a transposition
|
||||
|
||||
// Decide which algorithm to use (block-panel var2m or panel-block
|
||||
// var1n) based on the number of micropanels in the m and n dimensions.
|
||||
// Also, recalculate the automatic thread factorization.
|
||||
if ( mu >= nu ) use_bp = TRUE;
|
||||
else /* if ( mu < nu ) */ use_bp = FALSE;
|
||||
|
||||
// If the parallel thread factorization was automatic, we update it
|
||||
// with a new factorization based on the matrix dimensions in units
|
||||
// of micropanels.
|
||||
if ( auto_factor )
|
||||
{
|
||||
if ( use_bp )
|
||||
{
|
||||
// In the block-panel algorithm, the m dimension is parallelized
|
||||
// with ic_nt and the n dimension is parallelized with jc_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
|
||||
}
|
||||
else // if ( !use_bp )
|
||||
{
|
||||
// In the panel-block algorithm, the m dimension is parallelized
|
||||
// with jc_nt and the n dimension is parallelized with ic_nt.
|
||||
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
|
||||
}
|
||||
|
||||
// Update the ways of parallelism for the jc and ic loops, and then
|
||||
// update the current thread's root thrinfo_t node according to the
|
||||
// new ways of parallelism value for the jc loop.
|
||||
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
|
||||
bli_l3_sup_thrinfo_update_root( rntm, thread );
|
||||
}
|
||||
|
||||
|
||||
if ( use_bp )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var2m non-primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
|
||||
bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
}
|
||||
else // use_pb
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
printf( "bli_l3_sup_int(): var1n non-primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
|
||||
bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, thread );
|
||||
// *requires nudging of mc up to be a multiple of nr.
|
||||
}
|
||||
}
|
||||
|
||||
// Return success so that the caller knows that we computed the solution.
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -43,3 +43,15 @@ err_t bli_gemmsup_int
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Prototype for bli_gemmtsup_int(), declared alongside bli_gemmsup_int().
// Takes the scalars alpha/beta, the operands a, b, c, the context, the
// runtime object, and the calling thread's thrinfo_t; returns an err_t
// status. NOTE(review): the parameter list appears to mirror that of
// bli_gemmsup_int() -- confirm against the full header.
err_t bli_gemmtsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
@@ -68,7 +68,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Barrier to make sure all threads are caught up and ready to begin
|
||||
the packm stage. */ \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
\
|
||||
/* Compute the size of the memory block needed. */ \
|
||||
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
|
||||
@@ -98,7 +98,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t
|
||||
to all threads. */ \
|
||||
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
@@ -147,7 +147,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t
|
||||
to all threads. */ \
|
||||
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
@@ -429,7 +429,7 @@ void PASTEMAC(ch,opname) \
|
||||
} \
|
||||
\
|
||||
/* Barrier so that packing is done before computation. */ \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5);\
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
bool will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
@@ -57,7 +57,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t did_pack, \
|
||||
bool did_pack, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
@@ -71,7 +71,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
bool will_pack, \
|
||||
stor3_t stor_id, \
|
||||
pack_t* restrict schema, \
|
||||
dim_t m, \
|
||||
@@ -95,7 +95,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a )
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
bool will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
trans_t transc, \
|
||||
|
||||
@@ -68,7 +68,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Barrier to make sure all threads are caught up and ready to begin
|
||||
the packm stage. */ \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
\
|
||||
/* Compute the size of the memory block needed. */ \
|
||||
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
|
||||
@@ -98,7 +98,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t
|
||||
to all threads. */ \
|
||||
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
@@ -147,7 +147,7 @@ void PASTEMAC(ch,opname) \
|
||||
\
|
||||
/* Broadcast the address of the chief thread's passed-in mem_t
|
||||
to all threads. */ \
|
||||
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
|
||||
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
|
||||
\
|
||||
/* Non-chief threads: Copy the contents of the chief thread's
|
||||
passed-in mem_t to the passed-in mem_t for this thread. (The
|
||||
@@ -429,7 +429,7 @@ void PASTEMAC(ch,opname) \
|
||||
} \
|
||||
\
|
||||
/* Barrier so that packing is done before computation. */ \
|
||||
bli_thread_obarrier( thread ); \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5); \
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
bool will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
@@ -57,7 +57,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t did_pack, \
|
||||
bool did_pack, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
@@ -71,7 +71,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
bool will_pack, \
|
||||
stor3_t stor_id, \
|
||||
pack_t* restrict schema, \
|
||||
dim_t k, \
|
||||
@@ -95,7 +95,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b )
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
bool will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
trans_t transc, \
|
||||
|
||||
@@ -94,8 +94,8 @@ void PASTEMAC(ch,varname) \
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
bool_t row_stored = bli_is_col_packed( schema ); \
|
||||
/*bool_t col_stored = bli_is_row_packed( schema );*/ \
|
||||
bool row_stored = bli_is_col_packed( schema ); \
|
||||
/*bool col_stored = bli_is_row_packed( schema );*/ \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
@@ -372,8 +372,8 @@ void PASTEMAC(ch,varname) \
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
bool_t col_stored = bli_is_col_packed( schema ); \
|
||||
/*bool_t row_stored = bli_is_row_packed( schema );*/ \
|
||||
bool col_stored = bli_is_col_packed( schema ); \
|
||||
/*bool row_stored = bli_is_row_packed( schema );*/ \
|
||||
\
|
||||
if ( col_stored ) \
|
||||
{ \
|
||||
|
||||
@@ -90,14 +90,6 @@ err_t bli_gemmsup_ref
|
||||
//bli_rntm_set_pack_a( 0, rntm );
|
||||
//bli_rntm_set_pack_b( 0, rntm );
|
||||
#endif
|
||||
//bli_rntm_set_pack_a( 0, rntm );
|
||||
//bli_rntm_set_pack_b( 0, rntm );
|
||||
|
||||
// May not need these here since packm_sup infers the schemas based
|
||||
// on the stor3_t id. (This would also mean that they don't need to
|
||||
// be passed into the thread decorator below.)
|
||||
//pack_t schema_a = BLIS_PACKED_ROW_PANELS;
|
||||
//pack_t schema_b = BLIS_PACKED_COL_PANELS;
|
||||
|
||||
return
|
||||
bli_l3_sup_thread_decorator
|
||||
@@ -114,3 +106,75 @@ err_t bli_gemmsup_ref
|
||||
);
|
||||
}
|
||||
|
||||
// bli_gemmtsup_ref(): default gemmtsup handler. Optionally checks the
// operands, sets the ways of parallelism in rntm from the dimensions of
// c and a, and then launches bli_gemmtsup_int() for the BLIS_GEMMT
// operation family via bli_l3_sup_thread_decorator(), returning that
// call's err_t result.
err_t bli_gemmtsup_ref
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// This function implements the default gemmtsup handler. If you are a
|
||||
// BLIS developer and wish to use a different gemmtsup handler, please
|
||||
// register a different function pointer in the context in your
|
||||
// sub-configuration's bli_cntx_init_*() function.
|
||||
|
||||
// Check parameters (only when error checking is enabled). Note that the
// gemm checker, bli_gemm_check(), is the one invoked here.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
#if 0
|
||||
// NOTE: This special case handling is done within the variants.
// NOTE(review): this block is compiled out. If it is ever re-enabled, the
// bare "return;" below must return an err_t value (e.g. BLIS_SUCCESS, as
// the second early return does), since this function returns err_t.
|
||||
|
||||
// If alpha is zero, scale by beta and return.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return;
|
||||
}
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop.
|
||||
bli_rntm_set_ways_from_rntm_sup
|
||||
(
|
||||
bli_obj_length( c ),
|
||||
bli_obj_width( c ),
|
||||
bli_obj_width( a ), // presumably the k dimension -- TODO confirm
|
||||
rntm
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Disabled debugging output for the pack_a/pack_b settings in rntm.
printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( rntm ) );
|
||||
printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( rntm ) );
|
||||
|
||||
//bli_rntm_set_pack_a( 0, rntm );
|
||||
//bli_rntm_set_pack_b( 0, rntm );
|
||||
#endif
|
||||
|
||||
// Hand off to the thread decorator, which is given bli_gemmtsup_int as
// the function to run; its return value is propagated to the caller.
return
|
||||
bli_l3_sup_thread_decorator
|
||||
(
|
||||
bli_gemmtsup_int,
|
||||
BLIS_GEMMT, // operation family id
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -120,9 +120,6 @@ void bli_gemmsup_ref_var1n
|
||||
const bool packa = bli_rntm_pack_a( rntm );
|
||||
const bool packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
@@ -450,68 +447,6 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = m_local % NC; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. An alternative way of initializing the
|
||||
mem_t entries is:
|
||||
|
||||
bli_mem_clear( &mem_a ); \
|
||||
bli_mem_clear( &mem_b ); \
|
||||
*/ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree.
|
||||
NOTE: These bszid_t values, and their order, match that of the bp
|
||||
algorithm (variant 2) because they are not used to query actual
|
||||
blocksizes but rather query the ways of parallelism for the various
|
||||
loops. For example, the 2nd loop in variant 1 partitions in the m
|
||||
dimension (in increments of MR), but parallelizes that m dimension
|
||||
with BLIS_JR_NT. The only difference is that the _packa and _packb
|
||||
arrays have been adjusted for the semantic difference in order in
|
||||
which packa and packb nodes are encountered in the thrinfo tree.
|
||||
That is, this panel-block algorithm partitions an NC x KC submatrix
|
||||
of A to be packed in the 4th loop, and a KC x MC submatrix of B
|
||||
to be packed in the 3rd loop. */ \
|
||||
/* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \
|
||||
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t* restrict bszids; \
|
||||
\
|
||||
/* Set the bszids pointer to the correct bszids array above based on which
|
||||
matrices (if any) are being packed. */ \
|
||||
if ( packa ) { if ( packb ) bszids = bszids_packab; \
|
||||
else bszids = bszids_packa; } \
|
||||
else { if ( packb ) bszids = bszids_packb; \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jc = bszids; \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t m_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = m_local % NC; \
|
||||
\
|
||||
/* Loop over the m dimension (NC rows/columns at a time). */ \
|
||||
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
|
||||
@@ -614,72 +549,6 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = n_local % MC; \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing A. If we won't be packing A, we alias to
|
||||
the _pc variables so that code further down can unconditionally
|
||||
reference the _pa variables. Note that *if* we will be packing
|
||||
A, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pa; \
|
||||
if ( packa ) { bszids_pa = &bszids_pc[1]; \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
|
||||
else { bszids_pa = &bszids_pc[0]; \
|
||||
thread_pa = thread_pc; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. (If A will not be packed, then a_use will be set to point to
|
||||
a and the _a_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id.
|
||||
NOTE: packing matrix A in this panel-block algorithm corresponds
|
||||
to packing matrix B in the block-panel algorithm. */ \
|
||||
PASTEMAC(ch,packm_sup_a) \
|
||||
( \
|
||||
packa, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \
|
||||
stor_id, /* a "panel of B". */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
NC, KC, /* This "panel of B" is (at most) NC x KC. */ \
|
||||
nc_cur, kc_cur, MR, \
|
||||
&one_local, \
|
||||
a_pc, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_pc_use = a_use; \
|
||||
\
|
||||
/* We don't need to embed the panel stride of A within the auxinfo_t
|
||||
object because this variant iterates through A in the jr loop,
|
||||
which occurs here, within the macrokernel, not within the
|
||||
millikernel. */ \
|
||||
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_ic = &bszids_pa[1]; \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t n_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = n_local % MC; \
|
||||
\
|
||||
/* Loop over the n dimension (MC rows at a time). */ \
|
||||
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
|
||||
@@ -765,81 +634,6 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing A. If we won't be packing A, we alias to
|
||||
the _pc variables so that code further down can unconditionally
|
||||
reference the _pa variables. Note that *if* we will be packing
|
||||
A, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pb; \
|
||||
if ( packb ) { bszids_pb = &bszids_ic[1]; \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
|
||||
else { bszids_pb = &bszids_ic[0]; \
|
||||
thread_pb = thread_ic; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. (If B will not be packed, then b_use will be set to point to
|
||||
b and the _b_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id.
|
||||
NOTE: packing matrix B in this panel-block algorithm corresponds
|
||||
to packing matrix A in the block-panel algorithm. */ \
|
||||
PASTEMAC(ch,packm_sup_b) \
|
||||
( \
|
||||
packb, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \
|
||||
stor_id, /* a "block of A". */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
KC, MC, /* This "block of A" is (at most) KC x MC. */ \
|
||||
kc_cur, mc_cur, NR, \
|
||||
&one_local, \
|
||||
b_ic, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_ic_use = b_use; \
|
||||
\
|
||||
/* Embed the panel stride of B within the auxinfo_t object. The
|
||||
millikernel will query and use this to iterate through
|
||||
micropanels of B. */ \
|
||||
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jr = &bszids_pb[1]; \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
|
||||
dim_t jr_left = nc_cur % MR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* An optimization: allow the last jr iteration to contain up to MRE
|
||||
rows of C and A. (If MRE > MR, the mkernel has agreed to handle
|
||||
these cases.) Note that this prevents us from declaring jr_iter and
|
||||
jr_left as const. NOTE: We forgo this optimization when packing A
|
||||
since packing an extended edge case is not yet supported. */ \
|
||||
if ( !packa && !is_mt ) \
|
||||
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
|
||||
{ \
|
||||
jr_iter--; jr_left += MR; \
|
||||
} \
|
||||
\
|
||||
/* Loop over the m dimension (NR columns at a time). */ \
|
||||
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
|
||||
@@ -882,7 +676,7 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
/* NOTE: This barrier is only needed if we are packing A (since
|
||||
that matrix is packed within the pc loop of this variant). */ \
|
||||
if ( packa ) bli_thread_obarrier( thread_pa ); \
|
||||
if ( packa ) bli_thread_barrier( thread_pa ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
@@ -976,9 +770,6 @@ void bli_gemmsup_ref_var2m
|
||||
const bool packa = bli_rntm_pack_a( rntm );
|
||||
const bool packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
@@ -1280,57 +1071,6 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. An alternative way of initializing the
|
||||
mem_t entries is:
|
||||
|
||||
bli_mem_clear( &mem_a ); \
|
||||
bli_mem_clear( &mem_b ); \
|
||||
*/ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Define an array of bszid_t ids, which will act as our substitute for
|
||||
the cntl_t tree. */ \
|
||||
/* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \
|
||||
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
|
||||
bszid_t* restrict bszids; \
|
||||
\
|
||||
/* Set the bszids pointer to the correct bszids array above based on which
|
||||
matrices (if any) are being packed. */ \
|
||||
if ( packa ) { if ( packb ) bszids = bszids_packab; \
|
||||
else bszids = bszids_packa; } \
|
||||
else { if ( packb ) bszids = bszids_packb; \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
thrinfo_t* restrict thread_pb = NULL; \
|
||||
thrinfo_t* restrict thread_ic = NULL; \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jc = bszids; \
|
||||
thread_jc = thread; \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
|
||||
\
|
||||
/* Compute the JC loop thread range for the current thread. */ \
|
||||
dim_t jc_start, jc_end; \
|
||||
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
|
||||
const dim_t n_local = jc_end - jc_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
|
||||
@@ -1431,70 +1171,6 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing B. If we won't be packing B, we alias to
|
||||
the _pc variables so that code further down can unconditionally
|
||||
reference the _pb variables. Note that *if* we will be packing
|
||||
B, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pb; \
|
||||
if ( packb ) { bszids_pb = &bszids_pc[1]; \
|
||||
thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
|
||||
else { bszids_pb = &bszids_pc[0]; \
|
||||
thread_pb = thread_pc; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. (If B will not be packed, then a_use will be set to point to
|
||||
b and the _b_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id. */ \
|
||||
PASTEMAC(ch,packm_sup_b) \
|
||||
( \
|
||||
packb, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \
|
||||
stor_id, /* a "panel of B." */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
KC, NC, /* This "panel of B" is (at most) KC x NC. */ \
|
||||
kc_cur, nc_cur, NR, \
|
||||
&one_local, \
|
||||
b_pc, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread_pb \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_pc_use = b_use; \
|
||||
\
|
||||
/* We don't need to embed the panel stride of B within the auxinfo_t
|
||||
object because this variant iterates through B in the jr loop,
|
||||
which occurs here, within the macrokernel, not within the
|
||||
millikernel. */ \
|
||||
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_ic = &bszids_pb[1]; \
|
||||
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
|
||||
\
|
||||
/* Compute the IC loop thread range for the current thread. */ \
|
||||
dim_t ic_start, ic_end; \
|
||||
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
|
||||
const dim_t m_local = ic_end - ic_start; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
|
||||
@@ -1578,79 +1254,6 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Set the bszid_t array and thrinfo_t pointer based on whether
|
||||
we will be packing B. If we won't be packing A, we alias to
|
||||
the _ic variables so that code further down can unconditionally
|
||||
reference the _pa variables. Note that *if* we will be packing
|
||||
A, the thrinfo_t node will have already been created by a
|
||||
previous call to bli_thrinfo_grow(), since bszid values of
|
||||
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
|
||||
bszid that is a normal bszid_t value). */ \
|
||||
bszid_t* restrict bszids_pa; \
|
||||
if ( packa ) { bszids_pa = &bszids_ic[1]; \
|
||||
thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
|
||||
else { bszids_pa = &bszids_ic[0]; \
|
||||
thread_pa = thread_ic; } \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. (If A will not be packed, then a_use will be set to point to
|
||||
a and the _a_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id. */ \
|
||||
PASTEMAC(ch,packm_sup_a) \
|
||||
( \
|
||||
packa, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
|
||||
stor_id, /* a "block of A." */ \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
MC, KC, /* This "block of A" is (at most) MC x KC. */ \
|
||||
mc_cur, kc_cur, MR, \
|
||||
&one_local, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread_pa \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_ic_use = a_use; \
|
||||
\
|
||||
/* Embed the panel stride of A within the auxinfo_t object. The
|
||||
millikernel will query and use this to iterate through
|
||||
micropanels of A (if needed). */ \
|
||||
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
|
||||
\
|
||||
/* Grow the thrinfo_t tree. */ \
|
||||
bszid_t* restrict bszids_jr = &bszids_pa[1]; \
|
||||
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
|
||||
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the JR loop. */ \
|
||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||
dim_t jr_left = nc_cur % NR; \
|
||||
\
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
/* An optimization: allow the last jr iteration to contain up to NRE
|
||||
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
|
||||
these cases.) Note that this prevents us from declaring jr_iter and
|
||||
jr_left as const. NOTE: We forgo this optimization when packing B
|
||||
since packing an extended edge case is not yet supported. */ \
|
||||
if ( !packb && !is_mt ) \
|
||||
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
|
||||
{ \
|
||||
jr_iter--; jr_left += NR; \
|
||||
} \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
|
||||
@@ -1693,7 +1296,7 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
/* NOTE: This barrier is only needed if we are packing B (since
|
||||
that matrix is packed within the pc loop of this variant). */ \
|
||||
if ( packb ) bli_thread_obarrier( thread_pb ); \
|
||||
if ( packb ) bli_thread_barrier( thread_pb ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -169,15 +169,15 @@ void bli_gemmt_front
|
||||
// of the ccr or crc cases.
|
||||
// Then, after the computation is complete, this matrix will be copied
|
||||
// or accumulated back to C.
|
||||
const bool_t is_ccr_mismatch =
|
||||
const bool is_ccr_mismatch =
|
||||
( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
|
||||
!bli_obj_is_col_stored( &c_local ) );
|
||||
const bool_t is_crc_mismatch =
|
||||
const bool is_crc_mismatch =
|
||||
( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
|
||||
!bli_obj_is_row_stored( &c_local ) );
|
||||
|
||||
obj_t ct;
|
||||
bool_t use_ct = FALSE;
|
||||
bool use_ct = FALSE;
|
||||
|
||||
// FGVZ: Consider adding another guard here that only creates and uses a
|
||||
// temporary matrix for accumulation if k < c * kc, where c is some small
|
||||
@@ -284,24 +284,24 @@ void bli_gemmt_front
|
||||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
|
||||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
|
||||
{
|
||||
const bool_t a_is_real = bli_obj_is_real( a );
|
||||
const bool_t a_is_comp = bli_obj_is_complex( a );
|
||||
const bool_t b_is_real = bli_obj_is_real( b );
|
||||
const bool_t b_is_comp = bli_obj_is_complex( b );
|
||||
const bool_t c_is_real = bli_obj_is_real( c );
|
||||
const bool_t c_is_comp = bli_obj_is_complex( c );
|
||||
const bool a_is_real = bli_obj_is_real( a );
|
||||
const bool a_is_comp = bli_obj_is_complex( a );
|
||||
const bool b_is_real = bli_obj_is_real( b );
|
||||
const bool b_is_comp = bli_obj_is_complex( b );
|
||||
const bool c_is_real = bli_obj_is_real( c );
|
||||
const bool c_is_comp = bli_obj_is_complex( c );
|
||||
|
||||
const bool_t a_is_single = bli_obj_is_single_prec( a );
|
||||
const bool_t a_is_double = bli_obj_is_double_prec( a );
|
||||
const bool_t b_is_single = bli_obj_is_single_prec( b );
|
||||
const bool_t b_is_double = bli_obj_is_double_prec( b );
|
||||
const bool_t c_is_single = bli_obj_is_single_prec( c );
|
||||
const bool_t c_is_double = bli_obj_is_double_prec( c );
|
||||
const bool a_is_single = bli_obj_is_single_prec( a );
|
||||
const bool a_is_double = bli_obj_is_double_prec( a );
|
||||
const bool b_is_single = bli_obj_is_single_prec( b );
|
||||
const bool b_is_double = bli_obj_is_double_prec( b );
|
||||
const bool c_is_single = bli_obj_is_single_prec( c );
|
||||
const bool c_is_double = bli_obj_is_double_prec( c );
|
||||
|
||||
const bool_t comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
|
||||
const bool_t comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
|
||||
const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
|
||||
const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
|
||||
|
||||
const bool_t mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
|
||||
const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
|
||||
bli_obj_domain( c ) != bli_obj_domain( b );
|
||||
|
||||
( void )a_is_real; ( void )a_is_comp;
|
||||
|
||||
@@ -124,7 +124,7 @@ void bli_gemmt_ker_var2
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
bool_t uploc;
|
||||
bool uploc;
|
||||
if ( bli_obj_is_lower( c ) )
|
||||
{
|
||||
uploc = 0;
|
||||
@@ -251,7 +251,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
@@ -277,7 +277,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
* diagonal is reached, and is used to determine path during
|
||||
* next iterations of loop
|
||||
*/ \
|
||||
bool_t flag = 0; \
|
||||
bool flag = 0; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
@@ -545,7 +545,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
|
||||
@@ -38,8 +38,8 @@
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
bool_t packa,
|
||||
bool_t packb,
|
||||
bool packa,
|
||||
bool packb,
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m,
|
||||
@@ -116,8 +116,8 @@ void bli_gemmtsup_ref_var1n
|
||||
#else
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
|
||||
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||
const bool packa = bli_rntm_pack_a( rntm );
|
||||
const bool packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
@@ -162,7 +162,7 @@ void bli_gemmtsup_ref_var1n
|
||||
cs_b = bli_obj_row_stride( b );
|
||||
}
|
||||
|
||||
bool_t uploc;
|
||||
bool uploc;
|
||||
if( bli_obj_is_lower( c ) )
|
||||
{
|
||||
uploc = 0;
|
||||
@@ -246,8 +246,8 @@ void bli_gemmtsup_ref_var1n
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
bool_t packa, \
|
||||
bool_t packb, \
|
||||
bool packa, \
|
||||
bool packb, \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
@@ -434,7 +434,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
@@ -720,8 +720,8 @@ INSERT_GENTFUNC_L( gemmtsup, ref_var1n )
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
bool_t packa, \
|
||||
bool_t packb, \
|
||||
bool packa, \
|
||||
bool packb, \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
@@ -908,7 +908,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
@@ -1250,8 +1250,8 @@ void bli_gemmtsup_ref_var2m
|
||||
#else
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
|
||||
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||
const bool packa = bli_rntm_pack_a( rntm );
|
||||
const bool packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
@@ -1296,7 +1296,7 @@ void bli_gemmtsup_ref_var2m
|
||||
cs_b = bli_obj_row_stride( b );
|
||||
}
|
||||
|
||||
bool_t uploc;
|
||||
bool uploc;
|
||||
|
||||
if ( bli_is_notrans ( trans ) )
|
||||
uploc = bli_obj_is_lower( c ) ? 0 : 1;
|
||||
@@ -1377,8 +1377,8 @@ void bli_gemmtsup_ref_var2m
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
bool_t packa, \
|
||||
bool_t packb, \
|
||||
bool packa, \
|
||||
bool packb, \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
@@ -1503,7 +1503,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
Since update routines only support row-major order,
|
||||
col_pref flag is used to induce transpose to matrices before
|
||||
passing to update routine whenever C is col-stored */ \
|
||||
const bool_t col_pref = (rs_c == 1)? 1 : 0; \
|
||||
const bool col_pref = (rs_c == 1)? 1 : 0; \
|
||||
\
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
@@ -1553,7 +1553,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
@@ -1893,8 +1893,8 @@ INSERT_GENTFUNC_L( gemmtsup, ref_var2m )
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
bool_t packa, \
|
||||
bool_t packb, \
|
||||
bool packa, \
|
||||
bool packb, \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
@@ -2019,7 +2019,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
Since update routines only support row-major order,
|
||||
col_pref flag is used to induce transpose to matrices before
|
||||
passing to update routine whenever C is col-stored */ \
|
||||
const bool_t col_pref = (rs_c == 1) ? 1 : 0; \
|
||||
const bool col_pref = (rs_c == 1) ? 1 : 0; \
|
||||
\
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
@@ -2071,7 +2071,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
else bszids = bszids_nopack; } \
|
||||
\
|
||||
/* Determine whether we are using more than one thread. */ \
|
||||
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
|
||||
\
|
||||
thrinfo_t* restrict thread_jc = NULL; \
|
||||
thrinfo_t* restrict thread_pc = NULL; \
|
||||
|
||||
@@ -107,8 +107,8 @@ GENPROT( gemmtsup_ref_var2m )
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
bool_t packa, \
|
||||
bool_t packb, \
|
||||
bool packa, \
|
||||
bool packb, \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -238,20 +238,20 @@ BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to
|
||||
);
|
||||
#endif
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_blksz_reduce_def_to
|
||||
void bli_blksz_reduce_def_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
num_t dt_bs, blksz_t* blksz
|
||||
);
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_blksz_reduce_max_to
|
||||
void bli_blksz_reduce_max_to
|
||||
(
|
||||
num_t dt_bm, blksz_t* bmult,
|
||||
num_t dt_bs, blksz_t* blksz
|
||||
);
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize
|
||||
dim_t bli_determine_blocksize
|
||||
(
|
||||
dir_t direct,
|
||||
dim_t i,
|
||||
@@ -261,7 +261,7 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f
|
||||
dim_t bli_determine_blocksize_f
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
@@ -270,7 +270,7 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_b
|
||||
dim_t bli_determine_blocksize_b
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
@@ -312,7 +312,6 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_trsm_b
|
||||
#endif
|
||||
|
||||
dim_t bli_determine_blocksize_f_sub
|
||||
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f_sub
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
@@ -320,7 +319,7 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f_sub
|
||||
dim_t b_max
|
||||
);
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_b_sub
|
||||
dim_t bli_determine_blocksize_b_sub
|
||||
(
|
||||
dim_t i,
|
||||
dim_t dim,
|
||||
|
||||
@@ -59,9 +59,6 @@ double bli_clock_min_diff( double time_min, double time_start )
|
||||
// - under a nanosecond
|
||||
// is actually garbled due to the clocks being taken too closely together.
|
||||
if ( time_min <= 0.0 ) time_min = time_min_prev;
|
||||
// To genuinely measure time for an application taking more than an hour, the below
|
||||
// line is commented. If wrongly measuring higher time we could always use previous_min.
|
||||
/* else if ( time_min > 3600.0 ) time_min = time_min_prev; */
|
||||
else if ( time_min < 1.0e-9 ) time_min = time_min_prev;
|
||||
|
||||
return time_min;
|
||||
|
||||
@@ -84,6 +84,10 @@ BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
|
||||
{
|
||||
return cntx->bmults;
|
||||
}
|
||||
BLIS_INLINE blksz_t* bli_cntx_trsm_blkszs_buf( cntx_t* cntx )
|
||||
{
|
||||
return cntx->trsm_blkszs;
|
||||
}
|
||||
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
|
||||
{
|
||||
return cntx->l3_vir_ukrs;
|
||||
@@ -333,6 +337,16 @@ BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx )
|
||||
return blksz;
|
||||
}
|
||||
|
||||
BLIS_INLINE blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx )
|
||||
{
|
||||
blksz_t* blkszs = bli_cntx_trsm_blkszs_buf( cntx );
|
||||
blksz_t* blksz = &blkszs[ bs_id ];
|
||||
|
||||
// Return the address of the blksz_t identified by bs_id.
|
||||
return blksz;
|
||||
|
||||
}
|
||||
|
||||
BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
|
||||
{
|
||||
blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx );
|
||||
|
||||
@@ -264,7 +264,7 @@ bool bli_cpuid_is_penryn
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
bool_t bli_cpuid_is_zen3
|
||||
bool bli_cpuid_is_zen3
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
@@ -283,7 +283,7 @@ bool_t bli_cpuid_is_zen3
|
||||
|
||||
// Finally, check for specific models:
|
||||
// - 0x00-0xff (THIS NEEDS UPDATING)
|
||||
const bool_t is_arch
|
||||
const bool is_arch
|
||||
=
|
||||
( 0x00 <= model && model <= 0xff );
|
||||
|
||||
@@ -292,7 +292,7 @@ bool_t bli_cpuid_is_zen3
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t bli_cpuid_is_zen2
|
||||
bool bli_cpuid_is_zen2
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
|
||||
@@ -37,10 +37,10 @@
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
dim_t bli_env_get_var( const char* env, dim_t fallback )
|
||||
gint_t bli_env_get_var( const char* env, gint_t fallback )
|
||||
{
|
||||
dim_t r_val;
|
||||
char* str;
|
||||
gint_t r_val;
|
||||
char* str;
|
||||
|
||||
// Query the environment variable and store the result in str.
|
||||
str = getenv( env );
|
||||
@@ -50,7 +50,7 @@ dim_t bli_env_get_var( const char* env, dim_t fallback )
|
||||
{
|
||||
// If there was no error, convert the string to an integer and
|
||||
// prepare to return that integer.
|
||||
r_val = strtol( str, NULL, 10 );
|
||||
r_val = ( gint_t )strtol( str, NULL, 10 );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
#ifndef BLIS_ENV_H
|
||||
#define BLIS_ENV_H
|
||||
|
||||
dim_t bli_env_get_var( const char* env, dim_t fallback );
|
||||
gint_t bli_env_get_var( const char* env, gint_t fallback );
|
||||
//void bli_env_set_var( const char* env, dim_t value );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -77,7 +77,7 @@ dim_t bli_pack_get_pack_b( void )
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void bli_pack_set_pack_a( bool_t pack_a )
|
||||
void bli_pack_set_pack_a( bool pack_a )
|
||||
{
|
||||
// We must ensure that global_rntm has been initialized.
|
||||
bli_init_once();
|
||||
@@ -93,7 +93,7 @@ void bli_pack_set_pack_a( bool_t pack_a )
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void bli_pack_set_pack_b( bool_t pack_b )
|
||||
void bli_pack_set_pack_b( bool pack_b )
|
||||
{
|
||||
// We must ensure that global_rntm has been initialized.
|
||||
bli_init_once();
|
||||
@@ -118,25 +118,25 @@ void bli_pack_init_rntm_from_env
|
||||
// function is only called from bli_pack_init(), which is only called
|
||||
// by bli_init_once().
|
||||
|
||||
bool_t pack_a;
|
||||
bool_t pack_b;
|
||||
bool pack_a;
|
||||
bool pack_b;
|
||||
|
||||
#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING
|
||||
|
||||
// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
|
||||
// -1 if it is unset.
|
||||
pack_a = bli_env_get_var( "BLIS_PACK_A", -1 );
|
||||
pack_b = bli_env_get_var( "BLIS_PACK_B", -1 );
|
||||
gint_t pack_a_env = bli_env_get_var( "BLIS_PACK_A", -1 );
|
||||
gint_t pack_b_env = bli_env_get_var( "BLIS_PACK_B", -1 );
|
||||
|
||||
// Enforce the default behavior first, then check for affirmative FALSE, and
|
||||
// finally assume anything else is TRUE.
|
||||
if ( pack_a == -1 ) pack_a = FALSE; // default behavior
|
||||
else if ( pack_a == 0 ) pack_a = FALSE; // zero is FALSE
|
||||
else pack_a = TRUE; // anything else is TRUE
|
||||
if ( pack_a_env == -1 ) pack_a = FALSE; // default behavior
|
||||
else if ( pack_a_env == 0 ) pack_a = FALSE; // zero is FALSE
|
||||
else pack_a = TRUE; // anything else is TRUE
|
||||
|
||||
if ( pack_b == -1 ) pack_b = FALSE; // default behavior
|
||||
else if ( pack_b == 0 ) pack_b = FALSE; // zero is FALSE
|
||||
else pack_b = TRUE; // anything else is TRUE
|
||||
if ( pack_b_env == -1 ) pack_b = FALSE; // default behavior
|
||||
else if ( pack_b_env == 0 ) pack_b = FALSE; // zero is FALSE
|
||||
else pack_b = TRUE; // anything else is TRUE
|
||||
|
||||
#else
|
||||
|
||||
|
||||
@@ -40,8 +40,8 @@ void bli_pack_finalize( void );
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void );
|
||||
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void );
|
||||
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool_t pack_a );
|
||||
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool_t pack_b );
|
||||
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
|
||||
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );
|
||||
|
||||
void bli_pack_init_rntm_from_env( rntm_t* rntm );
|
||||
|
||||
|
||||
@@ -285,11 +285,11 @@ BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm )
|
||||
|
||||
BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm )
|
||||
{
|
||||
bli_rntm_set_pack_a( TRUE, rntm );
|
||||
bli_rntm_set_pack_a( FALSE, rntm );
|
||||
}
|
||||
BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm )
|
||||
{
|
||||
bli_rntm_set_pack_b( TRUE, rntm );
|
||||
bli_rntm_set_pack_b( FALSE, rntm );
|
||||
}
|
||||
BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
|
||||
{
|
||||
@@ -309,8 +309,8 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
|
||||
.auto_factor = TRUE, \
|
||||
.num_threads = -1, \
|
||||
.thrloop = { -1, -1, -1, -1, -1, -1 }, \
|
||||
.pack_a = TRUE, \
|
||||
.pack_b = TRUE, \
|
||||
.pack_a = FALSE, \
|
||||
.pack_b = FALSE, \
|
||||
.l3_sup = TRUE, \
|
||||
.sba_pool = NULL, \
|
||||
.membrk = NULL, \
|
||||
|
||||
@@ -584,15 +584,15 @@ BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 )
|
||||
|
||||
// offset-relate
|
||||
|
||||
static bool_t bli_gemmt_is_strictly_below_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
|
||||
BLIS_INLINE bool bli_gemmt_is_strictly_below_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
|
||||
{
|
||||
return ( bool_t )
|
||||
return ( bool )
|
||||
( ( n_off + n - 1 ) < m_off );
|
||||
}
|
||||
|
||||
static bool_t bli_gemmt_is_strictly_above_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
|
||||
BLIS_INLINE bool bli_gemmt_is_strictly_above_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
|
||||
{
|
||||
return ( bool_t )
|
||||
return ( bool )
|
||||
( ( m_off + m - 1 ) < n_off );
|
||||
}
|
||||
// diag offset-related
|
||||
|
||||
@@ -65,14 +65,13 @@ err_t bli_l3_sup_thread_decorator
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
#if 0
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
@@ -83,10 +82,7 @@ err_t bli_l3_sup_thread_decorator
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
cntl_t* cntl_use = NULL;
|
||||
//thrinfo_t* thread = NULL;
|
||||
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
|
||||
|
||||
// There is only one thread id (for the thief thread).
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
@@ -97,24 +93,22 @@ err_t bli_l3_sup_thread_decorator
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
//bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
// a, b, c, rntm_p, cntl, &cntl_use );
|
||||
#if 0
|
||||
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
thrinfo_t* thread = NULL;
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
#endif
|
||||
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
|
||||
#else
|
||||
// This optimization allows us to use one of the global thrinfo_t
|
||||
// objects for single-threaded execution rather than grow one from
|
||||
// scratch. The key is that bli_thrinfo_sup_grow(), which is called
|
||||
// from within the variants, will immediately return if it detects
|
||||
// that the thrinfo_t* passed into it is either
|
||||
// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
|
||||
thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
|
||||
|
||||
( void )tid;
|
||||
#endif
|
||||
|
||||
func
|
||||
(
|
||||
@@ -125,17 +119,12 @@ err_t bli_l3_sup_thread_decorator
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Free the thread's local control tree.
|
||||
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
#ifndef SKIP_THRINFO_TREE
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
bli_l3_sup_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -1062,6 +1062,8 @@ void bli_thread_partition_2x2
|
||||
{
|
||||
*nt1 = ( work1 >= work2 ? n_thread : 1 );
|
||||
*nt2 = ( work1 < work2 ? n_thread : 1 );
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
*nt1 = 1;
|
||||
|
||||
@@ -205,7 +205,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
|
||||
|
||||
// Broadcast the temporary array to all threads in the parent's
|
||||
// communicator.
|
||||
new_comms = bli_thread_obroadcast( thread_par, new_comms );
|
||||
new_comms = bli_thread_broadcast( thread_par, new_comms );
|
||||
|
||||
// Chiefs in the child communicator allocate the communicator
|
||||
// object and store it in the array element corresponding to the
|
||||
@@ -213,7 +213,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
|
||||
if ( child_comm_id == 0 )
|
||||
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// All threads create a new thrinfo_t node using the communicator
|
||||
// that was created by their chief, as identified by parent_work_id.
|
||||
@@ -229,7 +229,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
|
||||
NULL // sub_node
|
||||
);
|
||||
|
||||
bli_thread_obarrier( thread_par );
|
||||
bli_thread_barrier( thread_par );
|
||||
|
||||
// The parent's chief thread frees the temporary array of thrcomm_t
|
||||
// pointers.
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc
|
||||
Copyright (C) 2016 - 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -4,11 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
<<<<<<< HEAD
|
||||
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
|
||||
=======
|
||||
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
|
||||
>>>>>>> Merged BLIS Release 1.3
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -4,11 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
<<<<<<< HEAD
|
||||
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
|
||||
=======
|
||||
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
|
||||
>>>>>>> Merged BLIS Release 1.3
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -233,7 +233,7 @@ static err_t bli_sgemm_small
|
||||
float* restrict beta_cast = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||
|
||||
/*Beta Zero Check*/
|
||||
bool_t is_beta_non_zero=0;
|
||||
bool is_beta_non_zero=0;
|
||||
if ( !bli_obj_equals( beta, &BLIS_ZERO ) ){
|
||||
is_beta_non_zero = 1;
|
||||
}
|
||||
@@ -1805,7 +1805,7 @@ static err_t bli_dgemm_small
|
||||
//checking whether beta value is zero.
|
||||
//if true, we should perform C=alpha * A*B operation
|
||||
//instead of C = beta * C + alpha * (A * B)
|
||||
bool_t is_beta_non_zero = 0;
|
||||
bool is_beta_non_zero = 0;
|
||||
if(!bli_obj_equals(beta, &BLIS_ZERO))
|
||||
is_beta_non_zero = 1;
|
||||
|
||||
@@ -3362,7 +3362,7 @@ static err_t bli_sgemm_small_atbn
|
||||
float* restrict beta_cast = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||
|
||||
/*Beta Zero Check*/
|
||||
bool_t is_beta_non_zero=0;
|
||||
bool is_beta_non_zero=0;
|
||||
if ( !bli_obj_equals( beta, &BLIS_ZERO ) ){
|
||||
is_beta_non_zero = 1;
|
||||
}
|
||||
@@ -3843,7 +3843,7 @@ static err_t bli_dgemm_small_atbn
|
||||
//check if beta is zero
|
||||
//if true, we need to perform C = alpha * (A * B)
|
||||
//instead of C = beta * C + alpha * (A * B)
|
||||
bool_t is_beta_non_zero = 0;
|
||||
bool is_beta_non_zero = 0;
|
||||
if(!bli_obj_equals(beta,&BLIS_ZERO))
|
||||
is_beta_non_zero = 1;
|
||||
|
||||
|
||||
29812
output.testsuite
Normal file
29812
output.testsuite
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,2 +1,2 @@
|
||||
2
|
||||
2.1
|
||||
3
|
||||
0.0
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
% tx2
|
||||
plot_panel_4x5(2.2,8,1, '../results/tx2/st', 'tx2', 'ARMPL'); close; clear all;
|
||||
plot_panel_4x5(2.2,8,28,'../results/tx2/jc4ic7','tx2_jc4ic7','ARMPL'); close; clear all;
|
||||
plot_panel_4x5(2.2,8,56,'../results/tx2/jc8ic7','tx2_jc8ic7','ARMPL'); close; clear all;
|
||||
|
||||
% skx
|
||||
plot_panel_4x5(2.0,32,1,'../results/skx/st/20190218','skx','MKL'); close; clear all;
|
||||
plot_panel_4x5(2.0,32,26,'../results/skx/jc2ic13/20190218','skx_jc2ic13','MKL'); close; clear all;
|
||||
plot_panel_4x5(2.0,32,52,'../results/skx/jc4ic13/20190218','skx_jc4ic13','MKL'); close; clear all;
|
||||
@@ -1,35 +0,0 @@
|
||||
% tx2
|
||||
plot_panel_4x5(2.20,8,1, 'st','../results/tx2/20190205/st', 'tx2', 'ARMPL'); close; clear all;
|
||||
plot_panel_4x5(2.20,8,28,'1s','../results/tx2/20190205/jc4ic7','tx2_jc4ic7','ARMPL'); close; clear all;
|
||||
plot_panel_4x5(2.20,8,56,'2s','../results/tx2/20190205/jc8ic7','tx2_jc8ic7','ARMPL'); close; clear all;
|
||||
|
||||
% skx
|
||||
% pre-eigen:
|
||||
%plot_panel_4x5(2.00,32,1, 'st','../results/skx/20190306/st', 'skx', 'MKL'); close; clear all;
|
||||
%plot_panel_4x5(2.00,32,26,'1s','../results/skx/20190306/jc2ic13','skx_jc2ic13','MKL'); close; clear all;
|
||||
%plot_panel_4x5(2.00,32,52,'2s','../results/skx/20190306/jc4ic13','skx_jc4ic13','MKL'); close; clear all;
|
||||
% with eigen:
|
||||
plot_panel_4x5(2.00,32,1, 'st','../results/skx/merged20190306_0328/st', 'skx', 'MKL',1); close; clear all;
|
||||
plot_panel_4x5(2.00,32,26,'1s','../results/skx/merged20190306_0328/jc2ic13','skx_jc2ic13','MKL',1); close; clear all;
|
||||
plot_panel_4x5(2.00,32,52,'2s','../results/skx/merged20190306_0328/jc4ic13','skx_jc4ic13','MKL',1); close; clear all;
|
||||
|
||||
% has
|
||||
% pre-eigen:
|
||||
%plot_panel_4x5(3.25,16,1, 'st','../results/has/20190206/st', 'has', 'MKL',1); close; clear all;
|
||||
%plot_panel_4x5(3.00,16,12,'1s','../results/has/20190206/jc2ic3jr2','has_jc2ic3jr2','MKL',1); close; clear all;
|
||||
%plot_panel_4x5(3.00,16,24,'2s','../results/has/20190206/jc4ic3jr2','has_jc4ic3jr2','MKL',1); close; clear all;
|
||||
% with eigen:
|
||||
plot_panel_4x5(3.25,16,1, 'st','../results/has/merged20190206_0328/st', 'has', 'MKL',1); close; clear all;
|
||||
plot_panel_4x5(3.00,16,12,'1s','../results/has/merged20190206_0328/jc2ic3jr2','has_jc2ic3jr2','MKL',1); close; clear all;
|
||||
plot_panel_4x5(3.00,16,24,'2s','../results/has/merged20190206_0328/jc4ic3jr2','has_jc4ic3jr2','MKL',1); close; clear all;
|
||||
|
||||
% epyc
|
||||
% pre-eigen:
|
||||
%plot_panel_4x5(3.00,8,1, 'st','../results/epyc/merged201903_0619/st','epyc', 'MKL'); close; clear all;
|
||||
%plot_panel_4x5(2.55,8,32,'1s','../results/epyc/merged201903_0619/jc1ic8jr4','epyc_jc1ic8jr4','MKL'); close; clear all;
|
||||
%plot_panel_4x5(2.55,8,64,'2s','../results/epyc/merged201903_0619/jc2ic8jr4','epyc_jc2ic8jr4','MKL'); close; clear all;
|
||||
% with eigen:
|
||||
plot_panel_4x5(3.00,8,1, 'st','../results/epyc/merged20190306_0319_0328/st', 'epyc', 'MKL',1); close; clear all;
|
||||
plot_panel_4x5(2.55,8,32,'1s','../results/epyc/merged20190306_0319_0328/jc1ic8jr4','epyc_jc1ic8jr4','MKL',1); close; clear all;
|
||||
plot_panel_4x5(2.55,8,64,'2s','../results/epyc/merged20190306_0319_0328/jc2ic8jr4','epyc_jc2ic8jr4','MKL',1); close; clear all;
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -4,8 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
||||
@@ -505,7 +505,6 @@ eigen-mt: check-env $(EIGEN_MT_BINS)
|
||||
openblas-mt: check-env $(OPENBLAS_MT_BINS)
|
||||
vendor-mt: check-env $(VENDOR_MT_BINS)
|
||||
|
||||
# -- Multithreaded --
|
||||
|
||||
# --- Object file rules --------------------------------------------------------
|
||||
|
||||
@@ -541,29 +540,6 @@ $(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(ld),$(imp
|
||||
|
||||
# -- Multithreaded BLAS --
|
||||
|
||||
# -- Multithreaded BLAS --
|
||||
|
||||
# Define the function that will be used to instantiate compilation rules
|
||||
# for the various multithreaded implementations.
|
||||
define make-mt-rule
|
||||
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
|
||||
$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
# Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS
|
||||
# implementation.
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach tr,$(TRANS), \
|
||||
$(foreach st,$(STORS), \
|
||||
$(foreach sh,$(SHAPES), \
|
||||
$(foreach sm,$(SMS_MT), \
|
||||
$(foreach sn,$(SNS_MT), \
|
||||
$(foreach sk,$(SKS_MT), \
|
||||
$(foreach impl,$(BIMPLS_MT), \
|
||||
$(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
|
||||
|
||||
# -- Single-threaded Eigen --
|
||||
|
||||
# Define the function that will be used to instantiate compilation rules
|
||||
# for the various multithreaded implementations.
|
||||
define make-mt-rule
|
||||
@@ -626,26 +602,6 @@ $(foreach ld,$(LDIMS), \
|
||||
$(foreach impl,$(EIMPLS), \
|
||||
$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(ld),$(impl))))))))))))
|
||||
|
||||
# -- Multithreaded Eigen --
|
||||
|
||||
# Define the function that will be used to instantiate compilation rules
|
||||
# for the multithreaded Eigen implementation.
|
||||
define make-eigmt-rule
|
||||
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
|
||||
$(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
|
||||
endef
|
||||
|
||||
# Instantiate the rule function make-st-rule() for each Eigen implementation.
|
||||
$(foreach dt,$(DTS), \
|
||||
$(foreach tr,$(TRANS), \
|
||||
$(foreach st,$(STORS), \
|
||||
$(foreach sh,$(SHAPES), \
|
||||
$(foreach sm,$(SMS_MT), \
|
||||
$(foreach sn,$(SNS_MT), \
|
||||
$(foreach sk,$(SKS_MT), \
|
||||
$(foreach impl,$(EIMPLS), \
|
||||
$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
|
||||
|
||||
|
||||
# --- Executable file rules ----------------------------------------------------
|
||||
|
||||
|
||||
@@ -1,29 +1,49 @@
|
||||
function r_val = plot_l3sup_perf( opname, ...
|
||||
smalldims, ...
|
||||
data_blissup, ...
|
||||
data_blislpab, ...
|
||||
data_blisconv, ...
|
||||
data_eigen, ...
|
||||
data_open, ...
|
||||
data_vend, vend_str, ...
|
||||
data_bfeo, ...
|
||||
data_xsmm, ...
|
||||
data_vend, vend_str, ...
|
||||
nth, ...
|
||||
rows, cols, ...
|
||||
cfreq, ...
|
||||
dfps, ...
|
||||
theid, impl )
|
||||
|
||||
%if ... %mod(theid-1,cols) == 2 || ...
|
||||
% ... %mod(theid-1,cols) == 3 || ...
|
||||
% ... %mod(theid-1,cols) == 4 || ...
|
||||
% 0 == 1 ... %theid >= 19
|
||||
% show_plot = 0;
|
||||
% Define the column in which the performance rates are found.
|
||||
flopscol = size( data_blissup, 2 );
|
||||
|
||||
% Check if blasfeo data is available.
|
||||
has_bfeo = 1;
|
||||
if data_bfeo( 1, flopscol ) == 0.0
|
||||
has_bfeo = 0;
|
||||
end
|
||||
|
||||
% Check if libxsmm data is available.
|
||||
has_xsmm = 1;
|
||||
if data_xsmm( 1, flopscol ) == 0.0
|
||||
has_xsmm = 0;
|
||||
end
|
||||
|
||||
% Define which plot id will have the legend.
|
||||
% NOTE: We can draw the legend on any graph as long as it has already been
|
||||
% rendered. Since the coordinates are global, we can simply always wait until
|
||||
% the final graph to draw the legend.
|
||||
%if nth == 1
|
||||
% if has_xsmm == 1
|
||||
% legend_plot_id = 2*cols + 1*5;
|
||||
% else
|
||||
% legend_plot_id = 1*cols + 1*5;
|
||||
% end
|
||||
%else
|
||||
show_plot = 1;
|
||||
% legend_plot_id = 0*cols + 1*6;
|
||||
%end
|
||||
legend_plot_id = cols*rows;
|
||||
|
||||
%legend_plot_id = 11;
|
||||
legend_plot_id = 2*cols + 1*5;
|
||||
|
||||
% Hold the axes.
|
||||
if 1
|
||||
ax1 = subplot( rows, cols, theid );
|
||||
hold( ax1, 'on' );
|
||||
@@ -31,12 +51,12 @@ end
|
||||
|
||||
% Set line properties.
|
||||
color_blissup = 'k'; lines_blissup = '-'; markr_blissup = '';
|
||||
color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = '';
|
||||
color_blisconv = 'k'; lines_blisconv = ':'; markr_blisconv = '';
|
||||
color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o';
|
||||
color_open = 'r'; lines_open = '--'; markr_open = 'o';
|
||||
color_vend = 'b'; lines_vend = '-.'; markr_vend = '.';
|
||||
color_bfeo = 'c'; lines_bfeo = '-'; markr_bfeo = 'o';
|
||||
color_xsmm = 'g'; lines_xsmm = '-'; markr_xsmm = 'o';
|
||||
color_vend = 'b'; lines_vend = '-.'; markr_vend = '.';
|
||||
|
||||
% Compute the peak performance in terms of the number of double flops
|
||||
% executable per cycle and the clock rate.
|
||||
@@ -55,15 +75,13 @@ titlename = '%s';
|
||||
titlename = sprintf( titlename, title_opname );
|
||||
|
||||
% Set the legend strings.
|
||||
blissup_legend = sprintf( 'BLIS sup' );
|
||||
blislpab_legend = sprintf( 'BLIS conv' );
|
||||
eigen_legend = sprintf( 'Eigen' );
|
||||
open_legend = sprintf( 'OpenBLAS' );
|
||||
bfeo_legend = sprintf( 'BLASFEO' );
|
||||
xsmm_legend = sprintf( 'libxsmm' );
|
||||
%vend_legend = sprintf( 'MKL' );
|
||||
%vend_legend = sprintf( 'ARMPL' );
|
||||
vend_legend = vend_str;
|
||||
blissup_lg = sprintf( 'BLIS sup' );
|
||||
blisconv_lg = sprintf( 'BLIS conv' );
|
||||
eigen_lg = sprintf( 'Eigen' );
|
||||
open_lg = sprintf( 'OpenBLAS' );
|
||||
vend_lg = vend_str;
|
||||
bfeo_lg = sprintf( 'BLASFEO' );
|
||||
xsmm_lg = sprintf( 'libxsmm' );
|
||||
|
||||
% Set axes range values.
|
||||
y_scale = 1.00;
|
||||
@@ -81,7 +99,6 @@ end
|
||||
|
||||
|
||||
%flopscol = 4;
|
||||
flopscol = size( data_blissup, 2 );
|
||||
msize = 5;
|
||||
if 1
|
||||
fontsize = 12;
|
||||
@@ -111,67 +128,44 @@ x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
%end
|
||||
np = size( data_blissup, 1 );
|
||||
|
||||
has_xsmm = 1;
|
||||
if data_xsmm( 1, flopscol ) == 0.0
|
||||
has_xsmm = 0;
|
||||
end
|
||||
|
||||
% Grab the last x-axis value.
|
||||
x_end = data_blissup( np, psize_col );
|
||||
|
||||
%data_peak( 1, 1:2 ) = [ 0 max_perf_core ];
|
||||
%data_peak( 2, 1:2 ) = [ x_end max_perf_core ];
|
||||
|
||||
if show_plot == 1
|
||||
blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
|
||||
'LineWidth',linesize );
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
blisconv_ln = line( x_axis( 1:np, 1 ), data_blisconv( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_blisconv, 'LineStyle',lines_blisconv, ...
|
||||
'LineWidth',linesize );
|
||||
eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_eigen, 'LineStyle',lines_eigen, ...
|
||||
'LineWidth',linesize );
|
||||
'Color',color_eigen, 'LineStyle',lines_eigen, ...
|
||||
'LineWidth',linesize );
|
||||
open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_open, 'LineStyle',lines_open, ...
|
||||
'LineWidth',linesize );
|
||||
bfeo_ln = line( x_axis( 1:np, 1 ), data_bfeo( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
|
||||
'LineWidth',linesize );
|
||||
if has_xsmm == 1
|
||||
xsmm_ln = line( x_axis( 1:np, 1 ), data_xsmm( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
|
||||
'LineWidth',linesize );
|
||||
else
|
||||
xsmm_ln = line( nan, nan, ...
|
||||
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
|
||||
'LineWidth',linesize );
|
||||
end
|
||||
'Color',color_open, 'LineStyle',lines_open, ...
|
||||
'LineWidth',linesize );
|
||||
vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
elseif theid == legend_plot_id
|
||||
blissup_ln = line( nan, nan, ...
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
blislpab_ln = line( nan, nan, ...
|
||||
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
|
||||
'LineWidth',linesize );
|
||||
eigen_ln = line( nan, nan, ...
|
||||
'Color',color_eigen, 'LineStyle',lines_eigen, ...
|
||||
'LineWidth',linesize );
|
||||
open_ln = line( nan, nan, ...
|
||||
'Color',color_open, 'LineStyle',lines_open, ...
|
||||
'LineWidth',linesize );
|
||||
bfeo_ln = line( nan, nan, ...
|
||||
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
|
||||
'LineWidth',linesize );
|
||||
xsmm_ln = line( nan, nan, ...
|
||||
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
|
||||
'LineWidth',linesize );
|
||||
vend_ln = line( nan, nan, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
if has_bfeo == 1
|
||||
bfeo_ln = line( x_axis( 1:np, 1 ), data_bfeo( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
|
||||
'LineWidth',linesize );
|
||||
else
|
||||
bfeo_ln = line( nan, nan, ...
|
||||
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
|
||||
'LineWidth',linesize );
|
||||
end
|
||||
if has_xsmm == 1
|
||||
xsmm_ln = line( x_axis( 1:np, 1 ), data_xsmm( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
|
||||
'LineWidth',linesize );
|
||||
else
|
||||
xsmm_ln = line( nan, nan, ...
|
||||
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
|
||||
'LineWidth',linesize );
|
||||
end
|
||||
|
||||
|
||||
@@ -203,71 +197,51 @@ elseif 500 <= x_end && x_end < 1000
|
||||
xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] );
|
||||
end
|
||||
|
||||
if show_plot == 1 || theid == legend_plot_id
|
||||
if nth == 1 && theid == legend_plot_id
|
||||
if has_xsmm == 1
|
||||
leg = legend( ...
|
||||
[ ...
|
||||
blissup_ln ...
|
||||
blislpab_ln ...
|
||||
eigen_ln ...
|
||||
open_ln ...
|
||||
bfeo_ln ...
|
||||
xsmm_ln ...
|
||||
vend_ln ...
|
||||
], ...
|
||||
blissup_legend, ...
|
||||
blislpab_legend, ...
|
||||
eigen_legend, ...
|
||||
open_legend, ...
|
||||
bfeo_legend, ...
|
||||
xsmm_legend, ...
|
||||
vend_legend, ...
|
||||
'Location', legend_loc );
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
set( leg,'Position',[15.40 4.75 1.9 1.20] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-3 );
|
||||
set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl)
|
||||
end
|
||||
% xpos ypos
|
||||
%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
|
||||
if nth == 1 && theid == legend_plot_id
|
||||
if has_xsmm == 1
|
||||
% single-threaded, with libxsmm (ccc)
|
||||
leg = legend( ...
|
||||
[ blissup_ln blisconv_ln eigen_ln open_ln vend_ln bfeo_ln xsmm_ln ], ...
|
||||
blissup_lg, blisconv_lg, eigen_lg, open_lg, vend_lg, bfeo_lg, xsmm_lg, ...
|
||||
'Location', legend_loc );
|
||||
set( leg,'Box','off','Color','none','Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
set( leg,'Position',[15.35 4.62 1.9 1.20] ); % (1,4tl)
|
||||
else
|
||||
leg = legend( ...
|
||||
[ ...
|
||||
blissup_ln ...
|
||||
blislpab_ln ...
|
||||
eigen_ln ...
|
||||
open_ln ...
|
||||
bfeo_ln ...
|
||||
vend_ln ...
|
||||
], ...
|
||||
blissup_legend, ...
|
||||
blislpab_legend, ...
|
||||
eigen_legend, ...
|
||||
open_legend, ...
|
||||
bfeo_legend, ...
|
||||
vend_legend, ...
|
||||
'Location', legend_loc );
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
set( leg,'Position',[15.40 7.65 1.9 1.10] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl)
|
||||
end
|
||||
set( leg,'FontSize',fontsize-3 );
|
||||
set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl)
|
||||
end
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
% xpos ypos
|
||||
%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
|
||||
elseif nth > 1 && theid == legend_plot_id
|
||||
else
|
||||
% single-threaded, without libxsmm (rrr, or other)
|
||||
leg = legend( ...
|
||||
[ blissup_ln blisconv_ln eigen_ln open_ln vend_ln bfeo_ln ], ...
|
||||
blissup_lg, blisconv_lg, eigen_lg, open_lg, vend_lg, bfeo_lg, ...
|
||||
'Location', legend_loc );
|
||||
set( leg,'Box','off','Color','none','Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
set( leg,'Position',[15.35 7.40 1.9 1.10] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl)
|
||||
end
|
||||
end
|
||||
elseif nth > 1 && theid == legend_plot_id
|
||||
% multithreaded
|
||||
leg = legend( ...
|
||||
[ blissup_ln blisconv_ln eigen_ln open_ln vend_ln ], ...
|
||||
blissup_lg, blisconv_lg, eigen_lg, open_lg, vend_lg, ...
|
||||
'Location', legend_loc );
|
||||
set( leg,'Box','off','Color','none','Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
set( leg,'Position',[18.20 10.30 1.9 0.95] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -298,28 +272,38 @@ else % impl == 'matlab'
|
||||
set( titl, 'Position', tpos );
|
||||
end
|
||||
|
||||
sll_str = sprintf( 'm = %u; n = k', smalldims(1) );
|
||||
lsl_str = sprintf( 'n = %u; m = k', smalldims(2) );
|
||||
lls_str = sprintf( 'k = %u; m = n', smalldims(3) );
|
||||
lss_str = sprintf( 'm; n = %u, k = %u', smalldims(2), smalldims(3) );
|
||||
sls_str = sprintf( 'n; m = %u, k = %u', smalldims(1), smalldims(3) );
|
||||
ssl_str = sprintf( 'k; m = %u, n = %u', smalldims(1), smalldims(2) );
|
||||
lll_str = sprintf( 'm = n = k' );
|
||||
|
||||
% Place labels on the bottom row of graphs.
|
||||
if theid > (rows-1)*cols
|
||||
%xlab = xlabel( ax1,xaxisname );
|
||||
%tpos = get( xlab, 'Position' )
|
||||
%tpos(2) = tpos(2) + 10;
|
||||
%set( xlab, 'Position', tpos );
|
||||
if theid == rows*cols - 6
|
||||
xlab = xlabel( ax1, 'm = 6; n = k' );
|
||||
xlab = xlabel( ax1, sll_str );
|
||||
elseif theid == rows*cols - 5
|
||||
xlab = xlabel( ax1, 'n = 8; m = k' );
|
||||
xlab = xlabel( ax1, lsl_str );
|
||||
elseif theid == rows*cols - 4
|
||||
xlab = xlabel( ax1, 'k = 4; m = n' );
|
||||
xlab = xlabel( ax1, lls_str );
|
||||
elseif theid == rows*cols - 3
|
||||
xlab = xlabel( ax1, 'm; n = 8, k = 4' );
|
||||
xlab = xlabel( ax1, lss_str );
|
||||
elseif theid == rows*cols - 2
|
||||
xlab = xlabel( ax1, 'n; m = 6, k = 4' );
|
||||
xlab = xlabel( ax1, sls_str );
|
||||
elseif theid == rows*cols - 1
|
||||
xlab = xlabel( ax1, 'k; m = 6, n = 8' );
|
||||
xlab = xlabel( ax1, ssl_str );
|
||||
elseif theid == rows*cols - 0
|
||||
xlab = xlabel( ax1, 'm = n = k' );
|
||||
xlab = xlabel( ax1, lll_str );
|
||||
end
|
||||
end
|
||||
|
||||
% Place labels on the left-hand column of graphs.
|
||||
if mod(theid-1,cols) == 0
|
||||
ylab = ylabel( ax1,yaxisname );
|
||||
end
|
||||
@@ -37,7 +37,6 @@ filetemp_blissup = '%s/output_%s_%s_blissup.m';
|
||||
filetemp_blisconv = '%s/output_%s_%s_blisconv.m';
|
||||
filetemp_eigen = '%s/output_%s_%s_eigen.m';
|
||||
filetemp_open = '%s/output_%s_%s_openblas.m';
|
||||
filetemp_bfeo = '%s/output_%s_%s_blasfeo.m';
|
||||
filetemp_vend = '%s/output_%s_%s_vendor.m';
|
||||
filetemp_bfeo = '%s/output_%s_%s_blasfeo.m';
|
||||
filetemp_xsmm = '%s/output_%s_%s_libxsmm.m';
|
||||
@@ -107,7 +106,6 @@ for opi = 1:n_opsupnames
|
||||
data_blisconv, ...
|
||||
data_eigen, ...
|
||||
data_open, ...
|
||||
data_bfeo, ...
|
||||
data_vend, vend_str, ...
|
||||
data_bfeo, ...
|
||||
data_xsmm, ...
|
||||
@@ -1,52 +0,0 @@
|
||||
function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims )
|
||||
|
||||
nops = size( ops, 1 );
|
||||
|
||||
smallm = smalldims( 1 );
|
||||
smalln = smalldims( 2 );
|
||||
smallk = smalldims( 3 );
|
||||
|
||||
i = 1;
|
||||
|
||||
for io = 1:nops
|
||||
|
||||
op = ops( io, : );
|
||||
|
||||
str0 = sprintf( '%s_%s_m%dnpkp', op, stor, smallm );
|
||||
str1 = sprintf( '%s_%s_mpn%dkp', op, stor, smalln );
|
||||
str2 = sprintf( '%s_%s_mpnpk%d', op, stor, smallk );
|
||||
str3 = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk );
|
||||
str4 = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk );
|
||||
str5 = sprintf( '%s_%s_m%dn%dkp', op, stor, smallm, smalln );
|
||||
str6 = sprintf( '%s_%s_mpnpkp', op, stor );
|
||||
|
||||
%opsupnames( i+0, : ) = sprintf( '%s_%s_m%dnpkp ', op, stor, smallm )
|
||||
%opsupnames( i+1, : ) = sprintf( '%s_%s_mpn%dkp ', op, stor, smalln )
|
||||
%opsupnames( i+2, : ) = sprintf( '%s_%s_mpnpk%d', op, stor, smallk )
|
||||
%opsupnames( i+3, : ) = sprintf( '%s_%s_mpn%dk%d', op, stor, smalln, smallk )
|
||||
%opsupnames( i+4, : ) = sprintf( '%s_%s_m%dnpk%d', op, stor, smallm, smallk )
|
||||
%opsupnames( i+5, : ) = sprintf( '%s_%s_m%dn%dkp ', op, stor, smallm, smalln )
|
||||
%opsupnames( i+6, : ) = sprintf( '%s_%s_mpnpkp ', op, stor )
|
||||
|
||||
opsupnames( i+0, : ) = sprintf( '%-20s', str0 );
|
||||
opsupnames( i+1, : ) = sprintf( '%-20s', str1 );
|
||||
opsupnames( i+2, : ) = sprintf( '%-20s', str2 );
|
||||
opsupnames( i+3, : ) = sprintf( '%-20s', str3 );
|
||||
opsupnames( i+4, : ) = sprintf( '%-20s', str4 );
|
||||
opsupnames( i+5, : ) = sprintf( '%-20s', str5 );
|
||||
opsupnames( i+6, : ) = sprintf( '%-20s', str6 );
|
||||
|
||||
opnames( i+0, : ) = sprintf( '%s', op );
|
||||
opnames( i+1, : ) = sprintf( '%s', op );
|
||||
opnames( i+2, : ) = sprintf( '%s', op );
|
||||
opnames( i+3, : ) = sprintf( '%s', op );
|
||||
opnames( i+4, : ) = sprintf( '%s', op );
|
||||
opnames( i+5, : ) = sprintf( '%s', op );
|
||||
opnames( i+6, : ) = sprintf( '%s', op );
|
||||
|
||||
i = i + 7;
|
||||
end
|
||||
|
||||
r_val1 = opsupnames;
|
||||
r_val2 = opnames;
|
||||
|
||||
@@ -1,274 +0,0 @@
|
||||
function r_val = plot_l3sup_perf( opname, ...
|
||||
data_blissup, ...
|
||||
data_blislpab, ...
|
||||
data_eigen, ...
|
||||
data_open, ...
|
||||
data_vend, vend_str, ...
|
||||
nth, ...
|
||||
rows, cols, ...
|
||||
cfreq, ...
|
||||
dfps, ...
|
||||
theid, impl )
|
||||
|
||||
%if ... %mod(theid-1,cols) == 2 || ...
|
||||
% ... %mod(theid-1,cols) == 3 || ...
|
||||
% ... %mod(theid-1,cols) == 4 || ...
|
||||
% 0 == 1 ... %theid >= 19
|
||||
% show_plot = 0;
|
||||
%else
|
||||
show_plot = 1;
|
||||
%end
|
||||
|
||||
%legend_plot_id = 11;
|
||||
legend_plot_id = 0*cols + 1*6;
|
||||
|
||||
if 1
|
||||
ax1 = subplot( rows, cols, theid );
|
||||
hold( ax1, 'on' );
|
||||
end
|
||||
|
||||
% Set line properties.
|
||||
color_blissup = 'k'; lines_blissup = '-'; markr_blissup = '';
|
||||
color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = '';
|
||||
color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o';
|
||||
color_open = 'r'; lines_open = '--'; markr_open = 'o';
|
||||
color_vend = 'b'; lines_vend = '-.'; markr_vend = '.';
|
||||
|
||||
% Compute the peak performance in terms of the number of double flops
|
||||
% executable per cycle and the clock rate.
|
||||
if opname(1) == 's' || opname(1) == 'c'
|
||||
flopspercycle = dfps * 2;
|
||||
else
|
||||
flopspercycle = dfps;
|
||||
end
|
||||
max_perf_core = (flopspercycle * cfreq) * 1;
|
||||
|
||||
% Escape underscores in the title.
|
||||
title_opname = strrep( opname, '_', '\_' );
|
||||
|
||||
% Print the title to a string.
|
||||
titlename = '%s';
|
||||
titlename = sprintf( titlename, title_opname );
|
||||
|
||||
% Set the legend strings.
|
||||
blissup_legend = sprintf( 'BLIS sup' );
|
||||
blislpab_legend = sprintf( 'BLIS conv' );
|
||||
eigen_legend = sprintf( 'Eigen' );
|
||||
open_legend = sprintf( 'OpenBLAS' );
|
||||
%vend_legend = sprintf( 'MKL' );
|
||||
%vend_legend = sprintf( 'ARMPL' );
|
||||
vend_legend = vend_str;
|
||||
|
||||
% Set axes range values.
|
||||
y_scale = 1.00;
|
||||
x_begin = 0;
|
||||
%x_end is set below.
|
||||
y_begin = 0;
|
||||
y_end = max_perf_core * y_scale;
|
||||
|
||||
% Set axes names.
|
||||
if nth == 1
|
||||
yaxisname = 'GFLOPS';
|
||||
else
|
||||
yaxisname = 'GFLOPS/core';
|
||||
end
|
||||
|
||||
|
||||
%flopscol = 4;
|
||||
flopscol = size( data_blissup, 2 );
|
||||
msize = 5;
|
||||
if 1
|
||||
fontsize = 12;
|
||||
else
|
||||
fontsize = 16;
|
||||
end
|
||||
linesize = 0.5;
|
||||
legend_loc = 'southeast';
|
||||
|
||||
% --------------------------------------------------------------------
|
||||
|
||||
% Automatically detect a column with the increasing problem size.
|
||||
% Then set the maximum x-axis value.
|
||||
for psize_col = 1:3
|
||||
if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col )
|
||||
break;
|
||||
end
|
||||
end
|
||||
x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
|
||||
% Compute the number of data points we have in the x-axis. Note that we
|
||||
% only use half the data points for the m = n = k column of graphs.
|
||||
%if mod(theid-1,cols) == 6
|
||||
% np = size( data_blissup, 1 ) / 2;
|
||||
%else
|
||||
% np = size( data_blissup, 1 );
|
||||
%end
|
||||
np = size( data_blissup, 1 );
|
||||
|
||||
% Grab the last x-axis value.
|
||||
x_end = data_blissup( np, psize_col );
|
||||
|
||||
%data_peak( 1, 1:2 ) = [ 0 max_perf_core ];
|
||||
%data_peak( 2, 1:2 ) = [ x_end max_perf_core ];
|
||||
|
||||
if show_plot == 1
|
||||
blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
|
||||
'LineWidth',linesize );
|
||||
eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_eigen, 'LineStyle',lines_eigen, ...
|
||||
'LineWidth',linesize );
|
||||
open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_open, 'LineStyle',lines_open, ...
|
||||
'LineWidth',linesize );
|
||||
vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
elseif theid == legend_plot_id
|
||||
blissup_ln = line( nan, nan, ...
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
blislpab_ln = line( nan, nan, ...
|
||||
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
|
||||
'LineWidth',linesize );
|
||||
eigen_ln = line( nan, nan, ...
|
||||
'Color',color_eigen, 'LineStyle',lines_eigen, ...
|
||||
'LineWidth',linesize );
|
||||
open_ln = line( nan, nan, ...
|
||||
'Color',color_open, 'LineStyle',lines_open, ...
|
||||
'LineWidth',linesize );
|
||||
vend_ln = line( nan, nan, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
end
|
||||
|
||||
|
||||
xlim( ax1, [x_begin x_end] );
|
||||
ylim( ax1, [y_begin y_end] );
|
||||
|
||||
if mod(theid-1,cols) == 3 || mod(theid-1,cols) == 4 || mod(theid-1,cols) == 5
|
||||
if nth == 12
|
||||
ylim( ax1, [y_begin y_end/2] );
|
||||
elseif nth > 12
|
||||
ylim( ax1, [y_begin y_end/6] );
|
||||
end
|
||||
end
|
||||
|
||||
if 10000 <= x_end && x_end < 15000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
xticks( ax1, [ 4000 8000 12000 ] );
|
||||
elseif 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 4000 <= x_end && x_end < 6000
|
||||
x_tick2 = x_end - 1000;
|
||||
x_tick1 = x_tick2/2;
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 2000 <= x_end && x_end < 3000
|
||||
x_tick2 = x_end - 400;
|
||||
x_tick1 = x_tick2/2;
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 500 <= x_end && x_end < 1000
|
||||
x_tick3 = x_end*(3/4);
|
||||
x_tick2 = x_end*(2/4);
|
||||
x_tick1 = x_end*(1/4);
|
||||
xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] );
|
||||
end
|
||||
|
||||
if show_plot == 1 || theid == legend_plot_id
|
||||
if theid == legend_plot_id
|
||||
leg = legend( ...
|
||||
[ ...
|
||||
blissup_ln ...
|
||||
blislpab_ln ...
|
||||
eigen_ln ...
|
||||
open_ln ...
|
||||
vend_ln ...
|
||||
], ...
|
||||
blissup_legend, ...
|
||||
blislpab_legend, ...
|
||||
eigen_legend, ...
|
||||
open_legend, ...
|
||||
vend_legend, ...
|
||||
'Location', legend_loc );
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
%set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
|
||||
set( leg,'Position',[18.80 10.60 1.9 0.95 ] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
|
||||
end
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
% xpos ypos
|
||||
%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
|
||||
end
|
||||
end
|
||||
|
||||
set( ax1,'FontSize',fontsize );
|
||||
set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1.
|
||||
box( ax1, 'on' );
|
||||
|
||||
titl = title( titlename );
|
||||
set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.
|
||||
|
||||
% The default is to align the plot title across whole figure, not the box.
|
||||
% This is a hack to nudge the title back to the center of the box.
|
||||
if impl == 'octave'
|
||||
tpos = get( titl, 'Position' );
|
||||
% For some reason, the titles in the graphs in the last column start
|
||||
% off in a different relative position than the graphs in the other
|
||||
% columns. Here, we manually account for that.
|
||||
if mod(theid-1,cols) == 6
|
||||
tpos(1) = tpos(1) + -10;
|
||||
else
|
||||
tpos(1) = tpos(1) + -40;
|
||||
end
|
||||
set( titl, 'Position', tpos );
|
||||
set( titl, 'FontSize', fontsize );
|
||||
else % impl == 'matlab'
|
||||
tpos = get( titl, 'Position' );
|
||||
tpos(1) = tpos(1) + 90;
|
||||
set( titl, 'Position', tpos );
|
||||
end
|
||||
|
||||
if theid > (rows-1)*cols
|
||||
%xlab = xlabel( ax1,xaxisname );
|
||||
%tpos = get( xlab, 'Position' )
|
||||
%tpos(2) = tpos(2) + 10;
|
||||
%set( xlab, 'Position', tpos );
|
||||
if theid == rows*cols - 6
|
||||
xlab = xlabel( ax1, 'm = 6; n = k' );
|
||||
elseif theid == rows*cols - 5
|
||||
xlab = xlabel( ax1, 'n = 8; m = k' );
|
||||
elseif theid == rows*cols - 4
|
||||
xlab = xlabel( ax1, 'k = 10; m = n' );
|
||||
elseif theid == rows*cols - 3
|
||||
xlab = xlabel( ax1, 'm; n = 8, k = 10' );
|
||||
elseif theid == rows*cols - 2
|
||||
xlab = xlabel( ax1, 'n; m = 6, k = 10' );
|
||||
elseif theid == rows*cols - 1
|
||||
xlab = xlabel( ax1, 'k; m = 6, n = 8' );
|
||||
elseif theid == rows*cols - 0
|
||||
xlab = xlabel( ax1, 'm = n = k' );
|
||||
end
|
||||
end
|
||||
|
||||
if mod(theid-1,cols) == 0
|
||||
ylab = ylabel( ax1,yaxisname );
|
||||
end
|
||||
|
||||
r_val = 0;
|
||||
|
||||
@@ -1,152 +0,0 @@
|
||||
function r_val = plot_panel_trxsh ...
|
||||
( ...
|
||||
cfreq, ...
|
||||
dflopspercycle, ...
|
||||
nth, ...
|
||||
thr_str, ...
|
||||
dt_ch, ...
|
||||
stor_str, ...
|
||||
smalldims, ...
|
||||
dirpath, ...
|
||||
arch_str, ...
|
||||
vend_str, ...
|
||||
impl ...
|
||||
)
|
||||
|
||||
%cfreq = 1.8;
|
||||
%dflopspercycle = 32;
|
||||
|
||||
% Create filename "templates" for the files that contain the performance
|
||||
% results.
|
||||
filetemp_blissup = '%s/output_%s_%s_blissup.m';
|
||||
filetemp_blislpab = '%s/output_%s_%s_blislpab.m';
|
||||
filetemp_eigen = '%s/output_%s_%s_eigen.m';
|
||||
filetemp_open = '%s/output_%s_%s_openblas.m';
|
||||
filetemp_vend = '%s/output_%s_%s_vendor.m';
|
||||
|
||||
% Create a variable name "template" for the variables contained in the
|
||||
% files outlined above.
|
||||
vartemp = 'data_%s_%s_%s( :, : )';
|
||||
|
||||
% Define the datatypes and operations we will be plotting.
|
||||
oproot = sprintf( '%cgemm', dt_ch );
|
||||
ops( 1, : ) = sprintf( '%s_nn', oproot );
|
||||
ops( 2, : ) = sprintf( '%s_nt', oproot );
|
||||
ops( 3, : ) = sprintf( '%s_tn', oproot );
|
||||
ops( 4, : ) = sprintf( '%s_tt', oproot );
|
||||
|
||||
% Generate datatype-specific operation names from the set of operations
|
||||
% and datatypes.
|
||||
[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims );
|
||||
n_opsupnames = size( opsupnames, 1 );
|
||||
|
||||
%opsupnames
|
||||
%opnames
|
||||
%return
|
||||
|
||||
if 1 == 1
|
||||
%fig = figure('Position', [100, 100, 2400, 1500]);
|
||||
fig = figure('Position', [100, 100, 2400, 1200]);
|
||||
orient( fig, 'portrait' );
|
||||
set(gcf,'PaperUnits', 'inches');
|
||||
if impl == 'matlab'
|
||||
set(gcf,'PaperSize', [11.5 20.4]);
|
||||
set(gcf,'PaperPosition', [0 0 11.5 20.4]);
|
||||
set(gcf,'PaperPositionMode','manual');
|
||||
else % impl == 'octave' % octave 4.x
|
||||
set(gcf,'PaperSize', [12 21.5]);
|
||||
set(gcf,'PaperPositionMode','auto');
|
||||
end
|
||||
set(gcf,'PaperOrientation','landscape');
|
||||
end
|
||||
|
||||
|
||||
% Iterate over the list of datatype-specific operation names.
|
||||
for opi = 1:n_opsupnames
|
||||
%for opi = 1:1
|
||||
|
||||
% Grab the current datatype combination.
|
||||
opsupname = opsupnames( opi, : );
|
||||
opname = opnames( opi, : );
|
||||
|
||||
opsupname = strtrim( opsupname );
|
||||
opname = strtrim( opname );
|
||||
|
||||
str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str);
|
||||
|
||||
% Construct filenames for the data files from templates.
|
||||
file_blissup = sprintf( filetemp_blissup, dirpath, thr_str, opsupname );
|
||||
file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname );
|
||||
file_eigen = sprintf( filetemp_eigen, dirpath, thr_str, opsupname );
|
||||
file_open = sprintf( filetemp_open, dirpath, thr_str, opsupname );
|
||||
file_vend = sprintf( filetemp_vend, dirpath, thr_str, opsupname );
|
||||
|
||||
% Load the data files.
|
||||
%str = sprintf( ' Loading %s', file_blissup ); disp(str);
|
||||
run( file_blissup )
|
||||
run( file_blislpab )
|
||||
run( file_eigen )
|
||||
run( file_open )
|
||||
run( file_vend )
|
||||
|
||||
% Construct variable names for the variables in the data files.
|
||||
var_blissup = sprintf( vartemp, thr_str, opname, 'blissup' );
|
||||
var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' );
|
||||
var_eigen = sprintf( vartemp, thr_str, opname, 'eigen' );
|
||||
var_open = sprintf( vartemp, thr_str, opname, 'openblas' );
|
||||
var_vend = sprintf( vartemp, thr_str, opname, 'vendor' );
|
||||
|
||||
% Use eval() to instantiate the variable names constructed above,
|
||||
% copying each to a simplified name.
|
||||
data_blissup = eval( var_blissup ); % e.g. data_st_dgemm_blissup( :, : );
|
||||
data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : );
|
||||
data_eigen = eval( var_eigen ); % e.g. data_st_dgemm_eigen( :, : );
|
||||
data_open = eval( var_open ); % e.g. data_st_dgemm_openblas( :, : );
|
||||
data_vend = eval( var_vend ); % e.g. data_st_dgemm_vendor( :, : );
|
||||
|
||||
%str = sprintf( ' Reading %s', var_blissup ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_blislpab ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_eigen ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_open ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_bfeo ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_xsmm ); disp(str);
|
||||
%str = sprintf( ' Reading %s', var_vend ); disp(str);
|
||||
|
||||
% Plot one result in an m x n grid of plots, via the subplot()
|
||||
% function.
|
||||
if 1 == 1
|
||||
plot_l3sup_perf( opsupname, ...
|
||||
data_blissup, ...
|
||||
data_blislpab, ...
|
||||
data_eigen, ...
|
||||
data_open, ...
|
||||
data_vend, vend_str, ...
|
||||
nth, ...
|
||||
4, 7, ...
|
||||
cfreq, ...
|
||||
dflopspercycle, ...
|
||||
opi, impl );
|
||||
|
||||
clear data_mt_*gemm_*;
|
||||
clear data_blissup;
|
||||
clear data_blislpab;
|
||||
clear data_eigen;
|
||||
clear data_open;
|
||||
clear data_vend;
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
% Construct the name of the file to which we will output the graph.
|
||||
outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth );
|
||||
|
||||
% Output the graph to pdf format.
|
||||
%print(gcf, 'gemm_md','-fillpage','-dpdf');
|
||||
%print(gcf, outfile,'-bestfit','-dpdf');
|
||||
if impl == 'octave'
|
||||
print(gcf, outfile);
|
||||
else % if impl == 'matlab'
|
||||
print(gcf, outfile,'-bestfit','-dpdf');
|
||||
end
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
% kabylake
|
||||
plot_panel_trxsh(3.80,16,4,'mt','d','rrr',[ 6 8 10 ],'../results/kabylake/20200302/mnkt100000_mt4','kbl','MKL','octave'); close; clear all;
|
||||
|
||||
% haswell
|
||||
plot_panel_trxsh(3.1,16,12,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20200302/mnkt100000_mt12','has','MKL','octave'); close; clear all;
|
||||
|
||||
% epyc
|
||||
plot_panel_trxsh(2.55,8,32,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20200302/mnkt100000_mt32','epyc','MKL','octave'); close; clear all;
|
||||
@@ -1,8 +0,0 @@
|
||||
|
||||
% kabylake
|
||||
plot_panel_trxsh(3.8,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20190619/4_800_4_mt201','kbl','MKL','matlab'); close; clear all;
|
||||
plot_panel_trxsh(3.8,16,1,'st','d','ccc',[ 6 8 4 ],'../results/kabylake/20190619/4_800_4_mt201','kbl','MKL','matlab'); close; clear all;
|
||||
|
||||
% epyc
|
||||
plot_panel_trxsh(3.0,8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20190619/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
|
||||
plot_panel_trxsh(3.0,8,1,'st','d','ccc',[ 6 8 4 ],'../results/epyc/20190619/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
|
||||
@@ -1,8 +0,0 @@
|
||||
% kabylake
|
||||
plot_panel_trxsh(3.80,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20200302/mnkt100000_st','kbl','MKL','octave'); close; clear all;
|
||||
|
||||
% haswell
|
||||
plot_panel_trxsh(3.5,16,1,'st','d','rrr',[ 6 8 4 ],'../results/haswell/20200302/mnkt100000_st','has','MKL','octave'); close; clear all;
|
||||
|
||||
% epyc
|
||||
plot_panel_trxsh(3.00, 8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20200302/mnkt100000_st','epyc','MKL','octave'); close; clear all;
|
||||
@@ -9,10 +9,7 @@ function r_val = plot_l3sup_perf( opname, ...
|
||||
cfreq, ...
|
||||
dfps, ...
|
||||
theid, impl )
|
||||
<<<<<<< HEAD
|
||||
|
||||
=======
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
%if ... %mod(theid-1,cols) == 2 || ...
|
||||
% ... %mod(theid-1,cols) == 3 || ...
|
||||
% ... %mod(theid-1,cols) == 4 || ...
|
||||
@@ -23,19 +20,11 @@ function r_val = plot_l3sup_perf( opname, ...
|
||||
%end
|
||||
|
||||
%legend_plot_id = 11;
|
||||
<<<<<<< HEAD
|
||||
legend_plot_id = 0*cols + 1*6;
|
||||
|
||||
if 1
|
||||
ax1 = subplot( rows, cols, theid );
|
||||
hold( ax1, 'on' );
|
||||
=======
|
||||
legend_plot_id = 1*cols + 1*5;
|
||||
|
||||
if 1
|
||||
ax1 = subplot( rows, cols, theid );
|
||||
hold( ax1, 'on' );
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
end
|
||||
|
||||
% Set line properties.
|
||||
@@ -89,15 +78,9 @@ end
|
||||
flopscol = size( data_blissup, 2 );
|
||||
msize = 5;
|
||||
if 1
|
||||
<<<<<<< HEAD
|
||||
fontsize = 12;
|
||||
else
|
||||
fontsize = 16;
|
||||
=======
|
||||
fontsize = 11;
|
||||
else
|
||||
fontsize = 16;
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
end
|
||||
linesize = 0.5;
|
||||
legend_loc = 'southeast';
|
||||
@@ -113,7 +96,6 @@ for psize_col = 1:3
|
||||
end
|
||||
x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
|
||||
<<<<<<< HEAD
|
||||
% Compute the number of data points we have in the x-axis. Note that we
|
||||
% only use half the data points for the m = n = k column of graphs.
|
||||
%if mod(theid-1,cols) == 6
|
||||
@@ -122,15 +104,6 @@ x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
% np = size( data_blissup, 1 );
|
||||
%end
|
||||
np = size( data_blissup, 1 );
|
||||
=======
|
||||
% Compute the number of data points we have in the x-axis. Note that
|
||||
% we only use quarter the data points for the m = n = k column of graphs.
|
||||
if mod(theid-1,cols) == 6
|
||||
np = size( data_blissup, 1 ) / 4;
|
||||
else
|
||||
np = size( data_blissup, 1 );
|
||||
end
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
|
||||
% Grab the last x-axis value.
|
||||
x_end = data_blissup( np, psize_col );
|
||||
@@ -154,12 +127,7 @@ open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
|
||||
vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
<<<<<<< HEAD
|
||||
elseif theid == legend_plot_id
|
||||
=======
|
||||
else
|
||||
if theid == legend_plot_id
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
blissup_ln = line( nan, nan, ...
|
||||
'Color',color_blissup, 'LineStyle',lines_blissup, ...
|
||||
'LineWidth',linesize );
|
||||
@@ -176,16 +144,11 @@ vend_ln = line( nan, nan, ...
|
||||
'Color',color_vend, 'LineStyle',lines_vend, ...
|
||||
'LineWidth',linesize );
|
||||
end
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
end
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
|
||||
|
||||
xlim( ax1, [x_begin x_end] );
|
||||
ylim( ax1, [y_begin y_end] );
|
||||
|
||||
<<<<<<< HEAD
|
||||
if mod(theid-1,cols) == 3 || mod(theid-1,cols) == 4 || mod(theid-1,cols) == 5
|
||||
if nth == 12
|
||||
ylim( ax1, [y_begin y_end/2] );
|
||||
@@ -203,11 +166,6 @@ elseif 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
=======
|
||||
if 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 4000 <= x_end && x_end < 6000
|
||||
x_tick2 = x_end - 1000;
|
||||
@@ -244,20 +202,12 @@ if show_plot == 1 || theid == legend_plot_id
|
||||
set( leg,'Color','none' );
|
||||
set( leg,'Units','inches' );
|
||||
if impl == 'octave'
|
||||
<<<<<<< HEAD
|
||||
set( leg,'FontSize',fontsize );
|
||||
%set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
|
||||
set( leg,'Position',[18.80 10.60 1.9 0.95 ] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
|
||||
=======
|
||||
set( leg,'FontSize',fontsize );
|
||||
set( leg,'Position',[12.50 10.35 1.5 0.9 ] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
end
|
||||
set( leg,'Box','off' );
|
||||
set( leg,'Color','none' );
|
||||
@@ -274,7 +224,6 @@ box( ax1, 'on' );
|
||||
titl = title( titlename );
|
||||
set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.
|
||||
|
||||
<<<<<<< HEAD
|
||||
% The default is to align the plot title across whole figure, not the box.
|
||||
% This is a hack to nudge the title back to the center of the box.
|
||||
if impl == 'octave'
|
||||
@@ -300,19 +249,6 @@ if theid > (rows-1)*cols
|
||||
%tpos = get( xlab, 'Position' )
|
||||
%tpos(2) = tpos(2) + 10;
|
||||
%set( xlab, 'Position', tpos );
|
||||
=======
|
||||
if impl == 'octave'
|
||||
tpos = get( titl, 'Position' ); % default is to align across whole figure, not box.
|
||||
tpos(1) = tpos(1) + -40;
|
||||
set( titl, 'Position', tpos ); % here we nudge it back to centered with box.
|
||||
end
|
||||
|
||||
if theid > (rows-1)*cols
|
||||
%xlab = xlabel( ax1,xaxisname );
|
||||
%tpos = get( xlab, 'Position' )
|
||||
%tpos(2) = tpos(2) + 10;
|
||||
%set( xlab, 'Position', tpos );
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
if theid == rows*cols - 6
|
||||
xlab = xlabel( ax1, 'm = 6; n = k' );
|
||||
elseif theid == rows*cols - 5
|
||||
@@ -331,19 +267,8 @@ if theid > (rows-1)*cols
|
||||
end
|
||||
|
||||
if mod(theid-1,cols) == 0
|
||||
<<<<<<< HEAD
|
||||
ylab = ylabel( ax1,yaxisname );
|
||||
end
|
||||
|
||||
=======
|
||||
ylab = ylabel( ax1,yaxisname );
|
||||
end
|
||||
|
||||
%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' );
|
||||
%saveas( fig, filename_png );
|
||||
|
||||
%hold( ax1, 'off' );
|
||||
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
r_val = 0;
|
||||
|
||||
|
||||
@@ -102,7 +102,6 @@ for psize_col = 1:3
|
||||
end
|
||||
x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
|
||||
<<<<<<< HEAD
|
||||
% Compute the number of data points we have in the x-axis. Note that we
|
||||
% only use half the data points for the m = n = k column of graphs.
|
||||
%if mod(theid-1,cols) == 6
|
||||
@@ -111,15 +110,6 @@ x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
% np = size( data_blissup, 1 );
|
||||
%end
|
||||
np = size( data_blissup, 1 );
|
||||
=======
|
||||
% Compute the number of data points we have in the x-axis. Note that
|
||||
% we only use half the data points for the m = n = k column of graphs.
|
||||
if mod(theid-1,cols) == 6
|
||||
np = size( data_blissup, 1 ) / 2;
|
||||
else
|
||||
np = size( data_blissup, 1 );
|
||||
end
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
|
||||
has_xsmm = 1;
|
||||
if data_xsmm( 1, flopscol ) == 0.0
|
||||
@@ -188,7 +178,6 @@ end
|
||||
xlim( ax1, [x_begin x_end] );
|
||||
ylim( ax1, [y_begin y_end] );
|
||||
|
||||
<<<<<<< HEAD
|
||||
if 10000 <= x_end && x_end < 15000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
@@ -199,12 +188,6 @@ elseif 6000 <= x_end && x_end < 10000
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
xticks( ax1, [ 2000 4000 6000 8000 ] );
|
||||
=======
|
||||
if 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
>>>>>>> Merged test/sup, test/supmt into test/sup.
|
||||
elseif 4000 <= x_end && x_end < 6000
|
||||
x_tick2 = x_end - 1000;
|
||||
x_tick1 = x_tick2/2;
|
||||
|
||||
@@ -20,7 +20,7 @@ function r_val = plot_l3sup_perf( opname, ...
|
||||
%end
|
||||
|
||||
%legend_plot_id = 11;
|
||||
legend_plot_id = 0*cols + 1*6;
|
||||
legend_plot_id = 0*cols + 1*4;
|
||||
|
||||
if 1
|
||||
ax1 = subplot( rows, cols, theid );
|
||||
@@ -96,14 +96,13 @@ for psize_col = 1:3
|
||||
end
|
||||
x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
|
||||
% Compute the number of data points we have in the x-axis. Note that we
|
||||
% only use half the data points for the m = n = k column of graphs.
|
||||
%if mod(theid-1,cols) == 6
|
||||
% np = size( data_blissup, 1 ) / 2;
|
||||
%else
|
||||
% np = size( data_blissup, 1 );
|
||||
%end
|
||||
np = size( data_blissup, 1 );
|
||||
% Compute the number of data points we have in the x-axis. Note that
|
||||
% we only use quarter the data points for the m = n = k column of graphs.
|
||||
if mod(theid-1,cols) == 6
|
||||
np = size( data_blissup, 1 ) / 4;
|
||||
else
|
||||
np = size( data_blissup, 1 );
|
||||
end
|
||||
|
||||
% Grab the last x-axis value.
|
||||
x_end = data_blissup( np, psize_col );
|
||||
@@ -149,23 +148,9 @@ end
|
||||
xlim( ax1, [x_begin x_end] );
|
||||
ylim( ax1, [y_begin y_end] );
|
||||
|
||||
if mod(theid-1,cols) == 3 || mod(theid-1,cols) == 4 || mod(theid-1,cols) == 5
|
||||
if nth == 12
|
||||
ylim( ax1, [y_begin y_end/2] );
|
||||
elseif nth > 12
|
||||
ylim( ax1, [y_begin y_end/6] );
|
||||
end
|
||||
end
|
||||
|
||||
if 10000 <= x_end && x_end < 15000
|
||||
if 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
xticks( ax1, [ 4000 8000 12000 ] );
|
||||
elseif 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 4000 <= x_end && x_end < 6000
|
||||
x_tick2 = x_end - 1000;
|
||||
@@ -203,8 +188,7 @@ if show_plot == 1 || theid == legend_plot_id
|
||||
set( leg,'Units','inches' );
|
||||
if impl == 'octave'
|
||||
set( leg,'FontSize',fontsize );
|
||||
%set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
|
||||
set( leg,'Position',[18.80 10.60 1.9 0.95 ] ); % (1,4tl)
|
||||
set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
|
||||
else
|
||||
set( leg,'FontSize',fontsize-1 );
|
||||
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
|
||||
|
||||
@@ -102,14 +102,13 @@ for psize_col = 1:3
|
||||
end
|
||||
x_axis( :, 1 ) = data_blissup( :, psize_col );
|
||||
|
||||
% Compute the number of data points we have in the x-axis. Note that we
|
||||
% only use half the data points for the m = n = k column of graphs.
|
||||
%if mod(theid-1,cols) == 6
|
||||
% np = size( data_blissup, 1 ) / 2;
|
||||
%else
|
||||
% np = size( data_blissup, 1 );
|
||||
%end
|
||||
np = size( data_blissup, 1 );
|
||||
% Compute the number of data points we have in the x-axis. Note that
|
||||
% we only use half the data points for the m = n = k column of graphs.
|
||||
if mod(theid-1,cols) == 6
|
||||
np = size( data_blissup, 1 ) / 2;
|
||||
else
|
||||
np = size( data_blissup, 1 );
|
||||
end
|
||||
|
||||
has_xsmm = 1;
|
||||
if data_xsmm( 1, flopscol ) == 0.0
|
||||
@@ -178,16 +177,10 @@ end
|
||||
xlim( ax1, [x_begin x_end] );
|
||||
ylim( ax1, [y_begin y_end] );
|
||||
|
||||
if 10000 <= x_end && x_end < 15000
|
||||
if 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
xticks( ax1, [ 3000 6000 9000 12000 ] );
|
||||
elseif 6000 <= x_end && x_end < 10000
|
||||
x_tick2 = x_end - 2000;
|
||||
x_tick1 = x_tick2/2;
|
||||
%xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
xticks( ax1, [ 2000 4000 6000 8000 ] );
|
||||
xticks( ax1, [ x_tick1 x_tick2 ] );
|
||||
elseif 4000 <= x_end && x_end < 6000
|
||||
x_tick2 = x_end - 1000;
|
||||
x_tick1 = x_tick2/2;
|
||||
|
||||
Reference in New Issue
Block a user