Rebased amd-staging-milan-3.0 branch on master

-- Rebased on top of master commit # 6e522e5823
  -- Updated merged code to remove duplicated code added by auto-merging
  -- Updated merged code to rename bool_t type
  -- Updated merged code to rename bli_thread_obarrier
  -- Updated merged code to rename bli_thread_obroadcast

Change-Id: I39879f1ef3b42ecbe5808af3b559d88c36dbbf6c
AMD-Internal: [CPUPL-1067]
This commit is contained in:
dzambare
2020-08-03 12:59:37 +05:30
committed by Dipal M Zambare
parent 449ee37028
commit 392726d0e1
71 changed files with 30612 additions and 17255 deletions

View File

@@ -9683,7 +9683,7 @@ Date: Fri Feb 23 17:42:48 2018 -0600
CHANGELOG update (0.3.0)
commit 3defc7265c12cf85e9de2d7a1f243c5e090a6f9d (origin/master, origin/HEAD)
commit 3defc7265c12cf85e9de2d7a1f243c5e090a6f9d
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Fri Feb 23 17:38:19 2018 -0600
@@ -9719,7 +9719,7 @@ Date: Fri Feb 23 16:33:32 2018 -0600
contained. To remedy this situation, we now selectively use movss to
load any element that could be the last element in the matrix.
commit 5112e1859e7f8888f5555eb7bc02bd9fab9b4442 (origin/rt, rt)
commit 5112e1859e7f8888f5555eb7bc02bd9fab9b4442 (origin/rt)
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Fri Feb 23 14:31:26 2018 -0600
@@ -9951,7 +9951,7 @@ Date: Thu Jan 4 20:51:35 2018 -0600
time hardware detection (when clang is selected).
- Added some missing (but mostly-optional) quotes to configure script.
commit 5a7005dd44ed3174abbe360981e367fd41c99b4b (origin/amd, amd)
commit 5a7005dd44ed3174abbe360981e367fd41c99b4b
Merge: 7be88705 3bc99a96
Author: Nisanth M P <nisanth.padinharepatt@amd.com>
Date: Wed Jan 3 12:05:12 2018 +0530
@@ -10000,7 +10000,7 @@ Date: Sat Dec 23 15:32:03 2017 -0600
is used by the auto-detection script to printf() the name of the
sub-configuration corresponding to the detected hardware.
commit 9804adfd405056ec332bb8e13d68c7b52bd3a6c1 (origin/selfinit, selfinit)
commit 9804adfd405056ec332bb8e13d68c7b52bd3a6c1 (origin/selfinit)
Author: Field G. Van Zee <field@cs.utexas.edu>
Date: Thu Dec 21 19:22:57 2017 -0600

View File

@@ -5,7 +5,6 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,7 +5,6 @@
libraries.
Copyright (C) 2019, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,7 +5,6 @@
# libraries.
#
# Copyright (C) 2019, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are

View File

@@ -53,9 +53,7 @@ COPTFLAGS := -O3 -fomit-frame-pointer
endif
# Flags specific to optimized kernels.
# NOTE: The -fomit-frame-pointer option is needed for some kernels because
# they make explicit use of the rbp register.
CKOPTFLAGS := $(COPTFLAGS) #-fomit-frame-pointer
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
else

View File

@@ -39,229 +39,242 @@
void bli_cntx_init_zen( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen_ref( cntx );
// Set default kernel blocksizes and functions.
bli_cntx_init_zen_ref( cntx );
// -------------------------------------------------------------------------
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
8,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
8,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
10,
16,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
//set
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
cntx
);
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
//set
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
/*
<<<<<<< HEAD
Multi Instance performance improvement of DGEMM when binded to a CCX
In Multi instance each thread runs a sequential DGEMM.
a) If BLIS is run in a multi-instance mode with
CPU freq 2.6/2.2 Ghz
DDR4 clock frequency 2400Mhz
mc = 240, kc = 512, and nc = 2040
has better performance on EPYC server, over the default block sizes.
a) If BLIS is run in a multi-instance mode with
CPU freq 2.6/2.2 Ghz
DDR4 clock frequency 2400Mhz
mc = 240, kc = 512, and nc = 2040
has better performance on EPYC server, over the default block sizes.
b) If BLIS is run in Single Instance mode
mc = 510, kc = 1024 and nc = 4080
b) If BLIS is run in Single Instance mode
mc = 510, kc = 1024 and nc = 4080
*/
// Zen optmized level 3 cache block sizes
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
// Zen optmized level 3 cache block sizes
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
#endif
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
#endif
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// Update the context with the current architecture's register and cache
// blocksizes for level-3 TRSM execution.
bli_cntx_set_trsm_blkszs
(
5,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
// Update the context with the current architecture's register and cache
// blocksizes for level-3 TRSM execution.
bli_cntx_set_trsm_blkszs
(
5,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 220, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 );
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
BLIS_GEMM, bli_gemmsup_ref,
cntx
);
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
BLIS_GEMM, bli_gemmsup_ref,
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
28,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
14,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}

View File

@@ -31,12 +31,6 @@ To summarize: In order to observe multithreaded parallelism within a BLIS operat
1. Enable multithreading at configure-time. This is discussed in the [next section](docs/Multithreading.md#enabling-multithreading).
2. Specify multithreading at runtime. This is also discussed [later on](docs/Multithreading.md#specifying-multithreading).
**IMPORTANT**: Multithreading in BLIS is disabled by default. Furthermore, even when multithreading is enabled, BLIS will default to single-threaded execution at runtime. In order to both *allow* and *invoke* parallelism from within BLIS operations, you must both *enable* multithreading at configure-time and *specify* multithreading at runtime.
To summarize: In order to observe multithreaded parallelism within a BLIS operation, you must do *both* of the following:
1. Enable multithreading at configure-time. This is discussed in the [next section](docs/Multithreading.md#enabling-multithreading).
2. Specify multithreading at runtime. This is also dicussed [later on](docs/Multithreading.md#specifying-multithreading).
# Enabling multithreading
BLIS disables multithreading by default. In order to allow multithreaded parallelism from BLIS, you must first enable multithreading explicitly at configure-time.

View File

@@ -175,7 +175,7 @@ void PASTEMAC(opname,EX_SUF) \
\
/* If the rntm is non-NULL, it may indicate that we should forgo sup
handling altogether. */ \
bool_t enable_sup = TRUE; \
bool enable_sup = TRUE; \
if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); \
\
if ( enable_sup ) \

View File

@@ -57,7 +57,7 @@ err_t bli_gemmsup_int
#else
#endif
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
stor_id == BLIS_RRC ||
stor_id == BLIS_RCR ||
stor_id == BLIS_CRR );
@@ -88,25 +88,25 @@ err_t bli_gemmsup_int
return BLIS_FAILURE;
}
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
stor_id == BLIS_RRC ||
stor_id == BLIS_RCR ||
stor_id == BLIS_CRR );
const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
const num_t dt = bli_obj_dt( c );
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
: is_rcc_crc_ccr_ccc );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
const bool_t auto_factor = bli_rntm_auto_factor( rntm );
const bool auto_factor = bli_rntm_auto_factor( rntm );
const dim_t n_threads = bli_rntm_num_threads( rntm );
bool_t use_bp = TRUE;
bool use_bp = TRUE;
dim_t jc_new;
dim_t ic_new;
@@ -247,3 +247,215 @@ err_t bli_gemmsup_int
return BLIS_SUCCESS;
}
// bli_gemmtsup_int()
//
// Per-thread driver for the small/unpacked (sup) gemmt code path. It
// inspects the storage combination of c, a and b, decides whether the
// operation can be executed as-is or must be executed as its transpose,
// picks between the block-panel (var2m) and panel-block (var1n) reference
// variants, optionally recomputes the jc/ic thread factorization, and then
// calls the chosen variant.
//
// Parameters:
//   alpha, a, b, beta, c - operands, forwarded unchanged to the variant.
//   cntx   - context queried for the kernel's row/column preference and
//            for the MR/NR register blocksizes.
//   rntm   - runtime object; read for the auto-factor flag and thread count
//            and, when auto-factorization is enabled, updated in place with
//            new jc/ic ways of parallelism.
//   thread - the calling thread's thrinfo_t node; its root is refreshed
//            after rntm is updated.
//
// Returns:
//   BLIS_FAILURE if the storage combination is BLIS_XXX (general stride);
//   BLIS_SUCCESS after the selected variant has been invoked.
//
// NOTE(review): only the width of c is queried and it is used for both the
// "m" and "n" micropanel counts. This presumably relies on c being square
// for gemmt -- confirm against the callers.
err_t bli_gemmtsup_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4);
	AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_4, alpha, a, b, beta, c);

	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	// Don't use the small/unpacked implementation if one of the matrices
	// uses general stride.
	if ( stor_id == BLIS_XXX ) {
		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_4, "SUP doesn't support general stide.");
		return BLIS_FAILURE;
	}

	// Split the eight storage combinations into two complementary groups.
	const bool    is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	const bool    is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t   dt         = bli_obj_dt( c );
	const bool    row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	// The "primary" case is the one in which the storage group matches the
	// kernel's preference, so no transposition of the operation is needed.
	const bool    is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                      : is_rcc_crc_ccr_ccc );

	const dim_t   n           = bli_obj_width( c );
	const dim_t   MR          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
	const dim_t   NR          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
	const bool    auto_factor = bli_rntm_auto_factor( rntm );
	const dim_t   n_threads   = bli_rntm_num_threads( rntm );
	bool          use_bp      = TRUE;
	dim_t         jc_new;
	dim_t         ic_new;

	if ( is_primary )
	{
		// This branch handles:
		//  - rrr rrc rcr crr for row-preferential kernels
		//  - rcc crc ccr ccc for column-preferential kernels

		const dim_t mu = n / MR;
		const dim_t nu = n / NR;

		// Decide which algorithm to use (block-panel var2m or panel-block
		// var1n) based on the number of micropanels in the m and n dimensions.
		// Also, recalculate the automatic thread factorization.
		if         ( mu >= nu )    use_bp = TRUE;
		else /* if ( mu <  nu ) */ use_bp = FALSE;

		// If the parallel thread factorization was automatic, we update it
		// with a new factorization based on the matrix dimensions in units
		// of micropanels.
		if ( auto_factor )
		{
			if ( use_bp )
			{
				// In the block-panel algorithm, the m dimension is parallelized
				// with ic_nt and the n dimension is parallelized with jc_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
			}
			else // if ( !use_bp )
			{
				// In the panel-block algorithm, the m dimension is parallelized
				// with jc_nt and the n dimension is parallelized with ic_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
			}

			// Update the ways of parallelism for the jc and ic loops, and then
			// update the current thread's root thrinfo_t node according to the
			// new ways of parallelism value for the jc loop.
			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
			bli_l3_sup_thrinfo_update_root( rntm, thread );
		}

		if ( use_bp )
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
			printf( "bli_l3_sup_int(): var2m primary\n" );
			#endif
			// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
			bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
		else // use_pb
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
			printf( "bli_l3_sup_int(): var1n primary\n" );
			#endif
			// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
			bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
			// *requires nudging of nc up to be a multiple of mr.
		}
	}
	else
	{
		// This branch handles:
		//  - rrr rrc rcr crr for column-preferential kernels
		//  - rcc crc ccr ccc for row-preferential kernels

		// The operation is executed as its transpose in this branch. Both
		// micropanel counts are derived from the width of c, so they are
		// computed exactly as in the primary branch.
		const dim_t mu = n / MR;
		const dim_t nu = n / NR;

		// Decide which algorithm to use (block-panel var2m or panel-block
		// var1n) based on the number of micropanels in the m and n dimensions.
		// Also, recalculate the automatic thread factorization.
		if         ( mu >= nu )    use_bp = TRUE;
		else /* if ( mu <  nu ) */ use_bp = FALSE;

		// If the parallel thread factorization was automatic, we update it
		// with a new factorization based on the matrix dimensions in units
		// of micropanels.
		if ( auto_factor )
		{
			if ( use_bp )
			{
				// In the block-panel algorithm, the m dimension is parallelized
				// with ic_nt and the n dimension is parallelized with jc_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
			}
			else // if ( !use_bp )
			{
				// In the panel-block algorithm, the m dimension is parallelized
				// with jc_nt and the n dimension is parallelized with ic_nt.
				bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
			}

			// Update the ways of parallelism for the jc and ic loops, and then
			// update the current thread's root thrinfo_t node according to the
			// new ways of parallelism value for the jc loop.
			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
			bli_l3_sup_thrinfo_update_root( rntm, thread );
		}

		if ( use_bp )
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
			printf( "bli_l3_sup_int(): var2m non-primary\n" );
			#endif
			// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
			bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
		else // use_pb
		{
			#ifdef TRACEVAR
			if ( bli_thread_am_ochief( thread ) )
			printf( "bli_l3_sup_int(): var1n non-primary\n" );
			#endif
			// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
			bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
			// *requires nudging of mc up to be a multiple of nr.
		}
	}

	// Return success so that the caller knows that we computed the solution.
	// The trace macro is explicitly terminated so that this statement does
	// not depend on the macro supplying its own semicolon.
	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4);
	return BLIS_SUCCESS;
}

View File

@@ -43,3 +43,15 @@ err_t bli_gemmsup_int
rntm_t* rntm,
thrinfo_t* thread
);
// Prototype for the gemmt counterpart of bli_gemmsup_int(). The definition
// returns BLIS_FAILURE when the storage combination of c, a and b is
// BLIS_XXX (general stride), and BLIS_SUCCESS after invoking one of the
// bli_gemmtsup_ref_var* variants. rntm may be modified by the call (its
// jc/ic ways of parallelism are updated when auto-factorization is enabled).
err_t bli_gemmtsup_int
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);

View File

@@ -68,7 +68,7 @@ void PASTEMAC(ch,opname) \
\
/* Barrier to make sure all threads are caught up and ready to begin
the packm stage. */ \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block eneded. */ \
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -98,7 +98,7 @@ void PASTEMAC(ch,opname) \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -147,7 +147,7 @@ void PASTEMAC(ch,opname) \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -429,7 +429,7 @@ void PASTEMAC(ch,opname) \
} \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5);\
}

View File

@@ -38,7 +38,7 @@
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
bool will_pack, \
packbuf_t pack_buf_type, \
dim_t m, \
dim_t k, \
@@ -57,7 +57,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
\
void PASTEMAC(ch,opname) \
( \
bool_t did_pack, \
bool did_pack, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
@@ -71,7 +71,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
bool will_pack, \
stor3_t stor_id, \
pack_t* restrict schema, \
dim_t m, \
@@ -95,7 +95,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_a )
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
bool will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \

View File

@@ -68,7 +68,7 @@ void PASTEMAC(ch,opname) \
\
/* Barrier to make sure all threads are caught up and ready to begin
the packm stage. */ \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block eneded. */ \
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -98,7 +98,7 @@ void PASTEMAC(ch,opname) \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -147,7 +147,7 @@ void PASTEMAC(ch,opname) \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_obroadcast( thread, mem ); \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -429,7 +429,7 @@ void PASTEMAC(ch,opname) \
} \
\
/* Barrier so that packing is done before computation. */ \
bli_thread_obarrier( thread ); \
bli_thread_barrier( thread ); \
} \
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_5); \
}

View File

@@ -38,7 +38,7 @@
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
bool will_pack, \
packbuf_t pack_buf_type, \
dim_t k, \
dim_t n, \
@@ -57,7 +57,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
\
void PASTEMAC(ch,opname) \
( \
bool_t did_pack, \
bool did_pack, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
@@ -71,7 +71,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
bool will_pack, \
stor3_t stor_id, \
pack_t* restrict schema, \
dim_t k, \
@@ -95,7 +95,7 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_b )
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
bool will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \

View File

@@ -94,8 +94,8 @@ void PASTEMAC(ch,varname) \
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
bool_t row_stored = bli_is_col_packed( schema ); \
/*bool_t col_stored = bli_is_row_packed( schema );*/ \
bool row_stored = bli_is_col_packed( schema ); \
/*bool col_stored = bli_is_row_packed( schema );*/ \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
@@ -372,8 +372,8 @@ void PASTEMAC(ch,varname) \
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
bool_t col_stored = bli_is_col_packed( schema ); \
/*bool_t row_stored = bli_is_row_packed( schema );*/ \
bool col_stored = bli_is_col_packed( schema ); \
/*bool row_stored = bli_is_row_packed( schema );*/ \
\
if ( col_stored ) \
{ \

View File

@@ -90,14 +90,6 @@ err_t bli_gemmsup_ref
//bli_rntm_set_pack_a( 0, rntm );
//bli_rntm_set_pack_b( 0, rntm );
#endif
//bli_rntm_set_pack_a( 0, rntm );
//bli_rntm_set_pack_b( 0, rntm );
// May not need these here since packm_sup infers the schemas based
// on the stor3_t id. (This would also mean that they don't need to
// be passed into the thread decorator below.)
//pack_t schema_a = BLIS_PACKED_ROW_PANELS;
//pack_t schema_b = BLIS_PACKED_COL_PANELS;
return
bli_l3_sup_thread_decorator
@@ -114,3 +106,75 @@ err_t bli_gemmsup_ref
);
}
// Reference handler for the gemmt small/unpacked (sup) code path.
//
// Steps performed:
//   1. If error checking is enabled, validate the operands with
//      bli_gemm_check().
//   2. Set the ways of parallelism in rntm from the length and width of c
//      and the width of a.
//   3. Run bli_gemmtsup_int through the sup thread decorator, tagged with
//      the BLIS_GEMMT operation family.
//
// Returns the value produced by bli_l3_sup_thread_decorator().
err_t bli_gemmtsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	// NOTE for BLIS developers: this is the default handler. To use a
	// different one, register another function pointer in the context from
	// within your sub-configuration's bli_cntx_init_*() function.

	// Validate the operands, but only when error checking is turned on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_check( alpha, a, b, beta, c, cntx );
	}

#if 0
	// NOTE: This special case handling is done within the variants.

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		bli_scalm( beta, c );
		return BLIS_SUCCESS;
	}
#endif

	// Interpret the rntm_t object so that the ways of parallelism are set
	// for each loop, based on the dimensions of the problem.
	bli_rntm_set_ways_from_rntm_sup
	(
	  bli_obj_length( c ),
	  bli_obj_width( c ),
	  bli_obj_width( a ),
	  rntm
	);

#if 0
	printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( rntm ) );
	printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( rntm ) );

	//bli_rntm_set_pack_a( 0, rntm );
	//bli_rntm_set_pack_b( 0, rntm );
#endif

	// Hand the operation off to the thread decorator and remember its
	// result so that it can be reported to the caller.
	err_t r_val =
	bli_l3_sup_thread_decorator
	(
	  bli_gemmtsup_int,
	  BLIS_GEMMT, // operation family id
	  alpha,
	  a,
	  b,
	  beta,
	  c,
	  cntx,
	  rntm
	);

	return r_val;
}

View File

@@ -120,9 +120,6 @@ void bli_gemmsup_ref_var1n
const bool packa = bli_rntm_pack_a( rntm );
const bool packb = bli_rntm_pack_b( rntm );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
@@ -450,68 +447,6 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = m_local % NC; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. An alternative way of initializing the
mem_t entries is:
bli_mem_clear( &mem_a ); \
bli_mem_clear( &mem_b ); \
*/ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree.
NOTE: These bszid_t values, and their order, match that of the bp
algorithm (variant 2) because they are not used to query actual
blocksizes but rather query the ways of parallelism for the various
loops. For example, the 2nd loop in variant 1 partitions in the m
dimension (in increments of MR), but parallelizes that m dimension
with BLIS_JR_NT. The only difference is that the _packa and _packb
arrays have been adjusted for the semantic difference in order in
which packa and packb nodes are encountered in the thrinfo tree.
That is, this panel-block algorithm partitions an NC x KC submatrix
of A to be packed in the 4th loop, and a KC x MC submatrix of B
to be packed in the 3rd loop. */ \
/* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t* restrict bszids; \
\
/* Set the bszids pointer to the correct bszids array above based on which
matrices (if any) are being packed. */ \
if ( packa ) { if ( packb ) bszids = bszids_packab; \
else bszids = bszids_packa; } \
else { if ( packb ) bszids = bszids_packb; \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jc = bszids; \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \
const dim_t m_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = m_local % NC; \
\
/* Loop over the m dimension (NC rows/columns at a time). */ \
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -614,72 +549,6 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = n_local % MC; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing A. If we won't be packing A, we alias to
the _pc variables so that code further down can unconditionally
reference the _pa variables. Note that *if* we will be packing
A, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pa; \
if ( packa ) { bszids_pa = &bszids_pc[1]; \
thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
else { bszids_pa = &bszids_pc[0]; \
thread_pa = thread_pc; } \
\
/* Determine the packing buffer and related parameters for matrix
A. (If A will not be packed, then a_use will be set to point to
a and the _a_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id.
NOTE: packing matrix A in this panel-block algorithm corresponds
to packing matrix B in the block-panel algorithm. */ \
PASTEMAC(ch,packm_sup_a) \
( \
packa, \
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \
stor_id, /* a "panel of B". */ \
BLIS_NO_TRANSPOSE, \
NC, KC, /* This "panel of B" is (at most) NC x KC. */ \
nc_cur, kc_cur, MR, \
&one_local, \
a_pc, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_pc_use = a_use; \
\
/* We don't need to embed the panel stride of A within the auxinfo_t
object because this variant iterates through A in the jr loop,
which occurs here, within the macrokernel, not within the
millikernel. */ \
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_ic = &bszids_pa[1]; \
thread_ic = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \
const dim_t n_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = n_local % MC; \
\
/* Loop over the n dimension (MC rows at a time). */ \
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -765,81 +634,6 @@ void PASTEMAC(ch,varname) \
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing B. If we won't be packing B, we alias to
the _ic variables so that code further down can unconditionally
reference the _pb variables. Note that *if* we will be packing
B, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pb; \
if ( packb ) { bszids_pb = &bszids_ic[1]; \
thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
else { bszids_pb = &bszids_ic[0]; \
thread_pb = thread_ic; } \
\
/* Determine the packing buffer and related parameters for matrix
B. (If B will not be packed, then b_use will be set to point to
b and the _b_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id.
NOTE: packing matrix B in this panel-block algorithm corresponds
to packing matrix A in the block-panel algorithm. */ \
PASTEMAC(ch,packm_sup_b) \
( \
packb, \
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \
stor_id, /* a "block of A". */ \
BLIS_NO_TRANSPOSE, \
KC, MC, /* This "block of A" is (at most) KC x MC. */ \
kc_cur, mc_cur, NR, \
&one_local, \
b_ic, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_ic_use = b_use; \
\
/* Embed the panel stride of B within the auxinfo_t object. The
millikernel will query and use this to iterate through
micropanels of B. */ \
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jr = &bszids_pb[1]; \
thread_jr = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
dim_t jr_left = nc_cur % MR; \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* An optimization: allow the last jr iteration to contain up to MRE
rows of C and A. (If MRE > MR, the mkernel has agreed to handle
these cases.) Note that this prevents us from declaring jr_iter and
jr_left as const. NOTE: We forgo this optimization when packing A
since packing an extended edge case is not yet supported. */ \
if ( !packa && !is_mt ) \
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
{ \
jr_iter--; jr_left += MR; \
} \
\
/* Loop over the m dimension (NR columns at a time). */ \
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
@@ -882,7 +676,7 @@ void PASTEMAC(ch,varname) \
\
/* NOTE: This barrier is only needed if we are packing A (since
that matrix is packed within the pc loop of this variant). */ \
if ( packa ) bli_thread_obarrier( thread_pa ); \
if ( packa ) bli_thread_barrier( thread_pa ); \
} \
} \
\
@@ -976,9 +770,6 @@ void bli_gemmsup_ref_var2m
const bool packa = bli_rntm_pack_a( rntm );
const bool packb = bli_rntm_pack_b( rntm );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
@@ -1280,57 +1071,6 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. An alternative way of initializing the
mem_t entries is:
bli_mem_clear( &mem_a ); \
bli_mem_clear( &mem_b ); \
*/ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree. */ \
/* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t* restrict bszids; \
\
/* Set the bszids pointer to the correct bszids array above based on which
matrices (if any) are being packed. */ \
if ( packa ) { if ( packb ) bszids = bszids_packab; \
else bszids = bszids_packa; } \
else { if ( packb ) bszids = bszids_packb; \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jc = bszids; \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
const dim_t n_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -1431,70 +1171,6 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing B. If we won't be packing B, we alias to
the _pc variables so that code further down can unconditionally
reference the _pb variables. Note that *if* we will be packing
B, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pb; \
if ( packb ) { bszids_pb = &bszids_pc[1]; \
thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
else { bszids_pb = &bszids_pc[0]; \
thread_pb = thread_pc; } \
\
/* Determine the packing buffer and related parameters for matrix
B. (If B will not be packed, then b_use will be set to point to
b and the _b_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
PASTEMAC(ch,packm_sup_b) \
( \
packb, \
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \
stor_id, /* a "panel of B." */ \
BLIS_NO_TRANSPOSE, \
KC, NC, /* This "panel of B" is (at most) KC x NC. */ \
kc_cur, nc_cur, NR, \
&one_local, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\
/* We don't need to embed the panel stride of B within the auxinfo_t
object because this variant iterates through B in the jr loop,
which occurs here, within the macrokernel, not within the
millikernel. */ \
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_ic = &bszids_pb[1]; \
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
const dim_t m_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -1578,79 +1254,6 @@ void PASTEMAC(ch,varname) \
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing A. If we won't be packing A, we alias to
the _ic variables so that code further down can unconditionally
reference the _pa variables. Note that *if* we will be packing
A, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pa; \
if ( packa ) { bszids_pa = &bszids_ic[1]; \
thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
else { bszids_pa = &bszids_ic[0]; \
thread_pa = thread_ic; } \
\
/* Determine the packing buffer and related parameters for matrix
A. (If A will not be packed, then a_use will be set to point to
a and the _a_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
PASTEMAC(ch,packm_sup_a) \
( \
packa, \
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
stor_id, /* a "block of A." */ \
BLIS_NO_TRANSPOSE, \
MC, KC, /* This "block of A" is (at most) MC x KC. */ \
mc_cur, kc_cur, MR, \
&one_local, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_ic_use = a_use; \
\
/* Embed the panel stride of A within the auxinfo_t object. The
millikernel will query and use this to iterate through
micropanels of A (if needed). */ \
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jr = &bszids_pa[1]; \
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* An optimization: allow the last jr iteration to contain up to NRE
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
these cases.) Note that this prevents us from declaring jr_iter and
jr_left as const. NOTE: We forgo this optimization when packing B
since packing an extended edge case is not yet supported. */ \
if ( !packb && !is_mt ) \
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
{ \
jr_iter--; jr_left += NR; \
} \
\
/* Loop over the n dimension (NR columns at a time). */ \
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
@@ -1693,7 +1296,7 @@ void PASTEMAC(ch,varname) \
\
/* NOTE: This barrier is only needed if we are packing B (since
that matrix is packed within the pc loop of this variant). */ \
if ( packb ) bli_thread_obarrier( thread_pb ); \
if ( packb ) bli_thread_barrier( thread_pb ); \
} \
} \
\

View File

@@ -169,15 +169,15 @@ void bli_gemmt_front
// of the ccr or crc cases.
// Then, after the computation is complete, this matrix will be copied
// or accumulated back to C.
const bool_t is_ccr_mismatch =
const bool is_ccr_mismatch =
( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
!bli_obj_is_col_stored( &c_local ) );
const bool_t is_crc_mismatch =
const bool is_crc_mismatch =
( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
!bli_obj_is_row_stored( &c_local ) );
obj_t ct;
bool_t use_ct = FALSE;
bool use_ct = FALSE;
// FGVZ: Consider adding another guard here that only creates and uses a
// temporary matrix for accumulation if k < c * kc, where c is some small
@@ -284,24 +284,24 @@ void bli_gemmt_front
bli_obj_dt( a ) != bli_obj_dt( c ) ||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
{
const bool_t a_is_real = bli_obj_is_real( a );
const bool_t a_is_comp = bli_obj_is_complex( a );
const bool_t b_is_real = bli_obj_is_real( b );
const bool_t b_is_comp = bli_obj_is_complex( b );
const bool_t c_is_real = bli_obj_is_real( c );
const bool_t c_is_comp = bli_obj_is_complex( c );
const bool a_is_real = bli_obj_is_real( a );
const bool a_is_comp = bli_obj_is_complex( a );
const bool b_is_real = bli_obj_is_real( b );
const bool b_is_comp = bli_obj_is_complex( b );
const bool c_is_real = bli_obj_is_real( c );
const bool c_is_comp = bli_obj_is_complex( c );
const bool_t a_is_single = bli_obj_is_single_prec( a );
const bool_t a_is_double = bli_obj_is_double_prec( a );
const bool_t b_is_single = bli_obj_is_single_prec( b );
const bool_t b_is_double = bli_obj_is_double_prec( b );
const bool_t c_is_single = bli_obj_is_single_prec( c );
const bool_t c_is_double = bli_obj_is_double_prec( c );
const bool a_is_single = bli_obj_is_single_prec( a );
const bool a_is_double = bli_obj_is_double_prec( a );
const bool b_is_single = bli_obj_is_single_prec( b );
const bool b_is_double = bli_obj_is_double_prec( b );
const bool c_is_single = bli_obj_is_single_prec( c );
const bool c_is_double = bli_obj_is_double_prec( c );
const bool_t comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
const bool_t comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
const bool comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
const bool comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
const bool_t mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
const bool mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
bli_obj_domain( c ) != bli_obj_domain( b );
( void )a_is_real; ( void )a_is_comp;

View File

@@ -124,7 +124,7 @@ void bli_gemmt_ker_var2
FUNCPTR_T f;
bool_t uploc;
bool uploc;
if ( bli_obj_is_lower( c ) )
{
uploc = 0;
@@ -251,7 +251,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
@@ -277,7 +277,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
* diagonal is reached, and is used to determine path during
* next iterations of loop
*/ \
bool_t flag = 0; \
bool flag = 0; \
auxinfo_t aux; \
\
/*
@@ -545,7 +545,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\

View File

@@ -38,8 +38,8 @@
typedef void (*FUNCPTR_T)
(
bool_t packa,
bool_t packb,
bool packa,
bool packb,
conj_t conja,
conj_t conjb,
dim_t m,
@@ -116,8 +116,8 @@ void bli_gemmtsup_ref_var1n
#else
const num_t dt = bli_obj_dt( c );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const bool packa = bli_rntm_pack_a( rntm );
const bool packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
@@ -162,7 +162,7 @@ void bli_gemmtsup_ref_var1n
cs_b = bli_obj_row_stride( b );
}
bool_t uploc;
bool uploc;
if( bli_obj_is_lower( c ) )
{
uploc = 0;
@@ -246,8 +246,8 @@ void bli_gemmtsup_ref_var1n
\
void PASTEMACT(ch,opname,uplo,varname) \
( \
bool_t packa, \
bool_t packb, \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
@@ -434,7 +434,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
@@ -720,8 +720,8 @@ INSERT_GENTFUNC_L( gemmtsup, ref_var1n )
\
void PASTEMACT(ch,opname,uplo,varname) \
( \
bool_t packa, \
bool_t packb, \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
@@ -908,7 +908,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
@@ -1250,8 +1250,8 @@ void bli_gemmtsup_ref_var2m
#else
const num_t dt = bli_obj_dt( c );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const bool packa = bli_rntm_pack_a( rntm );
const bool packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
@@ -1296,7 +1296,7 @@ void bli_gemmtsup_ref_var2m
cs_b = bli_obj_row_stride( b );
}
bool_t uploc;
bool uploc;
if ( bli_is_notrans ( trans ) )
uploc = bli_obj_is_lower( c ) ? 0 : 1;
@@ -1377,8 +1377,8 @@ void bli_gemmtsup_ref_var2m
\
void PASTEMACT(ch,opname,uplo,varname) \
( \
bool_t packa, \
bool_t packb, \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
@@ -1503,7 +1503,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
Since update routines only support row-major order,
col_pref flag is used to induce transpose to matrices before
passing to update routine whenever C is col-stored */ \
const bool_t col_pref = (rs_c == 1)? 1 : 0; \
const bool col_pref = (rs_c == 1)? 1 : 0; \
\
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
@@ -1553,7 +1553,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
@@ -1893,8 +1893,8 @@ INSERT_GENTFUNC_L( gemmtsup, ref_var2m )
\
void PASTEMACT(ch,opname,uplo,varname) \
( \
bool_t packa, \
bool_t packb, \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
@@ -2019,7 +2019,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
Since update routines only support row-major order,
col_pref flag is used to induce transpose to matrices before
passing to update routine whenever C is col-stored */ \
const bool_t col_pref = (rs_c == 1) ? 1 : 0; \
const bool col_pref = (rs_c == 1) ? 1 : 0; \
\
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
@@ -2071,7 +2071,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool_t is_mt = bli_rntm_calc_num_threads( rntm ); \
const bool is_mt = bli_rntm_calc_num_threads( rntm ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \

View File

@@ -107,8 +107,8 @@ GENPROT( gemmtsup_ref_var2m )
\
void PASTEMACT(ch,opname,uplo,varname) \
( \
bool_t packa, \
bool_t packb, \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \

View File

@@ -5,7 +5,6 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -238,20 +238,20 @@ BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to
);
#endif
BLIS_EXPORT_BLIS void bli_blksz_reduce_def_to
void bli_blksz_reduce_def_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
BLIS_EXPORT_BLIS void bli_blksz_reduce_max_to
void bli_blksz_reduce_max_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
// -----------------------------------------------------------------------------
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize
dim_t bli_determine_blocksize
(
dir_t direct,
dim_t i,
@@ -261,7 +261,7 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize
cntx_t* cntx
);
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f
dim_t bli_determine_blocksize_f
(
dim_t i,
dim_t dim,
@@ -270,7 +270,7 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f
cntx_t* cntx
);
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_b
dim_t bli_determine_blocksize_b
(
dim_t i,
dim_t dim,
@@ -312,7 +312,6 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_trsm_b
#endif
dim_t bli_determine_blocksize_f_sub
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f_sub
(
dim_t i,
dim_t dim,
@@ -320,7 +319,7 @@ BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_f_sub
dim_t b_max
);
BLIS_EXPORT_BLIS dim_t bli_determine_blocksize_b_sub
dim_t bli_determine_blocksize_b_sub
(
dim_t i,
dim_t dim,

View File

@@ -59,9 +59,6 @@ double bli_clock_min_diff( double time_min, double time_start )
// - under a nanosecond
// is actually garbled due to the clocks being taken too closely together.
if ( time_min <= 0.0 ) time_min = time_min_prev;
// To genuinely measure time for an application taking more than an hour, the below
// line is commented. If wrongly measuring higher time we could always use previous_min.
/* else if ( time_min > 3600.0 ) time_min = time_min_prev; */
else if ( time_min < 1.0e-9 ) time_min = time_min_prev;
return time_min;

View File

@@ -84,6 +84,10 @@ BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
return cntx->bmults;
}
// Query the address of the trsm blksz_t buffer stored in the context.
BLIS_INLINE blksz_t* bli_cntx_trsm_blkszs_buf( cntx_t* cntx )
{
	blksz_t* trsm_blkszs = cntx->trsm_blkszs;

	return trsm_blkszs;
}
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
return cntx->l3_vir_ukrs;
@@ -333,6 +337,16 @@ BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx )
return blksz;
}
// Look up one entry of the context's trsm blocksize buffer.
//
//   bs_id  index of the desired blksz_t within the buffer.
//   cntx   context whose trsm blocksize buffer is queried.
//
// Returns the address of the blksz_t identified by bs_id (not a copy).
BLIS_INLINE blksz_t* bli_cntx_get_trsm_blksz( bszid_t bs_id, cntx_t* cntx )
{
	// Offset the start of the trsm blocksize buffer by bs_id elements.
	return bli_cntx_trsm_blkszs_buf( cntx ) + bs_id;
}
BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx );

View File

@@ -264,7 +264,7 @@ bool bli_cpuid_is_penryn
}
// -----------------------------------------------------------------------------
bool_t bli_cpuid_is_zen3
bool bli_cpuid_is_zen3
(
uint32_t family,
uint32_t model,
@@ -283,7 +283,7 @@ bool_t bli_cpuid_is_zen3
// Finally, check for specific models:
// - 0x00-0xff (THIS NEEDS UPDATING)
const bool_t is_arch
const bool is_arch
=
( 0x00 <= model && model <= 0xff );
@@ -292,7 +292,7 @@ bool_t bli_cpuid_is_zen3
return TRUE;
}
bool_t bli_cpuid_is_zen2
bool bli_cpuid_is_zen2
(
uint32_t family,
uint32_t model,

View File

@@ -37,10 +37,10 @@
// -----------------------------------------------------------------------------
dim_t bli_env_get_var( const char* env, dim_t fallback )
gint_t bli_env_get_var( const char* env, gint_t fallback )
{
dim_t r_val;
char* str;
gint_t r_val;
char* str;
// Query the environment variable and store the result in str.
str = getenv( env );
@@ -50,7 +50,7 @@ dim_t bli_env_get_var( const char* env, dim_t fallback )
{
// If there was no error, convert the string to an integer and
// prepare to return that integer.
r_val = strtol( str, NULL, 10 );
r_val = ( gint_t )strtol( str, NULL, 10 );
}
else
{

View File

@@ -37,7 +37,7 @@
#ifndef BLIS_ENV_H
#define BLIS_ENV_H
dim_t bli_env_get_var( const char* env, dim_t fallback );
gint_t bli_env_get_var( const char* env, gint_t fallback );
//void bli_env_set_var( const char* env, dim_t value );
#endif

View File

@@ -77,7 +77,7 @@ dim_t bli_pack_get_pack_b( void )
// ----------------------------------------------------------------------------
void bli_pack_set_pack_a( bool_t pack_a )
void bli_pack_set_pack_a( bool pack_a )
{
// We must ensure that global_rntm has been initialized.
bli_init_once();
@@ -93,7 +93,7 @@ void bli_pack_set_pack_a( bool_t pack_a )
// ----------------------------------------------------------------------------
void bli_pack_set_pack_b( bool_t pack_b )
void bli_pack_set_pack_b( bool pack_b )
{
// We must ensure that global_rntm has been initialized.
bli_init_once();
@@ -118,25 +118,25 @@ void bli_pack_init_rntm_from_env
// function is only called from bli_pack_init(), which is only called
// by bli_init_once().
bool_t pack_a;
bool_t pack_b;
bool pack_a;
bool pack_b;
#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING
// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
// -1 if it is unset.
pack_a = bli_env_get_var( "BLIS_PACK_A", -1 );
pack_b = bli_env_get_var( "BLIS_PACK_B", -1 );
gint_t pack_a_env = bli_env_get_var( "BLIS_PACK_A", -1 );
gint_t pack_b_env = bli_env_get_var( "BLIS_PACK_B", -1 );
// Enforce the default behavior first, then check for affirmative FALSE, and
// finally assume anything else is TRUE.
if ( pack_a == -1 ) pack_a = FALSE; // default behavior
else if ( pack_a == 0 ) pack_a = FALSE; // zero is FALSE
else pack_a = TRUE; // anything else is TRUE
if ( pack_a_env == -1 ) pack_a = FALSE; // default behavior
else if ( pack_a_env == 0 ) pack_a = FALSE; // zero is FALSE
else pack_a = TRUE; // anything else is TRUE
if ( pack_b == -1 ) pack_b = FALSE; // default behavior
else if ( pack_b == 0 ) pack_b = FALSE; // zero is FALSE
else pack_b = TRUE; // anything else is TRUE
if ( pack_b_env == -1 ) pack_b = FALSE; // default behavior
else if ( pack_b_env == 0 ) pack_b = FALSE; // zero is FALSE
else pack_b = TRUE; // anything else is TRUE
#else

View File

@@ -40,8 +40,8 @@ void bli_pack_finalize( void );
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void );
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool_t pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool_t pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );
void bli_pack_init_rntm_from_env( rntm_t* rntm );

View File

@@ -285,11 +285,11 @@ BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm )
BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm )
{
bli_rntm_set_pack_a( TRUE, rntm );
bli_rntm_set_pack_a( FALSE, rntm );
}
BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm )
{
bli_rntm_set_pack_b( TRUE, rntm );
bli_rntm_set_pack_b( FALSE, rntm );
}
BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
{
@@ -309,8 +309,8 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
.auto_factor = TRUE, \
.num_threads = -1, \
.thrloop = { -1, -1, -1, -1, -1, -1 }, \
.pack_a = TRUE, \
.pack_b = TRUE, \
.pack_a = FALSE, \
.pack_b = FALSE, \
.l3_sup = TRUE, \
.sba_pool = NULL, \
.membrk = NULL, \

View File

@@ -584,15 +584,15 @@ BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 )
// offset-related
static bool_t bli_gemmt_is_strictly_below_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
BLIS_INLINE bool bli_gemmt_is_strictly_below_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
{
return ( bool_t )
return ( bool )
( ( n_off + n - 1 ) < m_off );
}
static bool_t bli_gemmt_is_strictly_above_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
BLIS_INLINE bool bli_gemmt_is_strictly_above_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
{
return ( bool_t )
return ( bool )
( ( m_off + m - 1 ) < n_off );
}
// diag offset-related

View File

@@ -65,14 +65,13 @@ err_t bli_l3_sup_thread_decorator
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we can create the global comm below.
// Access the pool_t* for thread 0 and embed it into the rntm.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm.
bli_membrk_rntm_set_membrk( rntm );
#if 0
#ifndef SKIP_THRINFO_TREE
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
#endif
@@ -83,10 +82,7 @@ err_t bli_l3_sup_thread_decorator
// it was already copied in one of the high-level oapi functions.
rntm_t* restrict rntm_p = rntm;
cntl_t* cntl_use = NULL;
//thrinfo_t* thread = NULL;
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
// There is only one thread id (for the thief thread).
const dim_t tid = 0;
// Use the thread id to access the appropriate pool_t* within the
@@ -97,24 +93,22 @@ err_t bli_l3_sup_thread_decorator
// this is redundant since it's already been done above.
//bli_sba_rntm_set_pool( tid, array, rntm_p );
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
// need to alias objects for A, B, and C since they were already aliased
// in bli_*_front(). However, we may add aliasing here in the future so
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
// consistently providing local aliases, we can then eliminate aliasing
// elsewhere.
// Create a default control tree for the operation, if needed.
//bli_l3_cntl_create_if( family, schema_a, schema_b,
// a, b, c, rntm_p, cntl, &cntl_use );
#if 0
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
#ifndef SKIP_THRINFO_TREE
thrinfo_t* thread = NULL;
// Create the root node of the thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
#endif
bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread );
#else
// This optimization allows us to use one of the global thrinfo_t
// objects for single-threaded execution rather than grow one from
// scratch. The key is that bli_thrinfo_sup_grow(), which is called
// from within the variants, will immediately return if it detects
// that the thrinfo_t* passed into it is either
// &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED.
thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED;
( void )tid;
#endif
func
(
@@ -125,17 +119,12 @@ err_t bli_l3_sup_thread_decorator
c,
cntx,
rntm_p,
cntl_use,
thread
);
#if 0
// Free the thread's local control tree.
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
#ifndef SKIP_THRINFO_TREE
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
bli_l3_sup_thrinfo_free( rntm_p, thread );
#endif
}

View File

@@ -1062,6 +1062,8 @@ void bli_thread_partition_2x2
{
*nt1 = ( work1 >= work2 ? n_thread : 1 );
*nt2 = ( work1 < work2 ? n_thread : 1 );
return;
}
*nt1 = 1;

View File

@@ -205,7 +205,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
// Broadcast the temporary array to all threads in the parent's
// communicator.
new_comms = bli_thread_obroadcast( thread_par, new_comms );
new_comms = bli_thread_broadcast( thread_par, new_comms );
// Chiefs in the child communicator allocate the communicator
// object and store it in the array element corresponding to the
@@ -213,7 +213,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
if ( child_comm_id == 0 )
new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in );
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// All threads create a new thrinfo_t node using the communicator
// that was created by their chief, as identified by parent_work_id.
@@ -229,7 +229,7 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl
NULL // sub_node
);
bli_thread_obarrier( thread_par );
bli_thread_barrier( thread_par );
// The parent's chief thread frees the temporary array of thrcomm_t
// pointers.

View File

@@ -5,7 +5,6 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without

View File

@@ -4,11 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
<<<<<<< HEAD
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
=======
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
>>>>>>> Merged BLIS Release 1.3
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without

View File

@@ -4,11 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
<<<<<<< HEAD
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
=======
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
>>>>>>> Merged BLIS Release 1.3
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without

View File

@@ -4,8 +4,8 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,8 +4,8 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -233,7 +233,7 @@ static err_t bli_sgemm_small
float* restrict beta_cast = bli_obj_buffer_for_1x1( dt_exec, beta );
/*Beta Zero Check*/
bool_t is_beta_non_zero=0;
bool is_beta_non_zero=0;
if ( !bli_obj_equals( beta, &BLIS_ZERO ) ){
is_beta_non_zero = 1;
}
@@ -1805,7 +1805,7 @@ static err_t bli_dgemm_small
//checking whether beta value is zero.
//if true, we should perform C=alpha * A*B operation
//instead of C = beta * C + alpha * (A * B)
bool_t is_beta_non_zero = 0;
bool is_beta_non_zero = 0;
if(!bli_obj_equals(beta, &BLIS_ZERO))
is_beta_non_zero = 1;
@@ -3362,7 +3362,7 @@ static err_t bli_sgemm_small_atbn
float* restrict beta_cast = bli_obj_buffer_for_1x1( dt_exec, beta );
/*Beta Zero Check*/
bool_t is_beta_non_zero=0;
bool is_beta_non_zero=0;
if ( !bli_obj_equals( beta, &BLIS_ZERO ) ){
is_beta_non_zero = 1;
}
@@ -3843,7 +3843,7 @@ static err_t bli_dgemm_small_atbn
//check if beta is zero
//if true, we need to perform C = alpha * (A * B)
//instead of C = beta * C + alpha * (A * B)
bool_t is_beta_non_zero = 0;
bool is_beta_non_zero = 0;
if(!bli_obj_equals(beta,&BLIS_ZERO))
is_beta_non_zero = 1;

29812
output.testsuite Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,2 +1,2 @@
2
2.1
3
0.0

View File

@@ -1,9 +0,0 @@
% tx2
plot_panel_4x5(2.2,8,1, '../results/tx2/st', 'tx2', 'ARMPL'); close; clear all;
plot_panel_4x5(2.2,8,28,'../results/tx2/jc4ic7','tx2_jc4ic7','ARMPL'); close; clear all;
plot_panel_4x5(2.2,8,56,'../results/tx2/jc8ic7','tx2_jc8ic7','ARMPL'); close; clear all;
% skx
plot_panel_4x5(2.0,32,1,'../results/skx/st/20190218','skx','MKL'); close; clear all;
plot_panel_4x5(2.0,32,26,'../results/skx/jc2ic13/20190218','skx_jc2ic13','MKL'); close; clear all;
plot_panel_4x5(2.0,32,52,'../results/skx/jc4ic13/20190218','skx_jc4ic13','MKL'); close; clear all;

View File

@@ -1,35 +0,0 @@
% tx2
plot_panel_4x5(2.20,8,1, 'st','../results/tx2/20190205/st', 'tx2', 'ARMPL'); close; clear all;
plot_panel_4x5(2.20,8,28,'1s','../results/tx2/20190205/jc4ic7','tx2_jc4ic7','ARMPL'); close; clear all;
plot_panel_4x5(2.20,8,56,'2s','../results/tx2/20190205/jc8ic7','tx2_jc8ic7','ARMPL'); close; clear all;
% skx
% pre-eigen:
%plot_panel_4x5(2.00,32,1, 'st','../results/skx/20190306/st', 'skx', 'MKL'); close; clear all;
%plot_panel_4x5(2.00,32,26,'1s','../results/skx/20190306/jc2ic13','skx_jc2ic13','MKL'); close; clear all;
%plot_panel_4x5(2.00,32,52,'2s','../results/skx/20190306/jc4ic13','skx_jc4ic13','MKL'); close; clear all;
% with eigen:
plot_panel_4x5(2.00,32,1, 'st','../results/skx/merged20190306_0328/st', 'skx', 'MKL',1); close; clear all;
plot_panel_4x5(2.00,32,26,'1s','../results/skx/merged20190306_0328/jc2ic13','skx_jc2ic13','MKL',1); close; clear all;
plot_panel_4x5(2.00,32,52,'2s','../results/skx/merged20190306_0328/jc4ic13','skx_jc4ic13','MKL',1); close; clear all;
% has
% pre-eigen:
%plot_panel_4x5(3.25,16,1, 'st','../results/has/20190206/st', 'has', 'MKL',1); close; clear all;
%plot_panel_4x5(3.00,16,12,'1s','../results/has/20190206/jc2ic3jr2','has_jc2ic3jr2','MKL',1); close; clear all;
%plot_panel_4x5(3.00,16,24,'2s','../results/has/20190206/jc4ic3jr2','has_jc4ic3jr2','MKL',1); close; clear all;
% with eigen:
plot_panel_4x5(3.25,16,1, 'st','../results/has/merged20190206_0328/st', 'has', 'MKL',1); close; clear all;
plot_panel_4x5(3.00,16,12,'1s','../results/has/merged20190206_0328/jc2ic3jr2','has_jc2ic3jr2','MKL',1); close; clear all;
plot_panel_4x5(3.00,16,24,'2s','../results/has/merged20190206_0328/jc4ic3jr2','has_jc4ic3jr2','MKL',1); close; clear all;
% epyc
% pre-eigen:
%plot_panel_4x5(3.00,8,1, 'st','../results/epyc/merged201903_0619/st','epyc', 'MKL'); close; clear all;
%plot_panel_4x5(2.55,8,32,'1s','../results/epyc/merged201903_0619/jc1ic8jr4','epyc_jc1ic8jr4','MKL'); close; clear all;
%plot_panel_4x5(2.55,8,64,'2s','../results/epyc/merged201903_0619/jc2ic8jr4','epyc_jc2ic8jr4','MKL'); close; clear all;
% with eigen:
plot_panel_4x5(3.00,8,1, 'st','../results/epyc/merged20190306_0319_0328/st', 'epyc', 'MKL',1); close; clear all;
plot_panel_4x5(2.55,8,32,'1s','../results/epyc/merged20190306_0319_0328/jc1ic8jr4','epyc_jc1ic8jr4','MKL',1); close; clear all;
plot_panel_4x5(2.55,8,64,'2s','../results/epyc/merged20190306_0319_0328/jc2ic8jr4','epyc_jc2ic8jr4','MKL',1); close; clear all;

View File

@@ -5,7 +5,6 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are

View File

@@ -5,7 +5,6 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -5,7 +5,6 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -4,8 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are

View File

@@ -505,7 +505,6 @@ eigen-mt: check-env $(EIGEN_MT_BINS)
openblas-mt: check-env $(OPENBLAS_MT_BINS)
vendor-mt: check-env $(VENDOR_MT_BINS)
# -- Multithreaded --
# --- Object file rules --------------------------------------------------------
@@ -541,29 +540,6 @@ $(eval $(call make-st-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(ld),$(imp
# -- Multithreaded BLAS --
# -- Multithreaded BLAS --
# Define the function that will be used to instantiate compilation rules
# for the various multithreaded implementations.
define make-mt-rule
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
$(CC) $(CFLAGS) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
endef
# Instantiate the rule function make-mt-rule() for each BLIS/BLAS/CBLAS
# implementation.
$(foreach dt,$(DTS), \
$(foreach tr,$(TRANS), \
$(foreach st,$(STORS), \
$(foreach sh,$(SHAPES), \
$(foreach sm,$(SMS_MT), \
$(foreach sn,$(SNS_MT), \
$(foreach sk,$(SKS_MT), \
$(foreach impl,$(BIMPLS_MT), \
$(eval $(call make-mt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
# -- Single-threaded Eigen --
# Define the function that will be used to instantiate compilation rules
# for the various multithreaded implementations.
define make-mt-rule
@@ -626,26 +602,6 @@ $(foreach ld,$(LDIMS), \
$(foreach impl,$(EIMPLS), \
$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(ld),$(impl))))))))))))
# -- Multithreaded Eigen --
# Define the function that will be used to instantiate compilation rules
# for the multithreaded Eigen implementation.
define make-eigmt-rule
test_$(1)gemm_$(call stripu,$(2))_$(call stripu,$(3))_$(call get-shape-dim-str,$(4),$(5),$(6),$(7))_$(8)_mt.o: test_gemm.c Makefile
$(CXX) $(CXXFLAGS_MT) $(ERRCHK) $(N_TRIALS) $(call get-pdefs,$(4)) $(call get-dt-cpp,$(1)) $(call get-tra-defs,$(2)) $(call get-sto-defs,$(3)) $(call get-shape-defs,$(4),$(5),$(6),$(7)) $(call get-imp-defs,$(8)) $(STR_MT) -c $$< -o $$@
endef
# Instantiate the rule function make-eigmt-rule() for each Eigen implementation.
$(foreach dt,$(DTS), \
$(foreach tr,$(TRANS), \
$(foreach st,$(STORS), \
$(foreach sh,$(SHAPES), \
$(foreach sm,$(SMS_MT), \
$(foreach sn,$(SNS_MT), \
$(foreach sk,$(SKS_MT), \
$(foreach impl,$(EIMPLS), \
$(eval $(call make-eigmt-rule,$(dt),$(tr),$(st),$(sh),$(sm),$(sn),$(sk),$(impl)))))))))))
# --- Executable file rules ----------------------------------------------------

View File

@@ -1,29 +1,49 @@
function r_val = plot_l3sup_perf( opname, ...
smalldims, ...
data_blissup, ...
data_blislpab, ...
data_blisconv, ...
data_eigen, ...
data_open, ...
data_vend, vend_str, ...
data_bfeo, ...
data_xsmm, ...
data_vend, vend_str, ...
nth, ...
rows, cols, ...
cfreq, ...
dfps, ...
theid, impl )
%if ... %mod(theid-1,cols) == 2 || ...
% ... %mod(theid-1,cols) == 3 || ...
% ... %mod(theid-1,cols) == 4 || ...
% 0 == 1 ... %theid >= 19
% show_plot = 0;
% Define the column in which the performance rates are found.
flopscol = size( data_blissup, 2 );
% Check if blasfeo data is available.
has_bfeo = 1;
if data_bfeo( 1, flopscol ) == 0.0
has_bfeo = 0;
end
% Check if libxsmm data is available.
has_xsmm = 1;
if data_xsmm( 1, flopscol ) == 0.0
has_xsmm = 0;
end
% Define which plot id will have the legend.
% NOTE: We can draw the legend on any graph as long as it has already been
% rendered. Since the coordinates are global, we can simply always wait until
% the final graph to draw the legend.
%if nth == 1
% if has_xsmm == 1
% legend_plot_id = 2*cols + 1*5;
% else
% legend_plot_id = 1*cols + 1*5;
% end
%else
show_plot = 1;
% legend_plot_id = 0*cols + 1*6;
%end
legend_plot_id = cols*rows;
%legend_plot_id = 11;
legend_plot_id = 2*cols + 1*5;
% Hold the axes.
if 1
ax1 = subplot( rows, cols, theid );
hold( ax1, 'on' );
@@ -31,12 +51,12 @@ end
% Set line properties.
color_blissup = 'k'; lines_blissup = '-'; markr_blissup = '';
color_blislpab = 'k'; lines_blislpab = ':'; markr_blislpab = '';
color_blisconv = 'k'; lines_blisconv = ':'; markr_blisconv = '';
color_eigen = 'm'; lines_eigen = '-.'; markr_eigen = 'o';
color_open = 'r'; lines_open = '--'; markr_open = 'o';
color_vend = 'b'; lines_vend = '-.'; markr_vend = '.';
color_bfeo = 'c'; lines_bfeo = '-'; markr_bfeo = 'o';
color_xsmm = 'g'; lines_xsmm = '-'; markr_xsmm = 'o';
color_vend = 'b'; lines_vend = '-.'; markr_vend = '.';
% Compute the peak performance in terms of the number of double flops
% executable per cycle and the clock rate.
@@ -55,15 +75,13 @@ titlename = '%s';
titlename = sprintf( titlename, title_opname );
% Set the legend strings.
blissup_legend = sprintf( 'BLIS sup' );
blislpab_legend = sprintf( 'BLIS conv' );
eigen_legend = sprintf( 'Eigen' );
open_legend = sprintf( 'OpenBLAS' );
bfeo_legend = sprintf( 'BLASFEO' );
xsmm_legend = sprintf( 'libxsmm' );
%vend_legend = sprintf( 'MKL' );
%vend_legend = sprintf( 'ARMPL' );
vend_legend = vend_str;
blissup_lg = sprintf( 'BLIS sup' );
blisconv_lg = sprintf( 'BLIS conv' );
eigen_lg = sprintf( 'Eigen' );
open_lg = sprintf( 'OpenBLAS' );
vend_lg = vend_str;
bfeo_lg = sprintf( 'BLASFEO' );
xsmm_lg = sprintf( 'libxsmm' );
% Set axes range values.
y_scale = 1.00;
@@ -81,7 +99,6 @@ end
%flopscol = 4;
flopscol = size( data_blissup, 2 );
msize = 5;
if 1
fontsize = 12;
@@ -111,67 +128,44 @@ x_axis( :, 1 ) = data_blissup( :, psize_col );
%end
np = size( data_blissup, 1 );
has_xsmm = 1;
if data_xsmm( 1, flopscol ) == 0.0
has_xsmm = 0;
end
% Grab the last x-axis value.
x_end = data_blissup( np, psize_col );
%data_peak( 1, 1:2 ) = [ 0 max_perf_core ];
%data_peak( 2, 1:2 ) = [ x_end max_perf_core ];
if show_plot == 1
blissup_ln = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
'Color',color_blissup, 'LineStyle',lines_blissup, ...
'LineWidth',linesize );
blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
'LineWidth',linesize );
'Color',color_blissup, 'LineStyle',lines_blissup, ...
'LineWidth',linesize );
blisconv_ln = line( x_axis( 1:np, 1 ), data_blisconv( 1:np, flopscol ) / nth, ...
'Color',color_blisconv, 'LineStyle',lines_blisconv, ...
'LineWidth',linesize );
eigen_ln = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
'Color',color_eigen, 'LineStyle',lines_eigen, ...
'LineWidth',linesize );
'Color',color_eigen, 'LineStyle',lines_eigen, ...
'LineWidth',linesize );
open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
'Color',color_open, 'LineStyle',lines_open, ...
'LineWidth',linesize );
bfeo_ln = line( x_axis( 1:np, 1 ), data_bfeo( 1:np, flopscol ) / nth, ...
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
'LineWidth',linesize );
if has_xsmm == 1
xsmm_ln = line( x_axis( 1:np, 1 ), data_xsmm( 1:np, flopscol ) / nth, ...
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
'LineWidth',linesize );
else
xsmm_ln = line( nan, nan, ...
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
'LineWidth',linesize );
end
'Color',color_open, 'LineStyle',lines_open, ...
'LineWidth',linesize );
vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
'Color',color_vend, 'LineStyle',lines_vend, ...
'LineWidth',linesize );
elseif theid == legend_plot_id
blissup_ln = line( nan, nan, ...
'Color',color_blissup, 'LineStyle',lines_blissup, ...
'LineWidth',linesize );
blislpab_ln = line( nan, nan, ...
'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
'LineWidth',linesize );
eigen_ln = line( nan, nan, ...
'Color',color_eigen, 'LineStyle',lines_eigen, ...
'LineWidth',linesize );
open_ln = line( nan, nan, ...
'Color',color_open, 'LineStyle',lines_open, ...
'LineWidth',linesize );
bfeo_ln = line( nan, nan, ...
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
'LineWidth',linesize );
xsmm_ln = line( nan, nan, ...
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
'LineWidth',linesize );
vend_ln = line( nan, nan, ...
'Color',color_vend, 'LineStyle',lines_vend, ...
'LineWidth',linesize );
'Color',color_vend, 'LineStyle',lines_vend, ...
'LineWidth',linesize );
if has_bfeo == 1
bfeo_ln = line( x_axis( 1:np, 1 ), data_bfeo( 1:np, flopscol ) / nth, ...
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
'LineWidth',linesize );
else
bfeo_ln = line( nan, nan, ...
'Color',color_bfeo, 'LineStyle',lines_bfeo, ...
'LineWidth',linesize );
end
if has_xsmm == 1
xsmm_ln = line( x_axis( 1:np, 1 ), data_xsmm( 1:np, flopscol ) / nth, ...
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
'LineWidth',linesize );
else
xsmm_ln = line( nan, nan, ...
'Color',color_xsmm, 'LineStyle',lines_xsmm, ...
'LineWidth',linesize );
end
@@ -203,71 +197,51 @@ elseif 500 <= x_end && x_end < 1000
xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] );
end
if show_plot == 1 || theid == legend_plot_id
if nth == 1 && theid == legend_plot_id
if has_xsmm == 1
leg = legend( ...
[ ...
blissup_ln ...
blislpab_ln ...
eigen_ln ...
open_ln ...
bfeo_ln ...
xsmm_ln ...
vend_ln ...
], ...
blissup_legend, ...
blislpab_legend, ...
eigen_legend, ...
open_legend, ...
bfeo_legend, ...
xsmm_legend, ...
vend_legend, ...
'Location', legend_loc );
set( leg,'Box','off' );
set( leg,'Color','none' );
set( leg,'Units','inches' );
if impl == 'octave'
set( leg,'FontSize',fontsize );
set( leg,'Position',[15.40 4.75 1.9 1.20] ); % (1,4tl)
else
set( leg,'FontSize',fontsize-3 );
set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl)
end
% xpos ypos
%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
if nth == 1 && theid == legend_plot_id
if has_xsmm == 1
% single-threaded, with libxsmm (ccc)
leg = legend( ...
[ blissup_ln blisconv_ln eigen_ln open_ln vend_ln bfeo_ln xsmm_ln ], ...
blissup_lg, blisconv_lg, eigen_lg, open_lg, vend_lg, bfeo_lg, xsmm_lg, ...
'Location', legend_loc );
set( leg,'Box','off','Color','none','Units','inches' );
if impl == 'octave'
set( leg,'FontSize',fontsize );
set( leg,'Position',[15.35 4.62 1.9 1.20] ); % (1,4tl)
else
leg = legend( ...
[ ...
blissup_ln ...
blislpab_ln ...
eigen_ln ...
open_ln ...
bfeo_ln ...
vend_ln ...
], ...
blissup_legend, ...
blislpab_legend, ...
eigen_legend, ...
open_legend, ...
bfeo_legend, ...
vend_legend, ...
'Location', legend_loc );
set( leg,'Box','off' );
set( leg,'Color','none' );
set( leg,'Units','inches' );
if impl == 'octave'
set( leg,'FontSize',fontsize );
set( leg,'Position',[15.40 7.65 1.9 1.10] ); % (1,4tl)
else
set( leg,'FontSize',fontsize-1 );
set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl)
end
set( leg,'FontSize',fontsize-3 );
set( leg,'Position',[18.20 10.20 1.15 0.7 ] ); % (1,4tl)
end
set( leg,'Box','off' );
set( leg,'Color','none' );
set( leg,'Units','inches' );
% xpos ypos
%set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
elseif nth > 1 && theid == legend_plot_id
else
% single-threaded, without libxsmm (rrr, or other)
leg = legend( ...
[ blissup_ln blisconv_ln eigen_ln open_ln vend_ln bfeo_ln ], ...
blissup_lg, blisconv_lg, eigen_lg, open_lg, vend_lg, bfeo_lg, ...
'Location', legend_loc );
set( leg,'Box','off','Color','none','Units','inches' );
if impl == 'octave'
set( leg,'FontSize',fontsize );
set( leg,'Position',[15.35 7.40 1.9 1.10] ); % (1,4tl)
else
set( leg,'FontSize',fontsize-1 );
set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl)
end
end
elseif nth > 1 && theid == legend_plot_id
% multithreaded
leg = legend( ...
[ blissup_ln blisconv_ln eigen_ln open_ln vend_ln ], ...
blissup_lg, blisconv_lg, eigen_lg, open_lg, vend_lg, ...
'Location', legend_loc );
set( leg,'Box','off','Color','none','Units','inches' );
if impl == 'octave'
set( leg,'FontSize',fontsize );
set( leg,'Position',[18.20 10.30 1.9 0.95] ); % (1,4tl)
else
set( leg,'FontSize',fontsize-1 );
set( leg,'Position',[18.24 10.15 1.15 0.7] ); % (1,4tl)
end
end
@@ -298,28 +272,38 @@ else % impl == 'matlab'
set( titl, 'Position', tpos );
end
sll_str = sprintf( 'm = %u; n = k', smalldims(1) );
lsl_str = sprintf( 'n = %u; m = k', smalldims(2) );
lls_str = sprintf( 'k = %u; m = n', smalldims(3) );
lss_str = sprintf( 'm; n = %u, k = %u', smalldims(2), smalldims(3) );
sls_str = sprintf( 'n; m = %u, k = %u', smalldims(1), smalldims(3) );
ssl_str = sprintf( 'k; m = %u, n = %u', smalldims(1), smalldims(2) );
lll_str = sprintf( 'm = n = k' );
% Place labels on the bottom row of graphs.
if theid > (rows-1)*cols
%xlab = xlabel( ax1,xaxisname );
%tpos = get( xlab, 'Position' )
%tpos(2) = tpos(2) + 10;
%set( xlab, 'Position', tpos );
if theid == rows*cols - 6
xlab = xlabel( ax1, 'm = 6; n = k' );
xlab = xlabel( ax1, sll_str );
elseif theid == rows*cols - 5
xlab = xlabel( ax1, 'n = 8; m = k' );
xlab = xlabel( ax1, lsl_str );
elseif theid == rows*cols - 4
xlab = xlabel( ax1, 'k = 4; m = n' );
xlab = xlabel( ax1, lls_str );
elseif theid == rows*cols - 3
xlab = xlabel( ax1, 'm; n = 8, k = 4' );
xlab = xlabel( ax1, lss_str );
elseif theid == rows*cols - 2
xlab = xlabel( ax1, 'n; m = 6, k = 4' );
xlab = xlabel( ax1, sls_str );
elseif theid == rows*cols - 1
xlab = xlabel( ax1, 'k; m = 6, n = 8' );
xlab = xlabel( ax1, ssl_str );
elseif theid == rows*cols - 0
xlab = xlabel( ax1, 'm = n = k' );
xlab = xlabel( ax1, lll_str );
end
end
% Place labels on the left-hand column of graphs.
if mod(theid-1,cols) == 0
ylab = ylabel( ax1,yaxisname );
end

View File

@@ -37,7 +37,6 @@ filetemp_blissup = '%s/output_%s_%s_blissup.m';
filetemp_blisconv = '%s/output_%s_%s_blisconv.m';
filetemp_eigen = '%s/output_%s_%s_eigen.m';
filetemp_open = '%s/output_%s_%s_openblas.m';
filetemp_bfeo = '%s/output_%s_%s_blasfeo.m';
filetemp_vend = '%s/output_%s_%s_vendor.m';
filetemp_bfeo = '%s/output_%s_%s_blasfeo.m';
filetemp_xsmm = '%s/output_%s_%s_libxsmm.m';
@@ -107,7 +106,6 @@ for opi = 1:n_opsupnames
data_blisconv, ...
data_eigen, ...
data_open, ...
data_bfeo, ...
data_vend, vend_str, ...
data_bfeo, ...
data_xsmm, ...

View File

@@ -1,52 +0,0 @@
function [ r_val1, r_val2 ] = gen_opsupnames( ops, stor, smalldims )
% Build two row-aligned character matrices of test-case names.
%
%   ops        character matrix with one operation name per row.
%   stor       storage string inserted after the operation name.
%   smalldims  vector whose first three entries are the small m, n and k
%              values substituted into the names.
%
%   r_val1     seven rows per operation; each row is the case name
%              left-justified and padded to 20 columns.
%   r_val2     the operation name, repeated once for every row of r_val1.

sm = smalldims( 1 );
sn = smalldims( 2 );
sk = smalldims( 3 );

% Number of shape cases generated for each operation.
n_cases = 7;

row = 0;

for io = 1:size( ops, 1 )

	op = ops( io, : );

	% The seven shape cases, in output order. A 'p' after a dimension
	% letter marks a dimension with no small value substituted.
	labels = { ...
		sprintf( '%s_%s_m%dnpkp',  op, stor, sm ), ...
		sprintf( '%s_%s_mpn%dkp',  op, stor, sn ), ...
		sprintf( '%s_%s_mpnpk%d',  op, stor, sk ), ...
		sprintf( '%s_%s_mpn%dk%d', op, stor, sn, sk ), ...
		sprintf( '%s_%s_m%dnpk%d', op, stor, sm, sk ), ...
		sprintf( '%s_%s_m%dn%dkp', op, stor, sm, sn ), ...
		sprintf( '%s_%s_mpnpkp',   op, stor ) ...
	};

	for c = 1:n_cases
		row = row + 1;
		% Pad to a fixed width so every row of the matrix has equal length.
		opsupnames( row, : ) = sprintf( '%-20s', labels{ c } );
		opnames( row, : )    = sprintf( '%s', op );
	end

end

r_val1 = opsupnames;
r_val2 = opnames;

View File

@@ -1,274 +0,0 @@
function r_val = plot_l3sup_perf( opname, ...
                                  data_blissup, ...
                                  data_blislpab, ...
                                  data_eigen, ...
                                  data_open, ...
                                  data_vend, vend_str, ...
                                  nth, ...
                                  rows, cols, ...
                                  cfreq, ...
                                  dfps, ...
                                  theid, impl )
% PLOT_L3SUP_PERF  Draw one subplot of performance curves for one operation.
%
%   opname         Operation name. Used (with underscores escaped) as the
%                  subplot title; its first character selects the peak
%                  computation ('s' or 'c' doubles dfps).
%   data_blissup, data_blislpab, data_eigen, data_open, data_vend
%                  Result matrices with one row per data point. The last
%                  column of data_blissup gives the column index that holds
%                  the performance values in all five matrices, and the
%                  first of columns 1-3 that differs between rows 1 and 2
%                  of data_blissup is used as the x-axis (problem size).
%   vend_str       Legend label for the data_vend curve.
%   nth            Number of threads; plotted values are divided by nth.
%   rows, cols     Dimensions of the subplot grid.
%   cfreq          Clock rate, multiplied by the flops-per-cycle value to
%                  obtain the top of the y-axis.
%   dfps           Double-precision flops executable per cycle.
%   theid          Index of this subplot within the rows x cols grid.
%   impl           'octave' selects the Octave-specific legend and title
%                  placement; any other value takes the MATLAB path.
%
%   Returns 0.

%if ... %mod(theid-1,cols) == 2 || ...
%   ... %mod(theid-1,cols) == 3 || ...
%   ... %mod(theid-1,cols) == 4 || ...
%   0 == 1 ... %theid >= 19
%    show_plot = 0;
%else
    show_plot = 1;
%end

% Subplot that carries the legend.
%legend_plot_id = 11;
legend_plot_id = 0*cols + 1*6;

if 1
    ax1 = subplot( rows, cols, theid );
    hold( ax1, 'on' );
end

% Set line properties.
color_blissup  = 'k'; lines_blissup  = '-';  markr_blissup  = '';
color_blislpab = 'k'; lines_blislpab = ':';  markr_blislpab = '';
color_eigen    = 'm'; lines_eigen    = '-.'; markr_eigen    = 'o';
color_open     = 'r'; lines_open     = '--'; markr_open     = 'o';
color_vend     = 'b'; lines_vend     = '-.'; markr_vend     = '.';

% Compute the peak performance in terms of the number of double flops
% executable per cycle and the clock rate.
if opname(1) == 's' || opname(1) == 'c'
    flopspercycle = dfps * 2;
else
    flopspercycle = dfps;
end
max_perf_core = (flopspercycle * cfreq) * 1;

% Escape underscores in the title.
title_opname = strrep( opname, '_', '\_' );

% Print the title to a string.
titlename = '%s';
titlename = sprintf( titlename, title_opname );

% Set the legend strings.
blissup_legend  = sprintf( 'BLIS sup' );
blislpab_legend = sprintf( 'BLIS conv' );
eigen_legend    = sprintf( 'Eigen' );
open_legend     = sprintf( 'OpenBLAS' );
%vend_legend     = sprintf( 'MKL' );
%vend_legend     = sprintf( 'ARMPL' );
vend_legend     = vend_str;

% Set axes range values.
y_scale = 1.00;
x_begin = 0;
%x_end is set below.
y_begin = 0;
y_end   = max_perf_core * y_scale;

% Set axes names.
if nth == 1
    yaxisname = 'GFLOPS';
else
    yaxisname = 'GFLOPS/core';
end

% The performance values live in the last column of the data matrices.
%flopscol = 4;
flopscol = size( data_blissup, 2 );
msize = 5;
if 1
    fontsize = 12;
else
    fontsize = 16;
end
linesize = 0.5;
legend_loc = 'southeast';

% --------------------------------------------------------------------

% Automatically detect a column with the increasing problem size.
% Then set the maximum x-axis value.
for psize_col = 1:3
    if data_blissup( 1, psize_col ) ~= data_blissup( 2, psize_col )
        break;
    end
end
x_axis( :, 1 ) = data_blissup( :, psize_col );

% Compute the number of data points we have in the x-axis. Note that we
% only use half the data points for the m = n = k column of graphs.
%if mod(theid-1,cols) == 6
%    np = size( data_blissup, 1 ) / 2;
%else
%    np = size( data_blissup, 1 );
%end
np = size( data_blissup, 1 );

% Grab the last x-axis value.
x_end = data_blissup( np, psize_col );

%data_peak( 1, 1:2 ) = [ 0     max_perf_core ];
%data_peak( 2, 1:2 ) = [ x_end max_perf_core ];

if show_plot == 1
    blissup_ln  = line( x_axis( 1:np, 1 ), data_blissup( 1:np, flopscol ) / nth, ...
                        'Color',color_blissup, 'LineStyle',lines_blissup, ...
                        'LineWidth',linesize );
    blislpab_ln = line( x_axis( 1:np, 1 ), data_blislpab( 1:np, flopscol ) / nth, ...
                        'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
                        'LineWidth',linesize );
    eigen_ln    = line( x_axis( 1:np, 1 ), data_eigen( 1:np, flopscol ) / nth, ...
                        'Color',color_eigen, 'LineStyle',lines_eigen, ...
                        'LineWidth',linesize );
    open_ln     = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
                        'Color',color_open, 'LineStyle',lines_open, ...
                        'LineWidth',linesize );
    vend_ln     = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
                        'Color',color_vend, 'LineStyle',lines_vend, ...
                        'LineWidth',linesize );
elseif theid == legend_plot_id
    % Plot is hidden but must still host the legend: create invisible
    % (NaN) lines so the legend has handles to refer to.
    blissup_ln  = line( nan, nan, ...
                        'Color',color_blissup, 'LineStyle',lines_blissup, ...
                        'LineWidth',linesize );
    blislpab_ln = line( nan, nan, ...
                        'Color',color_blislpab, 'LineStyle',lines_blislpab, ...
                        'LineWidth',linesize );
    eigen_ln    = line( nan, nan, ...
                        'Color',color_eigen, 'LineStyle',lines_eigen, ...
                        'LineWidth',linesize );
    open_ln     = line( nan, nan, ...
                        'Color',color_open, 'LineStyle',lines_open, ...
                        'LineWidth',linesize );
    vend_ln     = line( nan, nan, ...
                        'Color',color_vend, 'LineStyle',lines_vend, ...
                        'LineWidth',linesize );
end

xlim( ax1, [x_begin x_end] );
ylim( ax1, [y_begin y_end] );

% For grid columns 4-6, shrink the y-axis range when running with many
% threads.
if mod(theid-1,cols) == 3 || mod(theid-1,cols) == 4 || mod(theid-1,cols) == 5
    if nth == 12
        ylim( ax1, [y_begin y_end/2] );
    elseif nth > 12
        ylim( ax1, [y_begin y_end/6] );
    end
end

% Choose x-axis tick marks based on the largest problem size.
if     10000 <= x_end && x_end < 15000
    x_tick2 = x_end - 2000;
    x_tick1 = x_tick2/2;
    %xticks( ax1, [ x_tick1 x_tick2 ] );
    xticks( ax1, [ 4000 8000 12000 ] );
elseif 6000 <= x_end && x_end < 10000
    x_tick2 = x_end - 2000;
    x_tick1 = x_tick2/2;
    %xticks( ax1, [ x_tick1 x_tick2 ] );
    xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 4000 <= x_end && x_end < 6000
    x_tick2 = x_end - 1000;
    x_tick1 = x_tick2/2;
    xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 2000 <= x_end && x_end < 3000
    x_tick2 = x_end - 400;
    x_tick1 = x_tick2/2;
    xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 500 <= x_end && x_end < 1000
    x_tick3 = x_end*(3/4);
    x_tick2 = x_end*(2/4);
    x_tick1 = x_end*(1/4);
    xticks( ax1, [ x_tick1 x_tick2 x_tick3 ] );
end

if show_plot == 1 || theid == legend_plot_id
    if theid == legend_plot_id
        leg = legend( ...
        [ ...
          blissup_ln ...
          blislpab_ln ...
          eigen_ln ...
          open_ln ...
          vend_ln ...
        ], ...
        blissup_legend, ...
        blislpab_legend, ...
        eigen_legend, ...
        open_legend, ...
        vend_legend, ...
        'Location', legend_loc );
        set( leg,'Box','off' );
        set( leg,'Color','none' );
        set( leg,'Units','inches' );
        % Use strcmp() rather than ==: an elementwise char comparison
        % errors out whenever impl is not exactly six characters long.
        if strcmp( impl, 'octave' )
            set( leg,'FontSize',fontsize );
            %set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
            set( leg,'Position',[18.80 10.60 1.9 0.95 ] ); % (1,4tl)
        else
            set( leg,'FontSize',fontsize-1 );
            set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
        end
        % NOTE(review): these three repeat the settings made before the
        % position was set; kept as-is in case they are needed after the
        % font/position change -- confirm before removing.
        set( leg,'Box','off' );
        set( leg,'Color','none' );
        set( leg,'Units','inches' );
        %                        xpos ypos
        %set( leg,'Position',[11.32 6.36 1.15 0.7 ] ); % (1,4tl)
    end
end

set( ax1,'FontSize',fontsize );
set( ax1,'TitleFontSizeMultiplier',1.0 ); % default is 1.1.
box( ax1, 'on' );

titl = title( titlename );
set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.

% The default is to align the plot title across whole figure, not the box.
% This is a hack to nudge the title back to the center of the box.
if strcmp( impl, 'octave' )
    tpos = get( titl, 'Position' );
    % For some reason, the titles in the graphs in the last column start
    % off in a different relative position than the graphs in the other
    % columns. Here, we manually account for that.
    if mod(theid-1,cols) == 6
        tpos(1) = tpos(1) + -10;
    else
        tpos(1) = tpos(1) + -40;
    end
    set( titl, 'Position', tpos );
    set( titl, 'FontSize', fontsize );
else % impl == 'matlab'
    tpos = get( titl, 'Position' );
    tpos(1) = tpos(1) + 90;
    set( titl, 'Position', tpos );
end

% Only the bottom row of the grid gets x-axis labels; the label depends on
% which of the last seven subplots this one is.
if theid > (rows-1)*cols
    %xlab = xlabel( ax1,xaxisname );
    %tpos = get( xlab, 'Position' )
    %tpos(2) = tpos(2) + 10;
    %set( xlab, 'Position', tpos );
    if     theid == rows*cols - 6
        xlab = xlabel( ax1, 'm = 6; n = k' );
    elseif theid == rows*cols - 5
        xlab = xlabel( ax1, 'n = 8; m = k' );
    elseif theid == rows*cols - 4
        xlab = xlabel( ax1, 'k = 10; m = n' );
    elseif theid == rows*cols - 3
        xlab = xlabel( ax1, 'm; n = 8, k = 10' );
    elseif theid == rows*cols - 2
        xlab = xlabel( ax1, 'n; m = 6, k = 10' );
    elseif theid == rows*cols - 1
        xlab = xlabel( ax1, 'k; m = 6, n = 8' );
    elseif theid == rows*cols - 0
        xlab = xlabel( ax1, 'm = n = k' );
    end
end

% Only the first column of the grid gets a y-axis label.
if mod(theid-1,cols) == 0
    ylab = ylabel( ax1,yaxisname );
end

r_val = 0;

View File

@@ -1,152 +0,0 @@
function r_val = plot_panel_trxsh ...
     ( ...
       cfreq, ...
       dflopspercycle, ...
       nth, ...
       thr_str, ...
       dt_ch, ...
       stor_str, ...
       smalldims, ...
       dirpath, ...
       arch_str, ...
       vend_str, ...
       impl ...
     )
% PLOT_PANEL_TRXSH  Plot a 4 x 7 panel of gemm results and print it to a file.
%
%   cfreq            Clock rate, forwarded to plot_l3sup_perf.
%   dflopspercycle   Double flops per cycle, forwarded to plot_l3sup_perf.
%   nth              Number of threads; forwarded to plot_l3sup_perf and
%                    embedded in the output file name.
%   thr_str          Threading tag used in the data file names and in the
%                    names of the variables those files define (the driver
%                    scripts pass 'st' or 'mt').
%   dt_ch            Datatype character prefixed to 'gemm' (e.g. 'd').
%   stor_str         Storage string used in the operation names and in the
%                    output file name (e.g. 'rrr').
%   smalldims        Three-element vector passed to gen_opsupnames.
%   dirpath          Directory that contains the output_*.m data files.
%   arch_str         Architecture tag embedded in the output file name.
%   vend_str         Legend label for the vendor library curve.
%   impl             'matlab' or 'octave'; selects paper setup and the
%                    print() invocation.
%
%   Returns 0.

%cfreq = 1.8;
%dflopspercycle = 32;

% Create filename "templates" for the files that contain the performance
% results.
filetemp_blissup  = '%s/output_%s_%s_blissup.m';
filetemp_blislpab = '%s/output_%s_%s_blislpab.m';
filetemp_eigen    = '%s/output_%s_%s_eigen.m';
filetemp_open     = '%s/output_%s_%s_openblas.m';
filetemp_vend     = '%s/output_%s_%s_vendor.m';

% Create a variable name "template" for the variables contained in the
% files outlined above.
vartemp = 'data_%s_%s_%s( :, : )';

% Define the datatypes and operations we will be plotting.
oproot      = sprintf( '%cgemm', dt_ch );
ops( 1, : ) = sprintf( '%s_nn', oproot );
ops( 2, : ) = sprintf( '%s_nt', oproot );
ops( 3, : ) = sprintf( '%s_tn', oproot );
ops( 4, : ) = sprintf( '%s_tt', oproot );

% Generate datatype-specific operation names from the set of operations
% and datatypes.
[ opsupnames, opnames ] = gen_opsupnames( ops, stor_str, smalldims );
n_opsupnames = size( opsupnames, 1 );

%opsupnames
%opnames
%return

if 1 == 1
    %fig = figure('Position', [100, 100, 2400, 1500]);
    fig = figure('Position', [100, 100, 2400, 1200]);
    orient( fig, 'portrait' );
    set(gcf,'PaperUnits', 'inches');
    % Use strcmp() rather than ==: an elementwise char comparison errors
    % out whenever impl is not exactly six characters long.
    if strcmp( impl, 'matlab' )
        set(gcf,'PaperSize', [11.5 20.4]);
        set(gcf,'PaperPosition', [0 0 11.5 20.4]);
        set(gcf,'PaperPositionMode','manual');
    else % impl == 'octave' % octave 4.x
        set(gcf,'PaperSize', [12 21.5]);
        set(gcf,'PaperPositionMode','auto');
    end
    set(gcf,'PaperOrientation','landscape');
end

% Iterate over the list of datatype-specific operation names.
for opi = 1:n_opsupnames
%for opi = 1:1

    % Grab the current datatype combination.
    opsupname = opsupnames( opi, : );
    opname    = opnames( opi, : );

    % Rows of the name matrices are blank-padded; strip the padding.
    opsupname = strtrim( opsupname );
    opname    = strtrim( opname );

    str = sprintf( 'Plotting %2d: %s', opi, opsupname ); disp(str);

    % Construct filenames for the data files from templates.
    file_blissup  = sprintf( filetemp_blissup,  dirpath, thr_str, opsupname );
    file_blislpab = sprintf( filetemp_blislpab, dirpath, thr_str, opsupname );
    file_eigen    = sprintf( filetemp_eigen,    dirpath, thr_str, opsupname );
    file_open     = sprintf( filetemp_open,     dirpath, thr_str, opsupname );
    file_vend     = sprintf( filetemp_vend,     dirpath, thr_str, opsupname );

    % Load the data files.
    %str = sprintf( '  Loading %s', file_blissup ); disp(str);
    run( file_blissup )
    run( file_blislpab )
    run( file_eigen )
    run( file_open )
    run( file_vend )

    % Construct variable names for the variables in the data files.
    var_blissup  = sprintf( vartemp, thr_str, opname, 'blissup' );
    var_blislpab = sprintf( vartemp, thr_str, opname, 'blislpab' );
    var_eigen    = sprintf( vartemp, thr_str, opname, 'eigen' );
    var_open     = sprintf( vartemp, thr_str, opname, 'openblas' );
    var_vend     = sprintf( vartemp, thr_str, opname, 'vendor' );

    % Use eval() to instantiate the variable names constructed above,
    % copying each to a simplified name.
    data_blissup  = eval( var_blissup );  % e.g. data_st_dgemm_blissup( :, : );
    data_blislpab = eval( var_blislpab ); % e.g. data_st_dgemm_blislpab( :, : );
    data_eigen    = eval( var_eigen );    % e.g. data_st_dgemm_eigen( :, : );
    data_open     = eval( var_open );     % e.g. data_st_dgemm_openblas( :, : );
    data_vend     = eval( var_vend );     % e.g. data_st_dgemm_vendor( :, : );

    %str = sprintf( '  Reading %s', var_blissup ); disp(str);
    %str = sprintf( '  Reading %s', var_blislpab ); disp(str);
    %str = sprintf( '  Reading %s', var_eigen ); disp(str);
    %str = sprintf( '  Reading %s', var_open ); disp(str);
    %str = sprintf( '  Reading %s', var_bfeo ); disp(str);
    %str = sprintf( '  Reading %s', var_xsmm ); disp(str);
    %str = sprintf( '  Reading %s', var_vend ); disp(str);

    % Plot one result in an m x n grid of plots, via the subplot()
    % function.
    if 1 == 1
        plot_l3sup_perf( opsupname, ...
                         data_blissup, ...
                         data_blislpab, ...
                         data_eigen, ...
                         data_open, ...
                         data_vend, vend_str, ...
                         nth, ...
                         4, 7, ...
                         cfreq, ...
                         dflopspercycle, ...
                         opi, impl );

        % NOTE(review): this pattern only matches variables loaded with
        % thr_str == 'mt'; data_st_* variables are left in the workspace
        % until overwritten -- confirm whether that is intended.
        clear data_mt_*gemm_*;
        clear data_blissup;
        clear data_blislpab;
        clear data_eigen;
        clear data_open;
        clear data_vend;
    end

end

% Construct the name of the file to which we will output the graph.
outfile = sprintf( 'l3sup_%s_%s_%s_nt%d.pdf', oproot, stor_str, arch_str, nth );

% Output the graph to pdf format.
%print(gcf, 'gemm_md','-fillpage','-dpdf');
%print(gcf, outfile,'-bestfit','-dpdf');
if strcmp( impl, 'octave' )
    print(gcf, outfile);
else % if impl == 'matlab'
    print(gcf, outfile,'-bestfit','-dpdf');
end

% Assign the declared output so that calling this function with an output
% argument does not fail; 0 matches what plot_l3sup_perf returns.
r_val = 0;

View File

@@ -1,8 +0,0 @@
% Driver script: one multithreaded ('mt') panel per architecture.
% Argument order, per the plot_panel_trxsh signature:
%   cfreq, dflopspercycle, nth, thr_str, dt_ch, stor_str, smalldims,
%   dirpath, arch_str, vend_str, impl
% Each call is followed by 'close; clear all;' so the next run starts with
% no open figure and an empty workspace.
% kabylake
plot_panel_trxsh(3.80,16,4,'mt','d','rrr',[ 6 8 10 ],'../results/kabylake/20200302/mnkt100000_mt4','kbl','MKL','octave'); close; clear all;
% haswell
plot_panel_trxsh(3.1,16,12,'mt','d','rrr',[ 6 8 10 ],'../results/haswell/20200302/mnkt100000_mt12','has','MKL','octave'); close; clear all;
% epyc
plot_panel_trxsh(2.55,8,32,'mt','d','rrr',[ 6 8 10 ],'../results/epyc/20200302/mnkt100000_mt32','epyc','MKL','octave'); close; clear all;

View File

@@ -1,8 +0,0 @@
% Driver script: single-threaded ('st', nth = 1) panels, generated for both
% the 'rrr' and 'ccc' storage strings, using the 'matlab' code path.
% Argument order, per the plot_panel_trxsh signature:
%   cfreq, dflopspercycle, nth, thr_str, dt_ch, stor_str, smalldims,
%   dirpath, arch_str, vend_str, impl
% Each call is followed by 'close; clear all;' so the next run starts with
% no open figure and an empty workspace.
% kabylake
plot_panel_trxsh(3.8,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20190619/4_800_4_mt201','kbl','MKL','matlab'); close; clear all;
plot_panel_trxsh(3.8,16,1,'st','d','ccc',[ 6 8 4 ],'../results/kabylake/20190619/4_800_4_mt201','kbl','MKL','matlab'); close; clear all;
% epyc
plot_panel_trxsh(3.0,8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20190619/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;
plot_panel_trxsh(3.0,8,1,'st','d','ccc',[ 6 8 4 ],'../results/epyc/20190619/4_800_4_mt256','epyc','MKL','matlab'); close; clear all;

View File

@@ -1,8 +0,0 @@
% Driver script: one single-threaded ('st', nth = 1) panel per architecture,
% using the 'octave' code path.
% Argument order, per the plot_panel_trxsh signature:
%   cfreq, dflopspercycle, nth, thr_str, dt_ch, stor_str, smalldims,
%   dirpath, arch_str, vend_str, impl
% Each call is followed by 'close; clear all;' so the next run starts with
% no open figure and an empty workspace.
% kabylake
plot_panel_trxsh(3.80,16,1,'st','d','rrr',[ 6 8 4 ],'../results/kabylake/20200302/mnkt100000_st','kbl','MKL','octave'); close; clear all;
% haswell
plot_panel_trxsh(3.5,16,1,'st','d','rrr',[ 6 8 4 ],'../results/haswell/20200302/mnkt100000_st','has','MKL','octave'); close; clear all;
% epyc
plot_panel_trxsh(3.00, 8,1,'st','d','rrr',[ 6 8 4 ],'../results/epyc/20200302/mnkt100000_st','epyc','MKL','octave'); close; clear all;

View File

@@ -9,10 +9,7 @@ function r_val = plot_l3sup_perf( opname, ...
cfreq, ...
dfps, ...
theid, impl )
<<<<<<< HEAD
=======
>>>>>>> Merged test/sup, test/supmt into test/sup.
%if ... %mod(theid-1,cols) == 2 || ...
% ... %mod(theid-1,cols) == 3 || ...
% ... %mod(theid-1,cols) == 4 || ...
@@ -23,19 +20,11 @@ function r_val = plot_l3sup_perf( opname, ...
%end
%legend_plot_id = 11;
<<<<<<< HEAD
legend_plot_id = 0*cols + 1*6;
if 1
ax1 = subplot( rows, cols, theid );
hold( ax1, 'on' );
=======
legend_plot_id = 1*cols + 1*5;
if 1
ax1 = subplot( rows, cols, theid );
hold( ax1, 'on' );
>>>>>>> Merged test/sup, test/supmt into test/sup.
end
% Set line properties.
@@ -89,15 +78,9 @@ end
flopscol = size( data_blissup, 2 );
msize = 5;
if 1
<<<<<<< HEAD
fontsize = 12;
else
fontsize = 16;
=======
fontsize = 11;
else
fontsize = 16;
>>>>>>> Merged test/sup, test/supmt into test/sup.
end
linesize = 0.5;
legend_loc = 'southeast';
@@ -113,7 +96,6 @@ for psize_col = 1:3
end
x_axis( :, 1 ) = data_blissup( :, psize_col );
<<<<<<< HEAD
% Compute the number of data points we have in the x-axis. Note that we
% only use half the data points for the m = n = k column of graphs.
%if mod(theid-1,cols) == 6
@@ -122,15 +104,6 @@ x_axis( :, 1 ) = data_blissup( :, psize_col );
% np = size( data_blissup, 1 );
%end
np = size( data_blissup, 1 );
=======
% Compute the number of data points we have in the x-axis. Note that
% we only use quarter the data points for the m = n = k column of graphs.
if mod(theid-1,cols) == 6
np = size( data_blissup, 1 ) / 4;
else
np = size( data_blissup, 1 );
end
>>>>>>> Merged test/sup, test/supmt into test/sup.
% Grab the last x-axis value.
x_end = data_blissup( np, psize_col );
@@ -154,12 +127,7 @@ open_ln = line( x_axis( 1:np, 1 ), data_open( 1:np, flopscol ) / nth, ...
vend_ln = line( x_axis( 1:np, 1 ), data_vend( 1:np, flopscol ) / nth, ...
'Color',color_vend, 'LineStyle',lines_vend, ...
'LineWidth',linesize );
<<<<<<< HEAD
elseif theid == legend_plot_id
=======
else
if theid == legend_plot_id
>>>>>>> Merged test/sup, test/supmt into test/sup.
blissup_ln = line( nan, nan, ...
'Color',color_blissup, 'LineStyle',lines_blissup, ...
'LineWidth',linesize );
@@ -176,16 +144,11 @@ vend_ln = line( nan, nan, ...
'Color',color_vend, 'LineStyle',lines_vend, ...
'LineWidth',linesize );
end
<<<<<<< HEAD
=======
end
>>>>>>> Merged test/sup, test/supmt into test/sup.
xlim( ax1, [x_begin x_end] );
ylim( ax1, [y_begin y_end] );
<<<<<<< HEAD
if mod(theid-1,cols) == 3 || mod(theid-1,cols) == 4 || mod(theid-1,cols) == 5
if nth == 12
ylim( ax1, [y_begin y_end/2] );
@@ -203,11 +166,6 @@ elseif 6000 <= x_end && x_end < 10000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
%xticks( ax1, [ x_tick1 x_tick2 ] );
=======
if 6000 <= x_end && x_end < 10000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
>>>>>>> Merged test/sup, test/supmt into test/sup.
xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 4000 <= x_end && x_end < 6000
x_tick2 = x_end - 1000;
@@ -244,20 +202,12 @@ if show_plot == 1 || theid == legend_plot_id
set( leg,'Color','none' );
set( leg,'Units','inches' );
if impl == 'octave'
<<<<<<< HEAD
set( leg,'FontSize',fontsize );
%set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
set( leg,'Position',[18.80 10.60 1.9 0.95 ] ); % (1,4tl)
else
set( leg,'FontSize',fontsize-1 );
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
=======
set( leg,'FontSize',fontsize );
set( leg,'Position',[12.50 10.35 1.5 0.9 ] ); % (1,4tl)
else
set( leg,'FontSize',fontsize-1 );
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)
>>>>>>> Merged test/sup, test/supmt into test/sup.
end
set( leg,'Box','off' );
set( leg,'Color','none' );
@@ -274,7 +224,6 @@ box( ax1, 'on' );
titl = title( titlename );
set( titl, 'FontWeight', 'normal' ); % default font style is now 'bold'.
<<<<<<< HEAD
% The default is to align the plot title across whole figure, not the box.
% This is a hack to nudge the title back to the center of the box.
if impl == 'octave'
@@ -300,19 +249,6 @@ if theid > (rows-1)*cols
%tpos = get( xlab, 'Position' )
%tpos(2) = tpos(2) + 10;
%set( xlab, 'Position', tpos );
=======
if impl == 'octave'
tpos = get( titl, 'Position' ); % default is to align across whole figure, not box.
tpos(1) = tpos(1) + -40;
set( titl, 'Position', tpos ); % here we nudge it back to centered with box.
end
if theid > (rows-1)*cols
%xlab = xlabel( ax1,xaxisname );
%tpos = get( xlab, 'Position' )
%tpos(2) = tpos(2) + 10;
%set( xlab, 'Position', tpos );
>>>>>>> Merged test/sup, test/supmt into test/sup.
if theid == rows*cols - 6
xlab = xlabel( ax1, 'm = 6; n = k' );
elseif theid == rows*cols - 5
@@ -331,19 +267,8 @@ if theid > (rows-1)*cols
end
if mod(theid-1,cols) == 0
<<<<<<< HEAD
ylab = ylabel( ax1,yaxisname );
end
=======
ylab = ylabel( ax1,yaxisname );
end
%export_fig( filename, colorflag, '-pdf', '-m2', '-painters', '-transparent' );
%saveas( fig, filename_png );
%hold( ax1, 'off' );
>>>>>>> Merged test/sup, test/supmt into test/sup.
r_val = 0;

View File

@@ -102,7 +102,6 @@ for psize_col = 1:3
end
x_axis( :, 1 ) = data_blissup( :, psize_col );
<<<<<<< HEAD
% Compute the number of data points we have in the x-axis. Note that we
% only use half the data points for the m = n = k column of graphs.
%if mod(theid-1,cols) == 6
@@ -111,15 +110,6 @@ x_axis( :, 1 ) = data_blissup( :, psize_col );
% np = size( data_blissup, 1 );
%end
np = size( data_blissup, 1 );
=======
% Compute the number of data points we have in the x-axis. Note that
% we only use half the data points for the m = n = k column of graphs.
if mod(theid-1,cols) == 6
np = size( data_blissup, 1 ) / 2;
else
np = size( data_blissup, 1 );
end
>>>>>>> Merged test/sup, test/supmt into test/sup.
has_xsmm = 1;
if data_xsmm( 1, flopscol ) == 0.0
@@ -188,7 +178,6 @@ end
xlim( ax1, [x_begin x_end] );
ylim( ax1, [y_begin y_end] );
<<<<<<< HEAD
if 10000 <= x_end && x_end < 15000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
@@ -199,12 +188,6 @@ elseif 6000 <= x_end && x_end < 10000
x_tick1 = x_tick2/2;
%xticks( ax1, [ x_tick1 x_tick2 ] );
xticks( ax1, [ 2000 4000 6000 8000 ] );
=======
if 6000 <= x_end && x_end < 10000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
xticks( ax1, [ x_tick1 x_tick2 ] );
>>>>>>> Merged test/sup, test/supmt into test/sup.
elseif 4000 <= x_end && x_end < 6000
x_tick2 = x_end - 1000;
x_tick1 = x_tick2/2;

View File

@@ -20,7 +20,7 @@ function r_val = plot_l3sup_perf( opname, ...
%end
%legend_plot_id = 11;
legend_plot_id = 0*cols + 1*6;
legend_plot_id = 0*cols + 1*4;
if 1
ax1 = subplot( rows, cols, theid );
@@ -96,14 +96,13 @@ for psize_col = 1:3
end
x_axis( :, 1 ) = data_blissup( :, psize_col );
% Compute the number of data points we have in the x-axis. Note that we
% only use half the data points for the m = n = k column of graphs.
%if mod(theid-1,cols) == 6
% np = size( data_blissup, 1 ) / 2;
%else
% np = size( data_blissup, 1 );
%end
np = size( data_blissup, 1 );
% Compute the number of data points we have in the x-axis. Note that
% we only use quarter the data points for the m = n = k column of graphs.
if mod(theid-1,cols) == 6
np = size( data_blissup, 1 ) / 4;
else
np = size( data_blissup, 1 );
end
% Grab the last x-axis value.
x_end = data_blissup( np, psize_col );
@@ -149,23 +148,9 @@ end
xlim( ax1, [x_begin x_end] );
ylim( ax1, [y_begin y_end] );
if mod(theid-1,cols) == 3 || mod(theid-1,cols) == 4 || mod(theid-1,cols) == 5
if nth == 12
ylim( ax1, [y_begin y_end/2] );
elseif nth > 12
ylim( ax1, [y_begin y_end/6] );
end
end
if 10000 <= x_end && x_end < 15000
if 6000 <= x_end && x_end < 10000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
%xticks( ax1, [ x_tick1 x_tick2 ] );
xticks( ax1, [ 4000 8000 12000 ] );
elseif 6000 <= x_end && x_end < 10000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
%xticks( ax1, [ x_tick1 x_tick2 ] );
xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 4000 <= x_end && x_end < 6000
x_tick2 = x_end - 1000;
@@ -203,8 +188,7 @@ if show_plot == 1 || theid == legend_plot_id
set( leg,'Units','inches' );
if impl == 'octave'
set( leg,'FontSize',fontsize );
%set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
set( leg,'Position',[18.80 10.60 1.9 0.95 ] ); % (1,4tl)
set( leg,'Position',[12.40 10.60 1.9 0.95 ] ); % (1,4tl)
else
set( leg,'FontSize',fontsize-1 );
set( leg,'Position',[18.24 10.15 1.15 0.7 ] ); % (1,4tl)

View File

@@ -102,14 +102,13 @@ for psize_col = 1:3
end
x_axis( :, 1 ) = data_blissup( :, psize_col );
% Compute the number of data points we have in the x-axis. Note that we
% only use half the data points for the m = n = k column of graphs.
%if mod(theid-1,cols) == 6
% np = size( data_blissup, 1 ) / 2;
%else
% np = size( data_blissup, 1 );
%end
np = size( data_blissup, 1 );
% Compute the number of data points we have in the x-axis. Note that
% we only use half the data points for the m = n = k column of graphs.
if mod(theid-1,cols) == 6
np = size( data_blissup, 1 ) / 2;
else
np = size( data_blissup, 1 );
end
has_xsmm = 1;
if data_xsmm( 1, flopscol ) == 0.0
@@ -178,16 +177,10 @@ end
xlim( ax1, [x_begin x_end] );
ylim( ax1, [y_begin y_end] );
if 10000 <= x_end && x_end < 15000
if 6000 <= x_end && x_end < 10000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
%xticks( ax1, [ x_tick1 x_tick2 ] );
xticks( ax1, [ 3000 6000 9000 12000 ] );
elseif 6000 <= x_end && x_end < 10000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
%xticks( ax1, [ x_tick1 x_tick2 ] );
xticks( ax1, [ 2000 4000 6000 8000 ] );
xticks( ax1, [ x_tick1 x_tick2 ] );
elseif 4000 <= x_end && x_end < 6000
x_tick2 = x_end - 1000;
x_tick1 = x_tick2/2;