mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Code cleanup: spelling corrections
Corrections for spelling and other mistakes in code comments and doc files. AMD-Internal: [CPUPL-2870] Change-Id: Ifbb5df7df2d6312fe73e06ee6d41c00b16c593ce
This commit is contained in:
@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
|
||||
}
|
||||
|
||||
// Extra space since packing does width in multiples of 16. The bf16
|
||||
// instruction can be used as long as atleast one zmm register can be fully
|
||||
// loaded; and since k_dim needs to be atleast 2, having n_dim atleast 16
|
||||
// instruction can be used as long as at least one zmm register can be fully
|
||||
// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
|
||||
// should give 2x16=32 elements, enough for 1 zmm register. The padding is
|
||||
// not rounded to NR (=64), since that would result in memory wastage.
|
||||
dim_t n_reorder = make_multiple_of_n( n, 16 );
|
||||
|
||||
@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s16os16)
|
||||
}
|
||||
|
||||
// Extra space since packing does width in multiples of 16. The vpmaddubsw
|
||||
// instruction can be used as long as atleast one ymm register can be fully
|
||||
// loaded; and since k_dim needs to be at least 2, having n_dim atleast 16
|
||||
// instruction can be used as long as at least one ymm register can be fully
|
||||
// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
|
||||
// should give 2x16=32 elements, enough for 1 ymm register. The padding is
|
||||
// not rounded to NR (=16), since that would result in memory wastage.
|
||||
dim_t n_reorder = make_multiple_of_n(n, 16);
|
||||
|
||||
@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s32os32)
|
||||
}
|
||||
|
||||
// Extra space since packing does width in multiples of 16. The vnni
|
||||
// instruction can be used as long as atleast one zmm register can be fully
|
||||
// loaded; and since k_dim needs to be atleast 4, having n_dim atleast 16
|
||||
// instruction can be used as long as at least one zmm register can be fully
|
||||
// loaded; and since k_dim needs to be at least 4, having n_dim at least 16
|
||||
// should give 4x16=64 elements, enough for 1 zmm register. The padding is
|
||||
// not rounded to NR (=64), since that would result in memory wastage.
|
||||
dim_t n_reorder = make_multiple_of_n( n, 16 );
|
||||
|
||||
@@ -1891,7 +1891,7 @@ Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_s
|
||||
|
||||
### Operation implementation type query
|
||||
|
||||
The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implemenation query](BLISTypedAPI.md#microkernel-implementation-type-query).
|
||||
The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implementation query](BLISTypedAPI.md#microkernel-implementation-type-query).
|
||||
```c
|
||||
char* bli_info_get_gemm_impl_string( num_t dt );
|
||||
char* bli_info_get_hemm_impl_string( num_t dt );
|
||||
|
||||
@@ -143,7 +143,7 @@ void bli_dgemv_unf_var1
|
||||
|
||||
conja = bli_extract_conj(transa);
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
@@ -460,7 +460,7 @@ void bli_sgemv_unf_var1
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
|
||||
@@ -177,7 +177,7 @@ void bli_dgemv_unf_var2
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
@@ -247,7 +247,7 @@ void bli_dgemv_unf_var2
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
/* beta=0 case is handled by scalv internally */
|
||||
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
@@ -448,7 +448,7 @@ void bli_sgemv_unf_var2
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
@@ -516,7 +516,7 @@ void bli_sgemv_unf_var2
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
/* beta=0 case is handled by scalv internally */
|
||||
bli_sscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
@@ -835,7 +835,7 @@ void bli_cgemv_unf_var2
|
||||
|
||||
/* If beta is zero, use setv. Otherwise, scale by beta. */
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is hadled by scalv internally */
|
||||
/* beta=0 case is handled by scalv internally */
|
||||
/*bli_cscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
@@ -846,7 +846,7 @@ void bli_cgemv_unf_var2
|
||||
cntx
|
||||
);*/
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
|
||||
@@ -316,7 +316,7 @@ void bli_dhemv_unf_var1
|
||||
* factor. */
|
||||
/* Assign kernel function pointer and fusing factor. */
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
|
||||
@@ -312,7 +312,7 @@ void bli_dhemv_unf_var3
|
||||
|
||||
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
|
||||
@@ -295,7 +295,7 @@ void bli_dtrsv_unf_var1
|
||||
|
||||
PASTECH(d,dotxf_ker_ft) kfp_df;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
kfp_df = bli_ddotxf_zen_int_8;
|
||||
@@ -496,7 +496,7 @@ void bli_strsv_unf_var1
|
||||
|
||||
PASTECH(s,dotxf_ker_ft) kfp_df;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
kfp_df = bli_sdotxf_zen_int_8;
|
||||
|
||||
@@ -297,7 +297,7 @@ void bli_dtrsv_unf_var2
|
||||
|
||||
PASTECH(d,axpyf_ker_ft) kfp_af;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
kfp_af = bli_daxpyf_zen_int_16x4;
|
||||
@@ -496,7 +496,7 @@ void bli_strsv_unf_var2
|
||||
|
||||
PASTECH(s, axpyf_ker_ft) kfp_af;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
kfp_af = bli_saxpyf_zen_int_5;
|
||||
@@ -695,7 +695,7 @@ void bli_ztrsv_unf_var2
|
||||
|
||||
PASTECH(z, axpyf_ker_ft) kfp_af;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
kfp_af = bli_zaxpyf_zen_int_5;
|
||||
@@ -893,7 +893,7 @@ void bli_ctrsv_unf_var2
|
||||
|
||||
PASTECH(c, axpyf_ker_ft) kfp_af;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
kfp_af = bli_caxpyf_zen_int_5;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -149,7 +149,7 @@ err_t bli_gemm_smart_threading_sup
|
||||
{
|
||||
err_t ret_val = BLIS_FAILURE;
|
||||
|
||||
// Sanity check, max available threads should be atleast 4 for the
|
||||
// Sanity check, max available threads should be at least 4 for the
|
||||
// smart threading/factorization to be meaningful. For nt < 4 the
|
||||
// default ic,jc factorization holds good.
|
||||
if ( ( m <= 1 ) || ( n <= 1 ) || ( k <= 1 ) || ( max_available_nt < 4 ) )
|
||||
|
||||
@@ -803,7 +803,7 @@ void bli_gemm_md_zgemm
|
||||
}
|
||||
|
||||
{
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -150,7 +151,7 @@ void bli_hemm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -119,7 +120,7 @@ void bli_her2k_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -99,7 +100,7 @@ void bli_herk_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -149,7 +150,7 @@ void bli_symm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -100,7 +101,7 @@ void bli_syr2k_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -94,7 +94,7 @@ void bli_syrk_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -168,7 +168,7 @@ void bli_trmm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -177,7 +177,7 @@ void bli_trmm_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -160,7 +161,7 @@ void bli_trmm3_front
|
||||
rntm
|
||||
);
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
|
||||
@@ -154,7 +154,7 @@ void bli_trsm_front
|
||||
// not impact the global cntx object.
|
||||
cntx_t cntx_trsm = *cntx;
|
||||
|
||||
// A sort of hack for communicating the desired pach schemas for A and B
|
||||
// A sort of hack for communicating the desired pack schemas for A and B
|
||||
// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion, particularly
|
||||
@@ -165,11 +165,11 @@ void bli_trsm_front
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (Till we implemente TRSM AVX-512 kernels)
|
||||
* for TRSM (Till we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels use different block sizes than AVX512 kernels
|
||||
* Here we override the default block sizes in the context with AVX2
|
||||
* specific block size used in GEMMTRSM kernerls.
|
||||
* specific block size used in GEMMTRSM kernels.
|
||||
*
|
||||
* We need to revisit this when TRSM AVX-512 kernels are implemented.
|
||||
*/
|
||||
@@ -182,7 +182,7 @@ void bli_trsm_front
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
|
||||
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
|
||||
}
|
||||
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
|
||||
else // if ( bli_cntx_method( cntx_trsm ) != BLIS_NAT )
|
||||
{
|
||||
pack_t schema_a = bli_cntx_schema_a_block( &cntx_trsm );
|
||||
pack_t schema_b = bli_cntx_schema_b_panel( &cntx_trsm );
|
||||
|
||||
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (Till we implemente TRSM AVX-512 kernels)
|
||||
* for TRSM (Till we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
|
||||
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (Till we implemente TRSM AVX-512 kernels)
|
||||
* for TRSM (Till we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
|
||||
@@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (Till we implemente TRSM AVX-512 kernels)
|
||||
* for TRSM (Till we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
|
||||
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Zen4 TRSM Fixme:
|
||||
*
|
||||
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
|
||||
* for TRSM (Till we implemente TRSM AVX-512 kernels)
|
||||
* for TRSM (Till we implement TRSM AVX-512 kernels)
|
||||
*
|
||||
* The AVX2 kernels for TRSM are enabled in the context, but they
|
||||
* are compatible with only AVX2 version of GEMM kernels.
|
||||
|
||||
@@ -393,7 +393,7 @@ void bli_gks_register_cntx
|
||||
|
||||
// At this point, we know the pointer to the array of cntx_t* is NULL and
|
||||
// needs to be allocated. Allocate the memory and initialize it to
|
||||
// zeros/NULL, storing the address of the alloacted memory at the element
|
||||
// zeros/NULL, storing the address of the allocated memory at the element
|
||||
// for the current architecture id.
|
||||
gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS );
|
||||
|
||||
|
||||
@@ -167,7 +167,7 @@ f77_int isamax_blis_impl
|
||||
incx0 = ( inc_t )(*incx);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
|
||||
@@ -499,7 +499,7 @@ void caxpy_blis_impl
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -603,7 +603,7 @@ void zaxpy_blis_impl
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
|
||||
@@ -162,7 +162,7 @@ void scopy_blis_impl
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -267,7 +267,7 @@ void dcopy_blis_impl
|
||||
incy0 = (inc_t)(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
|
||||
@@ -584,7 +584,7 @@ scomplex cdotu_blis_impl
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -696,7 +696,7 @@ dcomplex zdotu_blis_impl
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -810,7 +810,7 @@ scomplex cdotc_blis_impl
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -922,7 +922,7 @@ dcomplex zdotc_blis_impl
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
|
||||
@@ -512,7 +512,7 @@ void dgemm_blis_impl
|
||||
const inc_t rs_c = 1;
|
||||
const inc_t cs_c = *ldc;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
@@ -681,7 +681,7 @@ void dgemm_blis_impl
|
||||
bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
|
||||
|
||||
#ifdef AOCL_DYNAMIC
|
||||
//For smaller sizes dgemm_small is perfoming better
|
||||
//For smaller sizes dgemm_small is performing better
|
||||
if (is_parallel && (((m0 >32) || (n0>32) || (k0>32)) && ((m0+n0+k0)>150)) )
|
||||
#else
|
||||
if (is_parallel)
|
||||
|
||||
@@ -283,7 +283,7 @@ void dgemv_blis_impl
|
||||
rs_a = 1;
|
||||
cs_a = *lda;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
@@ -482,7 +482,7 @@ void sgemv_blis_impl
|
||||
rs_a = 1;
|
||||
cs_a = *lda;
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
|
||||
@@ -155,7 +155,7 @@ void sswap_blis_impl
|
||||
incy0 = ( inc_t )(*incy);
|
||||
}
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
/* Call BLIS kernel */
|
||||
@@ -255,7 +255,7 @@ void dswap_blis_impl
|
||||
}
|
||||
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
|
||||
bli_dswapv_zen_int8
|
||||
|
||||
@@ -706,7 +706,7 @@ void strsm_blis_impl
|
||||
bli_obj_set_struc( struca, &ao );
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -1014,7 +1014,7 @@ void dtrsm_blis_impl
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -1449,7 +1449,7 @@ void ztrsm_blis_impl
|
||||
bli_obj_set_struc( struca, &ao );
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
@@ -1817,7 +1817,7 @@ void ctrsm_blis_impl
|
||||
bli_obj_set_struc( struca, &ao );
|
||||
|
||||
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
|
||||
{
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2023, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -255,8 +255,8 @@ void bli_samaxv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
/* Store final index to output variable. */
|
||||
@@ -743,8 +743,8 @@ static void bli_vec_search_double
|
||||
/*
|
||||
Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
This avoids a performance penalty caused by false dependencies when
|
||||
transitioning from from AVX to SSE instructions (which may occur
|
||||
as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
transitioning from AVX to SSE instructions (which may occur as soon
|
||||
as the n_left cleanup loop below if BLIS is compiled with
|
||||
-mfpmath=sse).
|
||||
*/
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -150,8 +150,8 @@ void bli_saxpbyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -282,8 +282,8 @@ void bli_daxpbyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -634,8 +634,8 @@ void bli_caxpbyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -1063,8 +1063,8 @@ void bli_zaxpbyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -1146,4 +1146,4 @@ void bli_zaxpbyv_zen_int
|
||||
}
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -374,8 +374,8 @@ void bli_saxpbyv_zen_int10
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -680,8 +680,8 @@ void bli_daxpbyv_zen_int10
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -706,4 +706,4 @@ void bli_daxpbyv_zen_int10
|
||||
}
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -138,8 +138,8 @@ void bli_saxpyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -242,8 +242,8 @@ void bli_daxpyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
|
||||
@@ -307,9 +307,10 @@ void bli_saxpyv_zen_int10
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
|
||||
_mm256_zeroupper();
|
||||
|
||||
for ( ; (i + 0) < n; i += 1 )
|
||||
@@ -583,8 +584,8 @@ void bli_daxpyv_zen_int10
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -638,8 +639,8 @@ void bli_caxpyv_zen_int5
|
||||
float alphaR, alphaI;
|
||||
|
||||
//scomplex alpha => aR + aI i
|
||||
__m256 alphaRv; // for braodcast vector aR (real part of alpha)
|
||||
__m256 alphaIv; // for braodcast vector aI (imaginary part of alpha)
|
||||
__m256 alphaRv; // for broadcast vector aR (real part of alpha)
|
||||
__m256 alphaIv; // for broadcast vector aI (imaginary part of alpha)
|
||||
__m256 xv[10];
|
||||
__m256 xShufv[10];
|
||||
__m256 yv[10];
|
||||
@@ -837,8 +838,8 @@ void bli_caxpyv_zen_int5
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -947,8 +948,8 @@ void bli_zaxpyv_zen_int5
|
||||
{
|
||||
const dim_t n_elem_per_reg = 4;
|
||||
|
||||
__m256d alphaRv; // for braodcast vector aR (real part of alpha)
|
||||
__m256d alphaIv; // for braodcast vector aI (imaginary part of alpha)
|
||||
__m256d alphaRv; // for broadcast vector aR (real part of alpha)
|
||||
__m256d alphaIv; // for broadcast vector aI (imaginary part of alpha)
|
||||
__m256d xv[7]; // Holds the X vector elements
|
||||
__m256d xShufv[5]; // Holds the permuted X vector elements
|
||||
__m256d yv[7]; // Holds the y vector elements
|
||||
@@ -1258,8 +1259,8 @@ void bli_zaxpyv_zen_int5
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
@@ -460,8 +460,8 @@ void bli_zcopyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
@@ -602,8 +602,8 @@ void bli_zcopyv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -153,8 +153,8 @@ void bli_sdotv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -274,8 +274,8 @@ void bli_ddotv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -222,8 +222,8 @@ void bli_sdotv_zen_int10
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
else
|
||||
@@ -434,8 +434,8 @@ void bli_ddotv_zen_int10
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
else
|
||||
@@ -711,8 +711,8 @@ void bli_cdotv_zen_int5
|
||||
}
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
else
|
||||
@@ -1000,8 +1000,8 @@ void bli_zdotv_zen_int5
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
else
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2016 - 2022, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -181,8 +181,8 @@ void bli_sdotxv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -311,8 +311,8 @@ void bli_ddotxv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -551,8 +551,8 @@ void bli_zdotxv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -800,8 +800,8 @@ void bli_cdotxv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
|
||||
@@ -791,8 +791,8 @@ void bli_zdscalv_zen_int10
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
@@ -968,8 +968,8 @@ void bli_zscalv_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
/* In double complex data type the computation of
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2018, The University of Texas at Austin
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -539,8 +539,8 @@ void bli_zaxpy2v_zen_int
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// as soon as the n_left cleanup loop below if BLIS is compiled with
|
||||
// transitioning from AVX to SSE instructions (which may occur as soon
|
||||
// as the n_left cleanup loop below if BLIS is compiled with
|
||||
// -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
@@ -718,4 +718,4 @@ void bli_zaxpy2v_zen_int
|
||||
}
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -717,8 +717,8 @@ void bli_zaxpyf_zen_int_4
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
__m128d a_vec[4], y_vec, inter[2];
|
||||
|
||||
@@ -104,7 +104,7 @@ err_t bli_gemm_small
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
|
||||
return BLIS_NOT_YET_IMPLEMENTED;
|
||||
#else
|
||||
// This function is invoked on all architectures including ‘generic’.
|
||||
// This function is invoked on all architectures including 'generic'.
|
||||
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
|
||||
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
|
||||
{
|
||||
@@ -360,8 +360,8 @@ static err_t bli_sgemm_small
|
||||
|
||||
// This is the part of the pack and compute optimization.
|
||||
// During the first column iteration, we store the accessed A matrix into
|
||||
// contiguous static memory. This helps to keep te A matrix in Cache and
|
||||
// aviods the TLB misses.
|
||||
// contiguous static memory. This helps to keep the A matrix in Cache and
|
||||
// avoids the TLB misses.
|
||||
if (required_packing_A)
|
||||
{
|
||||
col_idx = 0;
|
||||
@@ -1748,7 +1748,7 @@ static err_t bli_sgemm_small
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
"Invalid dimesions for small gemm."
|
||||
"Invalid dimensions for small gemm."
|
||||
);
|
||||
return BLIS_NONCONFORMAL_DIMENSIONS;
|
||||
}
|
||||
@@ -1776,7 +1776,7 @@ err_t bli_dgemm_small
|
||||
gint_t K = bli_obj_width( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .
|
||||
gint_t L = M * N;
|
||||
|
||||
/* if (N<3) //Implemenation assumes that N is atleast 3. VK */
|
||||
/* if (N<3) //Implementation assumes that N is at least 3. VK */
|
||||
/* { */
|
||||
/* AOCL_DTL_TRACE_EXIT_ERR( */
|
||||
/* AOCL_DTL_LEVEL_INFO, */
|
||||
@@ -1908,8 +1908,8 @@ err_t bli_dgemm_small
|
||||
|
||||
// This is the part of the pack and compute optimization.
|
||||
// During the first column iteration, we store the accessed A matrix into
|
||||
// contiguous static memory. This helps to keep te A matrix in Cache and
|
||||
// aviods the TLB misses.
|
||||
// contiguous static memory. This helps to keep the A matrix in Cache and
|
||||
// avoids the TLB misses.
|
||||
if (required_packing_A)
|
||||
{
|
||||
col_idx = 0;
|
||||
@@ -3339,7 +3339,7 @@ err_t bli_dgemm_small
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
"Invalid dimesions for small gemm."
|
||||
"Invalid dimensions for small gemm."
|
||||
);
|
||||
return BLIS_NONCONFORMAL_DIMENSIONS;
|
||||
}
|
||||
@@ -3816,7 +3816,7 @@ static err_t bli_sgemm_small_atbn
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
"Invalid dimesions for small gemm."
|
||||
"Invalid dimensions for small gemm."
|
||||
);
|
||||
return BLIS_NONCONFORMAL_DIMENSIONS;
|
||||
}
|
||||
@@ -4256,7 +4256,7 @@ static err_t bli_dgemm_small_atbn
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
"Invalid dimesions for small gemm."
|
||||
"Invalid dimensions for small gemm."
|
||||
);
|
||||
return BLIS_NONCONFORMAL_DIMENSIONS;
|
||||
}
|
||||
@@ -4284,7 +4284,7 @@ err_t bli_dgemm_small_At
|
||||
gint_t K = bli_obj_width_after_trans( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .
|
||||
|
||||
|
||||
if (N<3) //Implemenation assumes that N is atleast 3.
|
||||
if (N<3) //Implementation assumes that N is at least 3.
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
@@ -5718,7 +5718,7 @@ err_t bli_dgemm_small_At
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
"Invalid dimesions for dgemm_small_At."
|
||||
"Invalid dimensions for dgemm_small_At."
|
||||
);
|
||||
return BLIS_NONCONFORMAL_DIMENSIONS;
|
||||
}
|
||||
@@ -5907,8 +5907,8 @@ err_t bli_zgemm_small
|
||||
/**
|
||||
* This is the part of the pack and compute optimization.
|
||||
* During the first column iteration, we store the accessed A
|
||||
* matrix into contiguous static memory. This helps to keep te A
|
||||
* matrix in Cache and aviods the TLB misses.
|
||||
* matrix into contiguous static memory. This helps to keep the A
|
||||
* matrix in Cache and avoids the TLB misses.
|
||||
*/
|
||||
if (required_packing_A)
|
||||
{
|
||||
@@ -9704,7 +9704,7 @@ err_t bli_zgemm_small
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
"Invalid dimesions for small gemm."
|
||||
"Invalid dimensions for small gemm."
|
||||
);
|
||||
return BLIS_NONCONFORMAL_DIMENSIONS;
|
||||
}
|
||||
@@ -9733,7 +9733,7 @@ err_t bli_zgemm_small_At
|
||||
gint_t N = bli_obj_width( c ); // number of columns of Matrix C
|
||||
gint_t K = bli_obj_width_after_trans( a ); // number of columns of OP(A)
|
||||
|
||||
if (N<3) //Implemenation assumes that N is atleast 3.
|
||||
if (N<3) //Implementation assumes that N is at least 3.
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
@@ -13406,7 +13406,7 @@ err_t bli_zgemm_small_At
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT_ERR(
|
||||
AOCL_DTL_LEVEL_INFO,
|
||||
"Invalid dimesions for dgemm_small_At."
|
||||
"Invalid dimensions for dgemm_small_At."
|
||||
);
|
||||
return BLIS_NONCONFORMAL_DIMENSIONS;
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -627,8 +627,8 @@ void bli_samaxv_zen_int_avx512(
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
/* Store final index to output variable. */
|
||||
@@ -959,8 +959,8 @@ void bli_damaxv_zen_int_avx512(
|
||||
|
||||
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
|
||||
// This avoids a performance penalty caused by false dependencies when
|
||||
// transitioning from from AVX to SSE instructions (which may occur
|
||||
// later, especially if BLIS is compiled with -mfpmath=sse).
|
||||
// transitioning from AVX to SSE instructions (which may occur later,
|
||||
// especially if BLIS is compiled with -mfpmath=sse).
|
||||
_mm256_zeroupper();
|
||||
|
||||
// Return value
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -98,7 +98,7 @@ void blx_gemm_front
|
||||
}
|
||||
|
||||
{
|
||||
// A sort of hack for communicating the desired pach schemas for A and
|
||||
// A sort of hack for communicating the desired pack schemas for A and
|
||||
// B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
|
||||
// bli_l3_cntl_create_if()). This allows us to access the schemas from
|
||||
// the control tree, which hopefully reduces some confusion,
|
||||
|
||||
Reference in New Issue
Block a user