Code cleanup: spelling corrections

Corrections for spelling and other mistakes in code comments
and doc files.

AMD-Internal: [CPUPL-2870]
Change-Id: Ifbb5df7df2d6312fe73e06ee6d41c00b16c593ce
This commit is contained in:
Edward Smyth
2023-03-31 11:03:43 -04:00
parent 99d10c3f88
commit 6835205ba8
50 changed files with 173 additions and 166 deletions

View File

@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
}
// Extra space since packing does width in multiples of 16. The bf16
// instruction can be used as long as atleast one zmm register can be fully
// loaded; and since k_dim needs to be atleast 2, having n_dim atleast 16
// instruction can be used as long as at least one zmm register can be fully
// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
// should give 2x16=32 elements, enough for 1 zmm register. The padding is
// not rounded to NR (=64), since that would result in memory wastage.
dim_t n_reorder = make_multiple_of_n( n, 16 );

View File

@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s16os16)
}
// Extra space since packing does width in multiples of 16. The vpmaddubsw
// instruction can be used as long as atleast one ymm register can be fully
// loaded; and since k_dim needs to be at least 2, having n_dim atleast 16
// instruction can be used as long as at least one ymm register can be fully
// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
// should give 2x16=32 elements, enough for 1 ymm register. The padding is
// not rounded to NR (=16), since that would result in memory wastage.
dim_t n_reorder = make_multiple_of_n(n, 16);

View File

@@ -69,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s32os32)
}
// Extra space since packing does width in multiples of 16. The vnni
// instruction can be used as long as atleast one zmm register can be fully
// loaded; and since k_dim needs to be atleast 4, having n_dim atleast 16
// instruction can be used as long as at least one zmm register can be fully
// loaded; and since k_dim needs to be at least 4, having n_dim at least 16
// should give 4x16=64 elements, enough for 1 zmm register. The padding is
// not rounded to NR (=64), since that would result in memory wastage.
dim_t n_reorder = make_multiple_of_n( n, 16 );

View File

@@ -1891,7 +1891,7 @@ Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_s
### Operation implementation type query
The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implemenation query](BLISTypedAPI.md#microkernel-implementation-type-query).
The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implementation query](BLISTypedAPI.md#microkernel-implementation-type-query).
```c
char* bli_info_get_gemm_impl_string( num_t dt );
char* bli_info_get_hemm_impl_string( num_t dt );

View File

@@ -143,7 +143,7 @@ void bli_dgemv_unf_var1
conja = bli_extract_conj(transa);
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{
@@ -460,7 +460,7 @@ void bli_sgemv_unf_var1
conja = bli_extract_conj( transa );
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{

View File

@@ -177,7 +177,7 @@ void bli_dgemv_unf_var2
conja = bli_extract_conj( transa );
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{
@@ -247,7 +247,7 @@ void bli_dgemv_unf_var2
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is hadled by scalv internally */
/* beta=0 case is handled by scalv internally */
bli_dscalv_zen_int10
(
@@ -448,7 +448,7 @@ void bli_sgemv_unf_var2
conja = bli_extract_conj( transa );
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{
@@ -516,7 +516,7 @@ void bli_sgemv_unf_var2
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is hadled by scalv internally */
/* beta=0 case is handled by scalv internally */
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
@@ -835,7 +835,7 @@ void bli_cgemv_unf_var2
/* If beta is zero, use setv. Otherwise, scale by beta. */
/* y = beta * y; */
/* beta=0 case is hadled by scalv internally */
/* beta=0 case is handled by scalv internally */
/*bli_cscalv_zen_int10
(
BLIS_NO_CONJUGATE,
@@ -846,7 +846,7 @@ void bli_cgemv_unf_var2
cntx
);*/
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{

View File

@@ -316,7 +316,7 @@ void bli_dhemv_unf_var1
* factor. */
/* Assign kernel function pointer and fusing factor. */
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{

View File

@@ -312,7 +312,7 @@ void bli_dhemv_unf_var3
PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{

View File

@@ -295,7 +295,7 @@ void bli_dtrsv_unf_var1
PASTECH(d,dotxf_ker_ft) kfp_df;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
kfp_df = bli_ddotxf_zen_int_8;
@@ -496,7 +496,7 @@ void bli_strsv_unf_var1
PASTECH(s,dotxf_ker_ft) kfp_df;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
kfp_df = bli_sdotxf_zen_int_8;

View File

@@ -297,7 +297,7 @@ void bli_dtrsv_unf_var2
PASTECH(d,axpyf_ker_ft) kfp_af;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
kfp_af = bli_daxpyf_zen_int_16x4;
@@ -496,7 +496,7 @@ void bli_strsv_unf_var2
PASTECH(s, axpyf_ker_ft) kfp_af;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
kfp_af = bli_saxpyf_zen_int_5;
@@ -695,7 +695,7 @@ void bli_ztrsv_unf_var2
PASTECH(z, axpyf_ker_ft) kfp_af;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
kfp_af = bli_zaxpyf_zen_int_5;
@@ -893,7 +893,7 @@ void bli_ctrsv_unf_var2
PASTECH(c, axpyf_ker_ft) kfp_af;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
kfp_af = bli_caxpyf_zen_int_5;

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -149,7 +149,7 @@ err_t bli_gemm_smart_threading_sup
{
err_t ret_val = BLIS_FAILURE;
// Sanity check, max available threads should be atleast 4 for the
// Sanity check, max available threads should be at least 4 for the
// smart threading/factorization to be meaningful. For nt < 4 the
// default ic,jc factorization holds good.
if ( ( m <= 1 ) || ( n <= 1 ) || ( k <= 1 ) || ( max_available_nt < 4 ) )

View File

@@ -803,7 +803,7 @@ void bli_gemm_md_zgemm
}
{
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -150,7 +151,7 @@ void bli_hemm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -119,7 +120,7 @@ void bli_her2k_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -99,7 +100,7 @@ void bli_herk_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -149,7 +150,7 @@ void bli_symm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -100,7 +101,7 @@ void bli_syr2k_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -94,7 +94,7 @@ void bli_syrk_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -168,7 +168,7 @@ void bli_trmm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -177,7 +177,7 @@ void bli_trmm_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -160,7 +161,7 @@ void bli_trmm3_front
rntm
);
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly

View File

@@ -154,7 +154,7 @@ void bli_trsm_front
// not impact the global cntx object.
cntx_t cntx_trsm = *cntx;
// A sort of hack for communicating the desired pach schemas for A and B
// A sort of hack for communicating the desired pack schemas for A and B
// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion, particularly
@@ -165,11 +165,11 @@ void bli_trsm_front
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (Till we implemente TRSM AVX-512 kernels)
* for TRSM (Till we implement TRSM AVX-512 kernels)
*
* The AVX2 kernels use different block sizes then AVX512 kernels
* Here we override the default block sizes in the context with AVX2
* specific block size used in GEMMTRSM kernerls.
* specific block size used in GEMMTRSM kernels.
*
* We need to revisit this when TRSM AVX-512 kernels are implemented.
*/
@@ -182,7 +182,7 @@ void bli_trsm_front
bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
}
else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
else // if ( bli_cntx_method( cntx_trsm ) != BLIS_NAT )
{
pack_t schema_a = bli_cntx_schema_a_block( &cntx_trsm );
pack_t schema_b = bli_cntx_schema_b_panel( &cntx_trsm );

View File

@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (Till we implemente TRSM AVX-512 kernels)
* for TRSM (Till we implement TRSM AVX-512 kernels)
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible with only AVX2 version of GEMM kernels.

View File

@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (Till we implemente TRSM AVX-512 kernels)
* for TRSM (Till we implement TRSM AVX-512 kernels)
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible with only AVX2 version of GEMM kernels.

View File

@@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (Till we implemente TRSM AVX-512 kernels)
* for TRSM (Till we implement TRSM AVX-512 kernels)
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible with only AVX2 version of GEMM kernels.

View File

@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
/* Zen4 TRSM Fixme:
*
* On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
* for TRSM (Till we implemente TRSM AVX-512 kernels)
* for TRSM (Till we implement TRSM AVX-512 kernels)
*
* The AVX2 kernels for TRSM are enabled in the context, but they
* are compatible with only AVX2 version of GEMM kernels.

View File

@@ -393,7 +393,7 @@ void bli_gks_register_cntx
// At this point, we know the pointer to the array of cntx_t* is NULL and
// needs to be allocated. Allocate the memory and initialize it to
// zeros/NULL, storing the address of the alloacted memory at the element
// zeros/NULL, storing the address of the allocated memory at the element
// for the current architecture id.
gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS );

View File

@@ -167,7 +167,7 @@ f77_int isamax_blis_impl
incx0 = ( inc_t )(*incx);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{

View File

@@ -499,7 +499,7 @@ void caxpy_blis_impl
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -603,7 +603,7 @@ void zaxpy_blis_impl
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{

View File

@@ -162,7 +162,7 @@ void scopy_blis_impl
incy0 = (inc_t)(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -267,7 +267,7 @@ void dcopy_blis_impl
incy0 = (inc_t)(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{

View File

@@ -584,7 +584,7 @@ scomplex cdotu_blis_impl
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -696,7 +696,7 @@ dcomplex zdotu_blis_impl
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -810,7 +810,7 @@ scomplex cdotc_blis_impl
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -922,7 +922,7 @@ dcomplex zdotc_blis_impl
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{

View File

@@ -512,7 +512,7 @@ void dgemm_blis_impl
const inc_t rs_c = 1;
const inc_t cs_c = *ldc;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{
@@ -681,7 +681,7 @@ void dgemm_blis_impl
bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
#ifdef AOCL_DYNAMIC
//For smaller sizes dgemm_small is perfoming better
//For smaller sizes dgemm_small is performing better
if (is_parallel && (((m0 >32) || (n0>32) || (k0>32)) && ((m0+n0+k0)>150)) )
#else
if (is_parallel)

View File

@@ -283,7 +283,7 @@ void dgemv_blis_impl
rs_a = 1;
cs_a = *lda;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{
@@ -482,7 +482,7 @@ void sgemv_blis_impl
rs_a = 1;
cs_a = *lda;
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{

View File

@@ -155,7 +155,7 @@ void sswap_blis_impl
incy0 = ( inc_t )(*incy);
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
/* Call BLIS kernel */
@@ -255,7 +255,7 @@ void dswap_blis_impl
}
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
bli_dswapv_zen_int8

View File

@@ -706,7 +706,7 @@ void strsm_blis_impl
bli_obj_set_struc( struca, &ao );
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -1014,7 +1014,7 @@ void dtrsm_blis_impl
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -1449,7 +1449,7 @@ void ztrsm_blis_impl
bli_obj_set_struc( struca, &ao );
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{
@@ -1817,7 +1817,7 @@ void ctrsm_blis_impl
bli_obj_set_struc( struca, &ao );
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == TRUE)
{

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2023, Advanced Micro Devices, Inc.
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -255,8 +255,8 @@ void bli_samaxv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
/* Store final index to output variable. */
@@ -743,8 +743,8 @@ static void bli_vec_search_double
/*
Issue vzeroupper instruction to clear upper lanes of ymm registers.
This avoids a performance penalty caused by false dependencies when
transitioning from from AVX to SSE instructions (which may occur
as soon as the n_left cleanup loop below if BLIS is compiled with
transitioning from AVX to SSE instructions (which may occur as soon
as the n_left cleanup loop below if BLIS is compiled with
-mfpmath=sse).
*/
_mm256_zeroupper();

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -150,8 +150,8 @@ void bli_saxpbyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -282,8 +282,8 @@ void bli_daxpbyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -634,8 +634,8 @@ void bli_caxpbyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -1063,8 +1063,8 @@ void bli_zaxpbyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -1146,4 +1146,4 @@ void bli_zaxpbyv_zen_int
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
}
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -374,8 +374,8 @@ void bli_saxpbyv_zen_int10
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -680,8 +680,8 @@ void bli_daxpbyv_zen_int10
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -706,4 +706,4 @@ void bli_daxpbyv_zen_int10
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
}
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -138,8 +138,8 @@ void bli_saxpyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -242,8 +242,8 @@ void bli_daxpyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();

View File

@@ -307,9 +307,10 @@ void bli_saxpyv_zen_int10
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
for ( ; (i + 0) < n; i += 1 )
@@ -583,8 +584,8 @@ void bli_daxpyv_zen_int10
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -638,8 +639,8 @@ void bli_caxpyv_zen_int5
float alphaR, alphaI;
//scomplex alpha => aR + aI i
__m256 alphaRv; // for braodcast vector aR (real part of alpha)
__m256 alphaIv; // for braodcast vector aI (imaginary part of alpha)
__m256 alphaRv; // for broadcast vector aR (real part of alpha)
__m256 alphaIv; // for broadcast vector aI (imaginary part of alpha)
__m256 xv[10];
__m256 xShufv[10];
__m256 yv[10];
@@ -837,8 +838,8 @@ void bli_caxpyv_zen_int5
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -947,8 +948,8 @@ void bli_zaxpyv_zen_int5
{
const dim_t n_elem_per_reg = 4;
__m256d alphaRv; // for braodcast vector aR (real part of alpha)
__m256d alphaIv; // for braodcast vector aI (imaginary part of alpha)
__m256d alphaRv; // for broadcast vector aR (real part of alpha)
__m256d alphaIv; // for broadcast vector aI (imaginary part of alpha)
__m256d xv[7]; // Holds the X vector elements
__m256d xShufv[5]; // Holds the permuted X vector elements
__m256d yv[7]; // Holds the y vector elements
@@ -1258,8 +1259,8 @@ void bli_zaxpyv_zen_int5
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
}

View File

@@ -460,8 +460,8 @@ void bli_zcopyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
}
@@ -602,8 +602,8 @@ void bli_zcopyv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -153,8 +153,8 @@ void bli_sdotv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -274,8 +274,8 @@ void bli_ddotv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -222,8 +222,8 @@ void bli_sdotv_zen_int10
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
}
else
@@ -434,8 +434,8 @@ void bli_ddotv_zen_int10
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
}
else
@@ -711,8 +711,8 @@ void bli_cdotv_zen_int5
}
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
}
else
@@ -1000,8 +1000,8 @@ void bli_zdotv_zen_int5
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
}
else

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2016 - 2022, Advanced Micro Devices, Inc.
Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
@@ -181,8 +181,8 @@ void bli_sdotxv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -311,8 +311,8 @@ void bli_ddotxv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -551,8 +551,8 @@ void bli_zdotxv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -800,8 +800,8 @@ void bli_cdotxv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();

View File

@@ -791,8 +791,8 @@ void bli_zdscalv_zen_int10
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
}
@@ -968,8 +968,8 @@ void bli_zscalv_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
/* In double complex data type the computation of

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -539,8 +539,8 @@ void bli_zaxpy2v_zen_int
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// as soon as the n_left cleanup loop below if BLIS is compiled with
// transitioning from AVX to SSE instructions (which may occur as soon
// as the n_left cleanup loop below if BLIS is compiled with
// -mfpmath=sse).
_mm256_zeroupper();
@@ -718,4 +718,4 @@ void bli_zaxpy2v_zen_int
}
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
}
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -717,8 +717,8 @@ void bli_zaxpyf_zen_int_4
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
__m128d a_vec[4], y_vec, inter[2];

View File

@@ -104,7 +104,7 @@ err_t bli_gemm_small
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
return BLIS_NOT_YET_IMPLEMENTED;
#else
// This function is invoked on all architectures including generic.
// This function is invoked on all architectures including 'generic'.
// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
if (bli_cpuid_is_avx2fma3_supported() == FALSE)
{
@@ -360,8 +360,8 @@ static err_t bli_sgemm_small
// This is the part of the pack and compute optimization.
// During the first column iteration, we store the accessed A matrix into
// contiguous static memory. This helps to keep te A matrix in Cache and
// aviods the TLB misses.
// contiguous static memory. This helps to keep the A matrix in Cache and
// avoids the TLB misses.
if (required_packing_A)
{
col_idx = 0;
@@ -1748,7 +1748,7 @@ static err_t bli_sgemm_small
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
"Invalid dimesions for small gemm."
"Invalid dimensions for small gemm."
);
return BLIS_NONCONFORMAL_DIMENSIONS;
}
@@ -1776,7 +1776,7 @@ err_t bli_dgemm_small
gint_t K = bli_obj_width( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .
gint_t L = M * N;
/* if (N<3) //Implemenation assumes that N is atleast 3. VK */
/* if (N<3) //Implementation assumes that N is at least 3. VK */
/* { */
/* AOCL_DTL_TRACE_EXIT_ERR( */
/* AOCL_DTL_LEVEL_INFO, */
@@ -1908,8 +1908,8 @@ err_t bli_dgemm_small
// This is the part of the pack and compute optimization.
// During the first column iteration, we store the accessed A matrix into
// contiguous static memory. This helps to keep te A matrix in Cache and
// aviods the TLB misses.
// contiguous static memory. This helps to keep the A matrix in Cache and
// avoids the TLB misses.
if (required_packing_A)
{
col_idx = 0;
@@ -3339,7 +3339,7 @@ err_t bli_dgemm_small
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
"Invalid dimesions for small gemm."
"Invalid dimensions for small gemm."
);
return BLIS_NONCONFORMAL_DIMENSIONS;
}
@@ -3816,7 +3816,7 @@ static err_t bli_sgemm_small_atbn
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
"Invalid dimesions for small gemm."
"Invalid dimensions for small gemm."
);
return BLIS_NONCONFORMAL_DIMENSIONS;
}
@@ -4256,7 +4256,7 @@ static err_t bli_dgemm_small_atbn
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
"Invalid dimesions for small gemm."
"Invalid dimensions for small gemm."
);
return BLIS_NONCONFORMAL_DIMENSIONS;
}
@@ -4284,7 +4284,7 @@ err_t bli_dgemm_small_At
gint_t K = bli_obj_width_after_trans( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .
if (N<3) //Implemenation assumes that N is atleast 3.
if (N<3) //Implementation assumes that N is at least 3.
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
@@ -5718,7 +5718,7 @@ err_t bli_dgemm_small_At
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
"Invalid dimesions for dgemm_small_At."
"Invalid dimensions for dgemm_small_At."
);
return BLIS_NONCONFORMAL_DIMENSIONS;
}
@@ -5907,8 +5907,8 @@ err_t bli_zgemm_small
/**
* This is the part of the pack and compute optimization.
* During the first column iteration, we store the accessed A
* matrix into contiguous static memory. This helps to keep te A
* matrix in Cache and aviods the TLB misses.
* matrix into contiguous static memory. This helps to keep the A
* matrix in Cache and avoids the TLB misses.
*/
if (required_packing_A)
{
@@ -9704,7 +9704,7 @@ err_t bli_zgemm_small
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
"Invalid dimesions for small gemm."
"Invalid dimensions for small gemm."
);
return BLIS_NONCONFORMAL_DIMENSIONS;
}
@@ -9733,7 +9733,7 @@ err_t bli_zgemm_small_At
gint_t N = bli_obj_width( c ); // number of columns of Matrix C
gint_t K = bli_obj_width_after_trans( a ); // number of columns of OP(A)
if (N<3) //Implemenation assumes that N is atleast 3.
if (N<3) //Implementation assumes that N is at least 3.
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
@@ -13406,7 +13406,7 @@ err_t bli_zgemm_small_At
{
AOCL_DTL_TRACE_EXIT_ERR(
AOCL_DTL_LEVEL_INFO,
"Invalid dimesions for dgemm_small_At."
"Invalid dimensions for dgemm_small_At."
);
return BLIS_NONCONFORMAL_DIMENSIONS;
}

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -627,8 +627,8 @@ void bli_samaxv_zen_int_avx512(
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
/* Store final index to output variable. */
@@ -959,8 +959,8 @@ void bli_damaxv_zen_int_avx512(
// Issue vzeroupper instruction to clear upper lanes of ymm registers.
// This avoids a performance penalty caused by false dependencies when
// transitioning from from AVX to SSE instructions (which may occur
// later, especially if BLIS is compiled with -mfpmath=sse).
// transitioning from AVX to SSE instructions (which may occur later,
// especially if BLIS is compiled with -mfpmath=sse).
_mm256_zeroupper();
// Return value

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -98,7 +98,7 @@ void blx_gemm_front
}
{
// A sort of hack for communicating the desired pach schemas for A and
// A sort of hack for communicating the desired pack schemas for A and
// B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
// bli_l3_cntl_create_if()). This allows us to access the schemas from
// the control tree, which hopefully reduces some confusion,