Code cleanup: spelling corrections

Corrections for some spelling mistakes in comments.

AMD-Internal: [CPUPL-3519]
Change-Id: I9a82518cde6476bc77fc3861a4b9f8729c6380ba
This commit is contained in:
Edward Smyth
2023-11-01 04:55:30 -04:00
committed by Sireesha Sanga
parent 75356d45e5
commit 9500cbee63
30 changed files with 241 additions and 241 deletions

View File

@@ -93,7 +93,7 @@ void bao_l3_thread_decorator
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );

View File

@@ -539,11 +539,11 @@ uint64 AOCL_DTL_get_time_spent(void)
#ifdef AOCL_DTL_AUTO_TRACE_ENABLE
/*
Disable intrumentation for these functions as they will also be
called from compiler generated instumation code to trace
Disable instrumentation for these functions as they will also be
called from compiler generated instrumentation code to trace
function execution.
It needs to be part of declration in the C file so can't be
It needs to be part of declaration in the C file so can't be
moved to header file.
WARNING: These functions are automatically invoked. However, any function

View File

@@ -20,18 +20,18 @@
#endif
// BLIS TODO: This is a workaround to check if BLIS is built with
// openmp support. Ideally we dont' want any library
// openmp support. Ideally we don't want any library
// specific code in dtl.
#include <blis.h>
#if defined(__linux__)
/*
Disable intrumentation for these functions as they will also be
called from compiler generated instumation code to trace
Disable instrumentation for these functions as they will also be
called from compiler generated instrumentation code to trace
function execution.
It needs to be part of declration in the C file so can't be
It needs to be part of declaration in the C file so can't be
moved to header file.
*/

View File

@@ -1873,7 +1873,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
m_off_cblock += mr_cur; \
} \
\
/* Invoke the gemmsup millikerneli for remaining rectangular part. */ \
/* Invoke the gemmsup millikernel for remaining rectangular part. */ \
gemmsup_ker \
( \
conja, \

View File

@@ -2073,7 +2073,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
m_off_cblock += mr_cur; \
} \
\
/* Invoke the gemmsup millikerneli for remaining rectangular part. */ \
/* Invoke the gemmsup millikernel for remaining rectangular part. */ \
gemmsup_ker \
( \
conja, \

View File

@@ -83,7 +83,7 @@ err_t bli_l3_compute_thread_decorator
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
@@ -126,4 +126,4 @@ err_t bli_l3_compute_thread_decorator
return BLIS_SUCCESS;
}
#endif
#endif

View File

@@ -110,7 +110,7 @@ void bli_l3_thread_decorator
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// Check for a somewhat obscure OpenMP thread-mismatch issue.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the

View File

@@ -93,7 +93,7 @@ err_t bli_l3_sup_thread_decorator
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );

View File

@@ -78,7 +78,7 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n,
computediff<T>( storage, m, n, c.data(), c_ref.data(), ldc, thresh );
}
// Test body used for exception value testing, by iducing an exception value
// Test body used for exception value testing, by inducing an exception value
// in the index that is passed for each of the matrices.
/*
(ai, aj) is the index with corresponding exception value aexval in matrix A.
@@ -135,4 +135,4 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n,
// check component-wise error.
//----------------------------------------------------------
computediff<T>( storage, m, n, c.data(), c_ref.data(), ldc, thresh, true );
}
}

View File

@@ -2532,7 +2532,7 @@ err_t bli_dgemm_small
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
ymm2 = _mm256_loadu_pd(tC + 8);
ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 12, maskVec);
ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);
@@ -2545,7 +2545,7 @@ err_t bli_dgemm_small
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
ymm2 = _mm256_loadu_pd(ttC + 8);
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 12, maskVec);
ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);
@@ -2558,7 +2558,7 @@ err_t bli_dgemm_small
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
ymm2 = _mm256_loadu_pd(ttC + 8);
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 12, maskVec);
ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);
@@ -2566,7 +2566,7 @@ err_t bli_dgemm_small
_mm256_storeu_pd(tC, ymm4);
_mm256_storeu_pd(tC + 4, ymm5);
_mm256_storeu_pd(tC + 8, ymm6);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 12, maskVec, ymm7);
tC += ldc;
@@ -2574,7 +2574,7 @@ err_t bli_dgemm_small
_mm256_storeu_pd(tC, ymm8);
_mm256_storeu_pd(tC + 4, ymm9);
_mm256_storeu_pd(tC + 8, ymm10);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 12, maskVec, ymm11);
tC += ldc;
@@ -2582,7 +2582,7 @@ err_t bli_dgemm_small
_mm256_storeu_pd(tC, ymm12);
_mm256_storeu_pd(tC + 4, ymm13);
_mm256_storeu_pd(tC + 8, ymm14);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 12, maskVec, ymm15);
}
n_remainder = N - col_idx;
@@ -2660,7 +2660,7 @@ err_t bli_dgemm_small
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
ymm2 = _mm256_loadu_pd(tC + 8);
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 12, maskVec);
ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);
@@ -2674,7 +2674,7 @@ err_t bli_dgemm_small
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
ymm2 = _mm256_loadu_pd(ttC + 8);
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 12, maskVec);
ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);
@@ -2683,7 +2683,7 @@ err_t bli_dgemm_small
_mm256_storeu_pd(tC + 0, ymm8);
_mm256_storeu_pd(tC + 4, ymm9);
_mm256_storeu_pd(tC + 8, ymm10);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 12, maskVec, ymm11);
tC += ldc;
@@ -2691,7 +2691,7 @@ err_t bli_dgemm_small
_mm256_storeu_pd(tC, ymm12);
_mm256_storeu_pd(tC + 4, ymm13);
_mm256_storeu_pd(tC + 8, ymm14);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 12, maskVec, ymm15);
col_idx += 2;
}
@@ -2755,7 +2755,7 @@ err_t bli_dgemm_small
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
ymm2 = _mm256_loadu_pd(tC + 8);
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 12, maskVec);
ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);
@@ -2764,7 +2764,7 @@ err_t bli_dgemm_small
_mm256_storeu_pd(tC + 0, ymm12);
_mm256_storeu_pd(tC + 4, ymm13);
_mm256_storeu_pd(tC + 8, ymm14);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 12, maskVec, ymm15);
}
}
@@ -2847,7 +2847,7 @@ err_t bli_dgemm_small
ymm2 = _mm256_loadu_pd(tC + 4);
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 8, maskVec);
ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);
@@ -2859,7 +2859,7 @@ err_t bli_dgemm_small
ymm2 = _mm256_loadu_pd(ttC + 4);
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec);
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
@@ -2871,7 +2871,7 @@ err_t bli_dgemm_small
ymm2 = _mm256_loadu_pd(ttC + 4);
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec);
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
@@ -2879,21 +2879,21 @@ err_t bli_dgemm_small
}
_mm256_storeu_pd(tC, ymm4);
_mm256_storeu_pd(tC + 4, ymm5);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 8, maskVec, ymm6);
tC += ldc;
_mm256_storeu_pd(tC, ymm8);
_mm256_storeu_pd(tC + 4, ymm9);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 8, maskVec, ymm10);
tC += ldc;
_mm256_storeu_pd(tC, ymm12);
_mm256_storeu_pd(tC + 4, ymm13);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 8, maskVec, ymm14);
}
n_remainder = N - col_idx;
@@ -2962,7 +2962,7 @@ err_t bli_dgemm_small
ymm2 = _mm256_loadu_pd(tC + 4);
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 8, maskVec);
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
@@ -2975,7 +2975,7 @@ err_t bli_dgemm_small
ymm2 = _mm256_loadu_pd(ttC + 4);
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec);
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
@@ -2983,14 +2983,14 @@ err_t bli_dgemm_small
}
_mm256_storeu_pd(tC + 0, ymm8);
_mm256_storeu_pd(tC + 4, ymm9);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 8, maskVec, ymm10);
tC += ldc;
_mm256_storeu_pd(tC, ymm12);
_mm256_storeu_pd(tC + 4, ymm13);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 8, maskVec, ymm14);
col_idx += 2;
@@ -3050,7 +3050,7 @@ err_t bli_dgemm_small
ymm2 = _mm256_loadu_pd(tC + 4);
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 8, maskVec);
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
@@ -3058,7 +3058,7 @@ err_t bli_dgemm_small
}
_mm256_storeu_pd(tC + 0, ymm12);
_mm256_storeu_pd(tC + 4, ymm13);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 8, maskVec, ymm14);
}
}
@@ -3135,7 +3135,7 @@ err_t bli_dgemm_small
// multiply C by beta and accumulate.
ymm2 = _mm256_loadu_pd(tC);
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 4, maskVec);
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
@@ -3144,7 +3144,7 @@ err_t bli_dgemm_small
double *ttC = tC +ldc;
ymm2 = _mm256_loadu_pd(ttC);
ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec);
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
@@ -3153,25 +3153,25 @@ err_t bli_dgemm_small
ttC += ldc;
ymm2 = _mm256_loadu_pd(ttC);
ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec);
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
}
_mm256_storeu_pd(tC, ymm4);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 4, maskVec, ymm5);
tC += ldc;
_mm256_storeu_pd(tC, ymm8);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 4, maskVec, ymm9);
tC += ldc;
_mm256_storeu_pd(tC, ymm12);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 4, maskVec, ymm13);
}
n_remainder = N - col_idx;
@@ -3231,7 +3231,7 @@ err_t bli_dgemm_small
// multiply C by beta and accumulate.
ymm2 = _mm256_loadu_pd(tC + 0);
ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 4, maskVec);
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
@@ -3241,20 +3241,20 @@ err_t bli_dgemm_small
// multiply C by beta and accumulate.
ymm2 = _mm256_loadu_pd(ttC);
ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec);
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
}
_mm256_storeu_pd(tC + 0, ymm8);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 4, maskVec, ymm9);
tC += ldc;
_mm256_storeu_pd(tC, ymm12);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 4, maskVec, ymm13);
col_idx += 2;
@@ -3305,13 +3305,13 @@ err_t bli_dgemm_small
// multiply C by beta and accumulate.
ymm2 = _mm256_loadu_pd(tC + 0);
ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12);
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC + 4, maskVec);
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
}
_mm256_storeu_pd(tC + 0, ymm12);
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC + 4, maskVec, ymm13);
}
}
@@ -3362,34 +3362,34 @@ err_t bli_dgemm_small
if(is_beta_non_zero)
{
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC, maskVec);
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
double* ttC = tC + ldc;
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC, maskVec);
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
ttC += ldc;
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC, maskVec);
ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);
}
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC, maskVec, ymm4);
tC += ldc;
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC, maskVec, ymm5);
tC += ldc;
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC, maskVec, ymm6);
}
n_remainder = N - col_idx;
@@ -3434,23 +3434,23 @@ err_t bli_dgemm_small
if(is_beta_non_zero)
{
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC, maskVec);
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
double* ttC = tC + ldc;
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(ttC, maskVec);
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
}
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC, maskVec, ymm4);
tC += ldc;
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC, maskVec, ymm5);
col_idx += 2;
@@ -3492,13 +3492,13 @@ err_t bli_dgemm_small
if(is_beta_non_zero)
{
// Masked load the relevant remaider elements of C matrix
// Masked load the relevant remainder elements of C matrix
// Scale by beta.
ymm2 = _mm256_maskload_pd(tC, maskVec);
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
}
// Masked store the relevant remaider elements of C matrix
// Masked store the relevant remainder elements of C matrix
_mm256_maskstore_pd(tC, maskVec, ymm4);
}
}

View File

@@ -151,7 +151,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
__m256i b1 =
_mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
// Seperate register for intermediate op
// Separate register for intermediate op
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -168,7 +168,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -185,7 +185,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
@@ -201,7 +201,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -218,7 +218,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -236,7 +236,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -262,7 +262,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -278,7 +278,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -294,7 +294,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -311,7 +311,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -327,7 +327,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -343,7 +343,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.

View File

@@ -104,7 +104,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -119,7 +119,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -134,7 +134,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -143,7 +143,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0);
c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -167,7 +167,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -182,7 +182,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -197,7 +197,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -212,7 +212,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -697,7 +697,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -706,7 +706,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -729,7 +729,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -744,7 +744,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -1090,7 +1090,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32)
b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -1113,7 +1113,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);

View File

@@ -88,7 +88,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -101,7 +101,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -114,7 +114,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -127,7 +127,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -148,7 +148,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -161,7 +161,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -174,7 +174,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -187,7 +187,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -513,7 +513,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -526,7 +526,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -539,7 +539,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -552,7 +552,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -573,7 +573,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -586,7 +586,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -599,7 +599,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -612,7 +612,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -969,7 +969,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -982,7 +982,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1002,7 +1002,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1015,7 +1015,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1270,7 +1270,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1283,7 +1283,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1303,7 +1303,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1316,7 +1316,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1594,7 +1594,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16)
b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1614,7 +1614,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1836,7 +1836,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1856,7 +1856,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.

View File

@@ -102,7 +102,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -115,7 +115,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -128,7 +128,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -141,7 +141,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -154,7 +154,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -167,7 +167,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -188,7 +188,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -201,7 +201,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -214,7 +214,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -227,7 +227,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -240,7 +240,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -253,7 +253,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -714,7 +714,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -727,7 +727,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -740,7 +740,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -753,7 +753,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -766,7 +766,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -779,7 +779,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -812,7 +812,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -825,7 +825,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -838,7 +838,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -851,7 +851,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -864,7 +864,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
//convert signed int8 to uint8 for u8s8s16 FMA ops
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.

View File

@@ -144,7 +144,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
__m256i b1 =
_mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
// Seperate register for intermediate op
// Separate register for intermediate op
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -158,7 +158,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_int32_0 =
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -172,7 +172,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_int32_0 =
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
@@ -185,7 +185,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_int32_0 =
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -199,7 +199,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_int32_0 =
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -214,7 +214,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_int32_0 =
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -237,7 +237,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
uint8_t a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
__m256i a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -250,7 +250,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -263,7 +263,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -277,7 +277,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -290,7 +290,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -303,7 +303,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.

View File

@@ -95,7 +95,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
// Broadcast a[1,kr:kr+2].
a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -107,7 +107,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
// Broadcast a[2,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -119,7 +119,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
// Broadcast a[3,kr:kr+2].
a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -128,7 +128,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0);
c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -149,7 +149,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -161,7 +161,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_1 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -173,7 +173,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -185,7 +185,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
a_int32_1 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -687,7 +687,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
// Broadcast a[1,kr:kr+2].
a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -696,7 +696,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -716,7 +716,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -728,7 +728,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_1 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
@@ -1080,7 +1080,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
@@ -1100,7 +1100,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);

View File

@@ -82,7 +82,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
// Broadcast a[0,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -92,7 +92,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
// Broadcast a[1,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -102,7 +102,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
// Broadcast a[2,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -112,7 +112,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
// Broadcast a[3,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -130,7 +130,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -140,7 +140,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -150,7 +150,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -160,7 +160,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -497,7 +497,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
// Broadcast a[0,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -507,7 +507,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
// Broadcast a[1,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -517,7 +517,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
// Broadcast a[2,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -527,7 +527,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
// Broadcast a[3,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -545,7 +545,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -555,7 +555,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -565,7 +565,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -575,7 +575,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -957,7 +957,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
// Broadcast a[0,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -967,7 +967,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
// Broadcast a[1,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -984,7 +984,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -994,7 +994,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1253,7 +1253,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
// Broadcast a[0,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1263,7 +1263,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
// Broadcast a[1,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1280,7 +1280,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1290,7 +1290,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1582,7 +1582,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1599,7 +1599,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1824,7 +1824,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
// Broadcast a[0,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -1841,7 +1841,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.

View File

@@ -96,7 +96,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
// Broadcast a[0,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -106,7 +106,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
// Broadcast a[1,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -116,7 +116,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
// Broadcast a[2,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -126,7 +126,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
// Broadcast a[3,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -136,7 +136,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
// Broadcast a[4,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -146,7 +146,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
// Broadcast a[5,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -164,7 +164,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -174,7 +174,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -184,7 +184,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -194,7 +194,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -204,7 +204,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -214,7 +214,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -692,7 +692,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
// Broadcast a[0,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -702,7 +702,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
// Broadcast a[1,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -712,7 +712,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
// Broadcast a[2,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -722,7 +722,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
// Broadcast a[3,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -732,7 +732,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
// Broadcast a[4,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -742,7 +742,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
// Broadcast a[5,kr:kr+2].
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset)));
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -769,7 +769,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -779,7 +779,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -789,7 +789,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -799,7 +799,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.
@@ -809,7 +809,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
a_int32_0 = _mm256_set1_epi8(a_kfringe);
// Seperate register for intermediate op
// Separate register for intermediate op
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
// Perform column direction mat-mul with k = 2.

View File

@@ -278,7 +278,7 @@ void bli_samaxv_zen_int_avx512(
mask.v = _mm512_sub_ps(max_vec_1.v, x_vec_1.v);
// Type cast mask from IEEE754 (float) to integer type
// This operation will not need a new register, its just to convince
// the compiler. But its accounted as seperate register in the
// the compiler. But its accounted as separate register in the
// above calculations
intMask = _mm512_castps_si512(mask.v);
// Extract the signbit and build the mask.
@@ -312,7 +312,7 @@ void bli_samaxv_zen_int_avx512(
mask.v = _mm512_sub_ps(max_vec_2.v, x_vec_2.v);
// Type cast mask from IEEE754 (float) to integer type
// This operation will not need a new register, its just to convince
// the compiler. But its accounted as seperate register in the
// the compiler. But its accounted as separate register in the
// above calculations
intMask = _mm512_castps_si512(mask.v);
// Extract the signbit and build the mask.
@@ -345,7 +345,7 @@ void bli_samaxv_zen_int_avx512(
mask.v = _mm512_sub_ps(max_vec_3.v, x_vec_3.v);
// Type cast mask from IEEE754 (float) to integer type
// This operation will not need a new register, its just to convince
// the compiler. But its accounted as seperate register in the
// the compiler. But its accounted as separate register in the
// above calculations
intMask = _mm512_castps_si512(mask.v);
// Extract the signbit and build the mask.
@@ -397,7 +397,7 @@ void bli_samaxv_zen_int_avx512(
mask.v = _mm512_sub_ps(max_vec_2.v, max_vec_3.v);
// Type cast mask from IEEE754 (float) to integer type
// This operation will not need a new register, its just to convince
// the compiler. But its accounted as seperate register in the
// the compiler. But its accounted as separate register in the
// above calculations
intMask = _mm512_castps_si512(mask.v);
// Extract the signbit and build the mask.
@@ -423,7 +423,7 @@ void bli_samaxv_zen_int_avx512(
mask.v = _mm512_sub_ps(max_vec_1.v, max_vec_2.v);
// Type cast mask from IEEE754 (float) to integer type
// This operation will not need a new register, its just to convince
// the compiler. But its accounted as seperate register in the
// the compiler. But its accounted as separate register in the
// above calculations
intMask = _mm512_castps_si512(mask.v);
// Extract the signbit and build the mask.

View File

@@ -218,7 +218,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) =
/*
* number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31
* number of registers used for load B = 24/8 = 3 zmm0 to zmm2
* number of regusters used for broadcast A = 2 zmm6 and zmm7
* number of registers used for broadcast A = 2 zmm6 and zmm7
*/
void bli_dgemm_zen4_asm_8x24(
dim_t k_,

View File

@@ -156,7 +156,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) =
/*
* number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31
* number of registers used for load B = 24/8 = 3 zmm0 to zmm2
* number of regusters used for broadcast A = 2 zmm6 and zmm7
* number of registers used for broadcast A = 2 zmm6 and zmm7
*/
void bli_dgemmtrsm_l_zen4_asm_8x24
(

View File

@@ -411,7 +411,7 @@ void bli_dgemmtrsm_l_zen_asm_16x14
/*
C prefetch Loop
Note: This loop runs 14 times,
These 14 iterations are done seperately so that c11 can be prefetched here.
These 14 iterations are done separately so that c11 can be prefetched here.
*/
ADD(R11, RSI)
ADD(IMM(14), RSI)

View File

@@ -156,7 +156,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) =
/*
* number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31
* number of registers used for load B = 24/8 = 3 zmm0 to zmm2
* number of regusters used for broadcast A = 2 zmm6 and zmm7
* number of registers used for broadcast A = 2 zmm6 and zmm7
*/
void bli_dgemmtrsm_u_zen4_asm_8x24
(

View File

@@ -407,7 +407,7 @@ void bli_dgemmtrsm_u_zen_asm_16x14
/*
C prefetch Loop
Note: This loop runs 14 times,
These 14 iterations are done seperately so that c11 can be prefetched here.
These 14 iterations are done separately so that c11 can be prefetched here.
*/
ADD(R11, RSI)
ADD(IMM(14), RSI)

View File

@@ -729,7 +729,7 @@ err_t bli_trsm_small_mt_AVX512
// region - GEMM DTRSM for right variants
#define BLIS_DTRSM_SMALL_GEMM_8nx8m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
/*K loop is broken into two seperate loops
/*K loop is broken into two separate loops
each loop computes k/2 iterations */ \
\
int itr = (k_iter / 2); /*itr count for first loop*/\
@@ -900,7 +900,7 @@ err_t bli_trsm_small_mt_AVX512
*/
#define BLIS_DTRSM_SMALL_GEMM_8nx4m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
/*K loop is broken into two seperate loops
/*K loop is broken into two separate loops
each loop computes k/2 iterations */ \
\
int itr = (k_iter / 2); /*itr count for first loop*/\
@@ -979,7 +979,7 @@ err_t bli_trsm_small_mt_AVX512
#define BLIS_DTRSM_SMALL_GEMM_8nx3m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
/*K loop is broken into two seperate loops
/*K loop is broken into two separate loops
each loop computes k/2 iterations */ \
\
int itr = (k_iter / 2); /*itr count for first loop*/\
@@ -1062,7 +1062,7 @@ err_t bli_trsm_small_mt_AVX512
ymm16 = _mm256_add_pd(ymm16, ymm31);
#define BLIS_DTRSM_SMALL_GEMM_8nx2m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
/*K loop is broken into two seperate loops
/*K loop is broken into two separate loops
each loop computes k/2 iterations */ \
\
int itr = (k_iter / 2); /*itr count for first loop*/\
@@ -1142,7 +1142,7 @@ err_t bli_trsm_small_mt_AVX512
ymm16 = _mm256_add_pd(ymm16, ymm31);
#define BLIS_DTRSM_SMALL_GEMM_8nx1m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
/*K loop is broken into two seperate loops
/*K loop is broken into two separate loops
each loop computes k/2 iterations */ \
\
int itr = (k_iter / 2); /*itr count for first loop*/\

View File

@@ -916,7 +916,7 @@ void bli_zgemmsup_cv_zen4_asm_12x4m
const double *v = &value;
// Assigning the type of alpha and beta scaling
// In order to facilitate handling special cases seperately
// In order to facilitate handling special cases separately
char alpha_mul_type = BLIS_MUL_DEFAULT;
char beta_mul_type = BLIS_MUL_DEFAULT;
@@ -1400,7 +1400,7 @@ void bli_zgemmsup_cv_zen4_asm_12x3m
const double *v = &value;
// Assigning the type of alpha and beta scaling
// In order to facilitate handling special cases seperately
// In order to facilitate handling special cases separately
char alpha_mul_type = BLIS_MUL_DEFAULT;
char beta_mul_type = BLIS_MUL_DEFAULT;
@@ -1819,7 +1819,7 @@ void bli_zgemmsup_cv_zen4_asm_12x2m
const double *v = &value;
// Assigning the type of alpha and beta scaling
// In order to facilitate handling special cases seperately
// In order to facilitate handling special cases separately
char alpha_mul_type = BLIS_MUL_DEFAULT;
char beta_mul_type = BLIS_MUL_DEFAULT;
@@ -2224,7 +2224,7 @@ void bli_zgemmsup_cv_zen4_asm_12x1m
*/
// Assigning the type of alpha and beta scaling
// In order to facilitate handling special cases seperately
// In order to facilitate handling special cases separately
char alpha_mul_type = BLIS_MUL_DEFAULT;
char beta_mul_type = BLIS_MUL_DEFAULT;

View File

@@ -92,7 +92,7 @@ void bls_l3_thread_decorator
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// Check for a somewhat obscure OpenMP thread-mismatch issue.
// NOTE: This calls the same function used for the conventional/large
// code path.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );

View File

@@ -164,7 +164,7 @@ void blx_gemm_thread
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
// Check for a somewhat obscure OpenMP thread-mismatch issue.
//bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the

View File

@@ -160,7 +160,7 @@ TEST_OPS := dotv axpyv \
gemm hemm herk her2k trmm trsm \
# Include extension APIs added by AMD in operations list
# Keeping it seperate in case it needs to be guarded by a variable
# Keeping it separate in case it needs to be guarded by a variable
TEST_OPS := $(TEST_OPS) axpbyv cabs1 copyv gemm3m gemm_batch \
gemmt imatcopy omatadd omatcopy omatcopy2 \

View File

@@ -45,7 +45,7 @@
/* Format for FILE input
* For each input set, first line contains 'storage scheme'
* and 'group count' seperated by space.
* and 'group count' separated by space.
* Following 'group_count' number of lines contain all the parameters of
* each group separated by space in each line in the following order:
* tA tB m n k lda ldb ldc alpha_r alpha_i beta_r beta_i group_size