mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Code cleanup: spelling corrections
Corrections for some spelling mistakes in comments. AMD-Internal: [CPUPL-3519] Change-Id: I9a82518cde6476bc77fc3861a4b9f8729c6380ba
This commit is contained in:
committed by
Sireesha Sanga
parent
75356d45e5
commit
9500cbee63
@@ -93,7 +93,7 @@ void bao_l3_thread_decorator
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
@@ -539,11 +539,11 @@ uint64 AOCL_DTL_get_time_spent(void)
|
||||
#ifdef AOCL_DTL_AUTO_TRACE_ENABLE
|
||||
|
||||
/*
|
||||
Disable intrumentation for these functions as they will also be
|
||||
called from compiler generated instumation code to trace
|
||||
Disable instrumentation for these functions as they will also be
|
||||
called from compiler generated instrumentation code to trace
|
||||
function execution.
|
||||
|
||||
It needs to be part of declration in the C file so can't be
|
||||
It needs to be part of declaration in the C file so can't be
|
||||
moved to header file.
|
||||
|
||||
WARNING: These functions are automatically invoked. However, any function
|
||||
|
||||
@@ -20,18 +20,18 @@
|
||||
#endif
|
||||
|
||||
// BLIS TODO: This is a workaround to check if BLIS is built with
|
||||
// openmp support. Ideally we dont' want any library
|
||||
// openmp support. Ideally we don't want any library
|
||||
// specific code in dtl.
|
||||
#include <blis.h>
|
||||
|
||||
#if defined(__linux__)
|
||||
|
||||
/*
|
||||
Disable intrumentation for these functions as they will also be
|
||||
called from compiler generated instumation code to trace
|
||||
Disable instrumentation for these functions as they will also be
|
||||
called from compiler generated instrumentation code to trace
|
||||
function execution.
|
||||
|
||||
It needs to be part of declration in the C file so can't be
|
||||
It needs to be part of declaration in the C file so can't be
|
||||
moved to header file.
|
||||
|
||||
*/
|
||||
|
||||
@@ -1873,7 +1873,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
m_off_cblock += mr_cur; \
|
||||
} \
|
||||
\
|
||||
/* Invoke the gemmsup millikerneli for remaining rectangular part. */ \
|
||||
/* Invoke the gemmsup millikernel for remaining rectangular part. */ \
|
||||
gemmsup_ker \
|
||||
( \
|
||||
conja, \
|
||||
|
||||
@@ -2073,7 +2073,7 @@ void PASTEMACT(ch,opname,uplo,varname) \
|
||||
m_off_cblock += mr_cur; \
|
||||
} \
|
||||
\
|
||||
/* Invoke the gemmsup millikerneli for remaining rectangular part. */ \
|
||||
/* Invoke the gemmsup millikernel for remaining rectangular part. */ \
|
||||
gemmsup_ker \
|
||||
( \
|
||||
conja, \
|
||||
|
||||
@@ -83,7 +83,7 @@ err_t bli_l3_compute_thread_decorator
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
@@ -126,4 +126,4 @@ err_t bli_l3_compute_thread_decorator
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -110,7 +110,7 @@ void bli_l3_thread_decorator
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
|
||||
@@ -93,7 +93,7 @@ err_t bli_l3_sup_thread_decorator
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
@@ -78,7 +78,7 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n,
|
||||
computediff<T>( storage, m, n, c.data(), c_ref.data(), ldc, thresh );
|
||||
}
|
||||
|
||||
// Test body used for exception value testing, by iducing an exception value
|
||||
// Test body used for exception value testing, by inducing an exception value
|
||||
// in the index that is passed for each of the matrices.
|
||||
/*
|
||||
(ai, aj) is the index with corresponding exception value aexval in matrix A.
|
||||
@@ -135,4 +135,4 @@ void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n,
|
||||
// check component-wise error.
|
||||
//----------------------------------------------------------
|
||||
computediff<T>( storage, m, n, c.data(), c_ref.data(), ldc, thresh, true );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2532,7 +2532,7 @@ err_t bli_dgemm_small
|
||||
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
|
||||
ymm2 = _mm256_loadu_pd(tC + 8);
|
||||
ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 12, maskVec);
|
||||
ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);
|
||||
@@ -2545,7 +2545,7 @@ err_t bli_dgemm_small
|
||||
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
|
||||
ymm2 = _mm256_loadu_pd(ttC + 8);
|
||||
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 12, maskVec);
|
||||
ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);
|
||||
@@ -2558,7 +2558,7 @@ err_t bli_dgemm_small
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
ymm2 = _mm256_loadu_pd(ttC + 8);
|
||||
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 12, maskVec);
|
||||
ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);
|
||||
@@ -2566,7 +2566,7 @@ err_t bli_dgemm_small
|
||||
_mm256_storeu_pd(tC, ymm4);
|
||||
_mm256_storeu_pd(tC + 4, ymm5);
|
||||
_mm256_storeu_pd(tC + 8, ymm6);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 12, maskVec, ymm7);
|
||||
|
||||
tC += ldc;
|
||||
@@ -2574,7 +2574,7 @@ err_t bli_dgemm_small
|
||||
_mm256_storeu_pd(tC, ymm8);
|
||||
_mm256_storeu_pd(tC + 4, ymm9);
|
||||
_mm256_storeu_pd(tC + 8, ymm10);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 12, maskVec, ymm11);
|
||||
|
||||
tC += ldc;
|
||||
@@ -2582,7 +2582,7 @@ err_t bli_dgemm_small
|
||||
_mm256_storeu_pd(tC, ymm12);
|
||||
_mm256_storeu_pd(tC + 4, ymm13);
|
||||
_mm256_storeu_pd(tC + 8, ymm14);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 12, maskVec, ymm15);
|
||||
}
|
||||
n_remainder = N - col_idx;
|
||||
@@ -2660,7 +2660,7 @@ err_t bli_dgemm_small
|
||||
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
|
||||
ymm2 = _mm256_loadu_pd(tC + 8);
|
||||
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 12, maskVec);
|
||||
ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);
|
||||
@@ -2674,7 +2674,7 @@ err_t bli_dgemm_small
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
ymm2 = _mm256_loadu_pd(ttC + 8);
|
||||
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 12, maskVec);
|
||||
ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);
|
||||
@@ -2683,7 +2683,7 @@ err_t bli_dgemm_small
|
||||
_mm256_storeu_pd(tC + 0, ymm8);
|
||||
_mm256_storeu_pd(tC + 4, ymm9);
|
||||
_mm256_storeu_pd(tC + 8, ymm10);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 12, maskVec, ymm11);
|
||||
|
||||
tC += ldc;
|
||||
@@ -2691,7 +2691,7 @@ err_t bli_dgemm_small
|
||||
_mm256_storeu_pd(tC, ymm12);
|
||||
_mm256_storeu_pd(tC + 4, ymm13);
|
||||
_mm256_storeu_pd(tC + 8, ymm14);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 12, maskVec, ymm15);
|
||||
col_idx += 2;
|
||||
}
|
||||
@@ -2755,7 +2755,7 @@ err_t bli_dgemm_small
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
ymm2 = _mm256_loadu_pd(tC + 8);
|
||||
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 12, maskVec);
|
||||
ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);
|
||||
@@ -2764,7 +2764,7 @@ err_t bli_dgemm_small
|
||||
_mm256_storeu_pd(tC + 0, ymm12);
|
||||
_mm256_storeu_pd(tC + 4, ymm13);
|
||||
_mm256_storeu_pd(tC + 8, ymm14);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 12, maskVec, ymm15);
|
||||
}
|
||||
}
|
||||
@@ -2847,7 +2847,7 @@ err_t bli_dgemm_small
|
||||
|
||||
ymm2 = _mm256_loadu_pd(tC + 4);
|
||||
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 8, maskVec);
|
||||
ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);
|
||||
@@ -2859,7 +2859,7 @@ err_t bli_dgemm_small
|
||||
|
||||
ymm2 = _mm256_loadu_pd(ttC + 4);
|
||||
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec);
|
||||
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
|
||||
@@ -2871,7 +2871,7 @@ err_t bli_dgemm_small
|
||||
|
||||
ymm2 = _mm256_loadu_pd(ttC + 4);
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec);
|
||||
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
|
||||
@@ -2879,21 +2879,21 @@ err_t bli_dgemm_small
|
||||
}
|
||||
_mm256_storeu_pd(tC, ymm4);
|
||||
_mm256_storeu_pd(tC + 4, ymm5);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 8, maskVec, ymm6);
|
||||
|
||||
tC += ldc;
|
||||
|
||||
_mm256_storeu_pd(tC, ymm8);
|
||||
_mm256_storeu_pd(tC + 4, ymm9);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 8, maskVec, ymm10);
|
||||
|
||||
tC += ldc;
|
||||
|
||||
_mm256_storeu_pd(tC, ymm12);
|
||||
_mm256_storeu_pd(tC + 4, ymm13);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 8, maskVec, ymm14);
|
||||
}
|
||||
n_remainder = N - col_idx;
|
||||
@@ -2962,7 +2962,7 @@ err_t bli_dgemm_small
|
||||
|
||||
ymm2 = _mm256_loadu_pd(tC + 4);
|
||||
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 8, maskVec);
|
||||
ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);
|
||||
@@ -2975,7 +2975,7 @@ err_t bli_dgemm_small
|
||||
|
||||
ymm2 = _mm256_loadu_pd(ttC + 4);
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 8, maskVec);
|
||||
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
|
||||
@@ -2983,14 +2983,14 @@ err_t bli_dgemm_small
|
||||
}
|
||||
_mm256_storeu_pd(tC + 0, ymm8);
|
||||
_mm256_storeu_pd(tC + 4, ymm9);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 8, maskVec, ymm10);
|
||||
|
||||
tC += ldc;
|
||||
|
||||
_mm256_storeu_pd(tC, ymm12);
|
||||
_mm256_storeu_pd(tC + 4, ymm13);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 8, maskVec, ymm14);
|
||||
|
||||
col_idx += 2;
|
||||
@@ -3050,7 +3050,7 @@ err_t bli_dgemm_small
|
||||
|
||||
ymm2 = _mm256_loadu_pd(tC + 4);
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 8, maskVec);
|
||||
ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);
|
||||
@@ -3058,7 +3058,7 @@ err_t bli_dgemm_small
|
||||
}
|
||||
_mm256_storeu_pd(tC + 0, ymm12);
|
||||
_mm256_storeu_pd(tC + 4, ymm13);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 8, maskVec, ymm14);
|
||||
}
|
||||
}
|
||||
@@ -3135,7 +3135,7 @@ err_t bli_dgemm_small
|
||||
// multiply C by beta and accumulate.
|
||||
ymm2 = _mm256_loadu_pd(tC);
|
||||
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 4, maskVec);
|
||||
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
|
||||
@@ -3144,7 +3144,7 @@ err_t bli_dgemm_small
|
||||
double *ttC = tC +ldc;
|
||||
ymm2 = _mm256_loadu_pd(ttC);
|
||||
ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec);
|
||||
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
|
||||
@@ -3153,25 +3153,25 @@ err_t bli_dgemm_small
|
||||
ttC += ldc;
|
||||
ymm2 = _mm256_loadu_pd(ttC);
|
||||
ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec);
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
}
|
||||
_mm256_storeu_pd(tC, ymm4);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 4, maskVec, ymm5);
|
||||
|
||||
tC += ldc;
|
||||
|
||||
_mm256_storeu_pd(tC, ymm8);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 4, maskVec, ymm9);
|
||||
|
||||
tC += ldc;
|
||||
|
||||
_mm256_storeu_pd(tC, ymm12);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 4, maskVec, ymm13);
|
||||
}
|
||||
n_remainder = N - col_idx;
|
||||
@@ -3231,7 +3231,7 @@ err_t bli_dgemm_small
|
||||
// multiply C by beta and accumulate.
|
||||
ymm2 = _mm256_loadu_pd(tC + 0);
|
||||
ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 4, maskVec);
|
||||
ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);
|
||||
@@ -3241,20 +3241,20 @@ err_t bli_dgemm_small
|
||||
// multiply C by beta and accumulate.
|
||||
ymm2 = _mm256_loadu_pd(ttC);
|
||||
ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC + 4, maskVec);
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
|
||||
}
|
||||
_mm256_storeu_pd(tC + 0, ymm8);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 4, maskVec, ymm9);
|
||||
|
||||
tC += ldc;
|
||||
|
||||
_mm256_storeu_pd(tC, ymm12);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 4, maskVec, ymm13);
|
||||
|
||||
col_idx += 2;
|
||||
@@ -3305,13 +3305,13 @@ err_t bli_dgemm_small
|
||||
// multiply C by beta and accumulate.
|
||||
ymm2 = _mm256_loadu_pd(tC + 0);
|
||||
ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12);
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC + 4, maskVec);
|
||||
ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13);
|
||||
}
|
||||
_mm256_storeu_pd(tC + 0, ymm12);
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC + 4, maskVec, ymm13);
|
||||
}
|
||||
}
|
||||
@@ -3362,34 +3362,34 @@ err_t bli_dgemm_small
|
||||
|
||||
if(is_beta_non_zero)
|
||||
{
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC, maskVec);
|
||||
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
|
||||
|
||||
double* ttC = tC + ldc;
|
||||
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC, maskVec);
|
||||
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
|
||||
|
||||
ttC += ldc;
|
||||
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC, maskVec);
|
||||
ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);
|
||||
}
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC, maskVec, ymm4);
|
||||
|
||||
tC += ldc;
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC, maskVec, ymm5);
|
||||
|
||||
tC += ldc;
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC, maskVec, ymm6);
|
||||
}
|
||||
n_remainder = N - col_idx;
|
||||
@@ -3434,23 +3434,23 @@ err_t bli_dgemm_small
|
||||
|
||||
if(is_beta_non_zero)
|
||||
{
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC, maskVec);
|
||||
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
|
||||
|
||||
double* ttC = tC + ldc;
|
||||
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(ttC, maskVec);
|
||||
ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);
|
||||
}
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC, maskVec, ymm4);
|
||||
|
||||
tC += ldc;
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC, maskVec, ymm5);
|
||||
|
||||
col_idx += 2;
|
||||
@@ -3492,13 +3492,13 @@ err_t bli_dgemm_small
|
||||
|
||||
if(is_beta_non_zero)
|
||||
{
|
||||
// Masked load the relevant remaider elements of C matrix
|
||||
// Masked load the relevant remainder elements of C matrix
|
||||
// Scale by beta.
|
||||
ymm2 = _mm256_maskload_pd(tC, maskVec);
|
||||
ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);
|
||||
|
||||
}
|
||||
// Masked store the relevant remaider elements of C matrix
|
||||
// Masked store the relevant remainder elements of C matrix
|
||||
_mm256_maskstore_pd(tC, maskVec, ymm4);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,7 +151,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
__m256i b1 =
|
||||
_mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -168,7 +168,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -185,7 +185,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
|
||||
@@ -201,7 +201,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -218,7 +218,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -236,7 +236,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -262,7 +262,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -278,7 +278,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -294,7 +294,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -311,7 +311,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -327,7 +327,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -343,7 +343,7 @@ LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
|
||||
@@ -104,7 +104,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -119,7 +119,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -134,7 +134,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -143,7 +143,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0);
|
||||
c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -167,7 +167,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -182,7 +182,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -197,7 +197,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -212,7 +212,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -697,7 +697,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -706,7 +706,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
|
||||
c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
|
||||
c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -729,7 +729,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -744,7 +744,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -1090,7 +1090,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32)
|
||||
b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
|
||||
b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -1113,7 +1113,7 @@ LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -101,7 +101,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -114,7 +114,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -127,7 +127,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -148,7 +148,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -161,7 +161,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -174,7 +174,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -187,7 +187,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -513,7 +513,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -526,7 +526,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -539,7 +539,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -552,7 +552,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -573,7 +573,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -586,7 +586,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -599,7 +599,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -612,7 +612,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -969,7 +969,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -982,7 +982,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1002,7 +1002,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1015,7 +1015,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1270,7 +1270,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1283,7 +1283,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
@@ -1303,7 +1303,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1316,7 +1316,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1594,7 +1594,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16)
|
||||
|
||||
b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1614,7 +1614,7 @@ LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1836,7 +1836,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1856,7 +1856,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
|
||||
@@ -102,7 +102,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -115,7 +115,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -128,7 +128,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -141,7 +141,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -154,7 +154,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -167,7 +167,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -188,7 +188,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -201,7 +201,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -214,7 +214,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -227,7 +227,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -240,7 +240,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -253,7 +253,7 @@ LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -714,7 +714,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -727,7 +727,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -740,7 +740,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -753,7 +753,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -766,7 +766,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -779,7 +779,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -812,7 +812,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -825,7 +825,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -838,7 +838,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -851,7 +851,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -864,7 +864,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
|
||||
//convert signed int8 to uint8 for u8s8s16 FMA ops
|
||||
a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
|
||||
@@ -144,7 +144,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
__m256i b1 =
|
||||
_mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -158,7 +158,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_int32_0 =
|
||||
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -172,7 +172,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_int32_0 =
|
||||
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
|
||||
@@ -185,7 +185,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_int32_0 =
|
||||
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -199,7 +199,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_int32_0 =
|
||||
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -214,7 +214,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_int32_0 =
|
||||
_mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -237,7 +237,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
uint8_t a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
__m256i a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -250,7 +250,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -263,7 +263,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -277,7 +277,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -290,7 +290,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -303,7 +303,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
|
||||
a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
|
||||
@@ -95,7 +95,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -107,7 +107,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
// Broadcast a[2,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -119,7 +119,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
// Broadcast a[3,kr:kr+2].
|
||||
a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -128,7 +128,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0);
|
||||
c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -149,7 +149,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -161,7 +161,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_1 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -173,7 +173,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -185,7 +185,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
|
||||
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_1 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -687,7 +687,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_1 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -696,7 +696,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
|
||||
c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
|
||||
c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -716,7 +716,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -728,7 +728,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_1 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
|
||||
inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
|
||||
|
||||
@@ -1080,7 +1080,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
|
||||
b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
|
||||
b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
@@ -1100,7 +1100,7 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
// Broadcast a[0,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -92,7 +92,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -102,7 +102,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
// Broadcast a[2,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -112,7 +112,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
// Broadcast a[3,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -130,7 +130,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -140,7 +140,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -150,7 +150,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -160,7 +160,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
|
||||
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -497,7 +497,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
// Broadcast a[0,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -507,7 +507,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -517,7 +517,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
// Broadcast a[2,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -527,7 +527,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
// Broadcast a[3,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -545,7 +545,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -555,7 +555,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -565,7 +565,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -575,7 +575,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
|
||||
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -957,7 +957,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
|
||||
// Broadcast a[0,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -967,7 +967,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -984,7 +984,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -994,7 +994,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1253,7 +1253,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
|
||||
// Broadcast a[0,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1263,7 +1263,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 4.
|
||||
@@ -1280,7 +1280,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1290,7 +1290,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1582,7 +1582,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
|
||||
|
||||
b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1599,7 +1599,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1824,7 +1824,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
|
||||
// Broadcast a[0,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -1841,7 +1841,7 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
|
||||
@@ -96,7 +96,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
// Broadcast a[0,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -106,7 +106,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -116,7 +116,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
// Broadcast a[2,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -126,7 +126,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
// Broadcast a[3,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -136,7 +136,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
// Broadcast a[4,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -146,7 +146,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
// Broadcast a[5,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -164,7 +164,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -174,7 +174,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -184,7 +184,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -194,7 +194,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -204,7 +204,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -214,7 +214,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
|
||||
a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -692,7 +692,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
// Broadcast a[0,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 0) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -702,7 +702,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
// Broadcast a[1,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 1) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -712,7 +712,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
// Broadcast a[2,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 2) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -722,7 +722,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
// Broadcast a[3,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 3) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -732,7 +732,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
// Broadcast a[4,kr:kr+2].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 4) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -742,7 +742,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
// Broadcast a[5,kr:kr+4].
|
||||
a_int32_0 = _mm256_set1_epi16(*(uint16_t *)(a + (rs_a * 5) + (cs_a * offset)));
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -769,7 +769,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -779,7 +779,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -789,7 +789,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -799,7 +799,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
@@ -809,7 +809,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
|
||||
a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
|
||||
a_int32_0 = _mm256_set1_epi8(a_kfringe);
|
||||
|
||||
// Seperate register for intermediate op
|
||||
// Separate register for intermediate op
|
||||
inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
|
||||
|
||||
// Perform column direction mat-mul with k = 2.
|
||||
|
||||
@@ -278,7 +278,7 @@ void bli_samaxv_zen_int_avx512(
|
||||
mask.v = _mm512_sub_ps(max_vec_1.v, x_vec_1.v);
|
||||
// Type cast mask from IEEE754 (float) to integer type
|
||||
// This operation will not need a new register, its just to convince
|
||||
// the compiler. But its accounted as seperate register in the
|
||||
// the compiler. But its accounted as separate register in the
|
||||
// above calculations
|
||||
intMask = _mm512_castps_si512(mask.v);
|
||||
// Extract the signbit and build the mask.
|
||||
@@ -312,7 +312,7 @@ void bli_samaxv_zen_int_avx512(
|
||||
mask.v = _mm512_sub_ps(max_vec_2.v, x_vec_2.v);
|
||||
// Type cast mask from IEEE754 (float) to integer type
|
||||
// This operation will not need a new register, its just to convince
|
||||
// the compiler. But its accounted as seperate register in the
|
||||
// the compiler. But its accounted as separate register in the
|
||||
// above calculations
|
||||
intMask = _mm512_castps_si512(mask.v);
|
||||
// Extract the signbit and build the mask.
|
||||
@@ -345,7 +345,7 @@ void bli_samaxv_zen_int_avx512(
|
||||
mask.v = _mm512_sub_ps(max_vec_3.v, x_vec_3.v);
|
||||
// Type cast mask from IEEE754 (float) to integer type
|
||||
// This operation will not need a new register, its just to convince
|
||||
// the compiler. But its accounted as seperate register in the
|
||||
// the compiler. But its accounted as separate register in the
|
||||
// above calculations
|
||||
intMask = _mm512_castps_si512(mask.v);
|
||||
// Extract the signbit and build the mask.
|
||||
@@ -397,7 +397,7 @@ void bli_samaxv_zen_int_avx512(
|
||||
mask.v = _mm512_sub_ps(max_vec_2.v, max_vec_3.v);
|
||||
// Type cast mask from IEEE754 (float) to integer type
|
||||
// This operation will not need a new register, its just to convince
|
||||
// the compiler. But its accounted as seperate register in the
|
||||
// the compiler. But its accounted as separate register in the
|
||||
// above calculations
|
||||
intMask = _mm512_castps_si512(mask.v);
|
||||
// Extract the signbit and build the mask.
|
||||
@@ -423,7 +423,7 @@ void bli_samaxv_zen_int_avx512(
|
||||
mask.v = _mm512_sub_ps(max_vec_1.v, max_vec_2.v);
|
||||
// Type cast mask from IEEE754 (float) to integer type
|
||||
// This operation will not need a new register, its just to convince
|
||||
// the compiler. But its accounted as seperate register in the
|
||||
// the compiler. But its accounted as separate register in the
|
||||
// above calculations
|
||||
intMask = _mm512_castps_si512(mask.v);
|
||||
// Extract the signbit and build the mask.
|
||||
|
||||
@@ -218,7 +218,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) =
|
||||
/*
|
||||
* number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31
|
||||
* number of registers used for load B = 24/8 = 3 zmm0 to zmm2
|
||||
* number of regusters used for broadcast A = 2 zmm6 and zmm7
|
||||
* number of registers used for broadcast A = 2 zmm6 and zmm7
|
||||
*/
|
||||
void bli_dgemm_zen4_asm_8x24(
|
||||
dim_t k_,
|
||||
|
||||
@@ -156,7 +156,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) =
|
||||
/*
|
||||
* number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31
|
||||
* number of registers used for load B = 24/8 = 3 zmm0 to zmm2
|
||||
* number of regusters used for broadcast A = 2 zmm6 and zmm7
|
||||
* number of registers used for broadcast A = 2 zmm6 and zmm7
|
||||
*/
|
||||
void bli_dgemmtrsm_l_zen4_asm_8x24
|
||||
(
|
||||
|
||||
@@ -411,7 +411,7 @@ void bli_dgemmtrsm_l_zen_asm_16x14
|
||||
/*
|
||||
C prefetch Loop
|
||||
Note: This loop runs 14 times,
|
||||
These 14 iterations are done seperately so that c11 can be prefetched here.
|
||||
These 14 iterations are done separately so that c11 can be prefetched here.
|
||||
*/
|
||||
ADD(R11, RSI)
|
||||
ADD(IMM(14), RSI)
|
||||
|
||||
@@ -156,7 +156,7 @@ static int64_t offsets[24] __attribute__((aligned(64))) =
|
||||
/*
|
||||
* number of accumulation registers = 24/8 * 8 = 24 zmm8 to zmm31
|
||||
* number of registers used for load B = 24/8 = 3 zmm0 to zmm2
|
||||
* number of regusters used for broadcast A = 2 zmm6 and zmm7
|
||||
* number of registers used for broadcast A = 2 zmm6 and zmm7
|
||||
*/
|
||||
void bli_dgemmtrsm_u_zen4_asm_8x24
|
||||
(
|
||||
|
||||
@@ -407,7 +407,7 @@ void bli_dgemmtrsm_u_zen_asm_16x14
|
||||
/*
|
||||
C prefetch Loop
|
||||
Note: This loop runs 14 times,
|
||||
These 14 iterations are done seperately so that c11 can be prefetched here.
|
||||
These 14 iterations are done separately so that c11 can be prefetched here.
|
||||
*/
|
||||
ADD(R11, RSI)
|
||||
ADD(IMM(14), RSI)
|
||||
|
||||
@@ -729,7 +729,7 @@ err_t bli_trsm_small_mt_AVX512
|
||||
// region - GEMM DTRSM for right variants
|
||||
|
||||
#define BLIS_DTRSM_SMALL_GEMM_8nx8m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
|
||||
/*K loop is broken into two seperate loops
|
||||
/*K loop is broken into two separate loops
|
||||
each loop computes k/2 iterations */ \
|
||||
\
|
||||
int itr = (k_iter / 2); /*itr count for first loop*/\
|
||||
@@ -900,7 +900,7 @@ err_t bli_trsm_small_mt_AVX512
|
||||
*/
|
||||
|
||||
#define BLIS_DTRSM_SMALL_GEMM_8nx4m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
|
||||
/*K loop is broken into two seperate loops
|
||||
/*K loop is broken into two separate loops
|
||||
each loop computes k/2 iterations */ \
|
||||
\
|
||||
int itr = (k_iter / 2); /*itr count for first loop*/\
|
||||
@@ -979,7 +979,7 @@ err_t bli_trsm_small_mt_AVX512
|
||||
|
||||
|
||||
#define BLIS_DTRSM_SMALL_GEMM_8nx3m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
|
||||
/*K loop is broken into two seperate loops
|
||||
/*K loop is broken into two separate loops
|
||||
each loop computes k/2 iterations */ \
|
||||
\
|
||||
int itr = (k_iter / 2); /*itr count for first loop*/\
|
||||
@@ -1062,7 +1062,7 @@ err_t bli_trsm_small_mt_AVX512
|
||||
ymm16 = _mm256_add_pd(ymm16, ymm31);
|
||||
|
||||
#define BLIS_DTRSM_SMALL_GEMM_8nx2m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
|
||||
/*K loop is broken into two seperate loops
|
||||
/*K loop is broken into two separate loops
|
||||
each loop computes k/2 iterations */ \
|
||||
\
|
||||
int itr = (k_iter / 2); /*itr count for first loop*/\
|
||||
@@ -1142,7 +1142,7 @@ err_t bli_trsm_small_mt_AVX512
|
||||
ymm16 = _mm256_add_pd(ymm16, ymm31);
|
||||
|
||||
#define BLIS_DTRSM_SMALL_GEMM_8nx1m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
|
||||
/*K loop is broken into two seperate loops
|
||||
/*K loop is broken into two separate loops
|
||||
each loop computes k/2 iterations */ \
|
||||
\
|
||||
int itr = (k_iter / 2); /*itr count for first loop*/\
|
||||
|
||||
@@ -916,7 +916,7 @@ void bli_zgemmsup_cv_zen4_asm_12x4m
|
||||
const double *v = &value;
|
||||
|
||||
// Assigning the type of alpha and beta scaling
|
||||
// In order to facilitate handling special cases seperately
|
||||
// In order to facilitate handling special cases separately
|
||||
char alpha_mul_type = BLIS_MUL_DEFAULT;
|
||||
char beta_mul_type = BLIS_MUL_DEFAULT;
|
||||
|
||||
@@ -1400,7 +1400,7 @@ void bli_zgemmsup_cv_zen4_asm_12x3m
|
||||
const double *v = &value;
|
||||
|
||||
// Assigning the type of alpha and beta scaling
|
||||
// In order to facilitate handling special cases seperately
|
||||
// In order to facilitate handling special cases separately
|
||||
char alpha_mul_type = BLIS_MUL_DEFAULT;
|
||||
char beta_mul_type = BLIS_MUL_DEFAULT;
|
||||
|
||||
@@ -1819,7 +1819,7 @@ void bli_zgemmsup_cv_zen4_asm_12x2m
|
||||
const double *v = &value;
|
||||
|
||||
// Assigning the type of alpha and beta scaling
|
||||
// In order to facilitate handling special cases seperately
|
||||
// In order to facilitate handling special cases separately
|
||||
char alpha_mul_type = BLIS_MUL_DEFAULT;
|
||||
char beta_mul_type = BLIS_MUL_DEFAULT;
|
||||
|
||||
@@ -2224,7 +2224,7 @@ void bli_zgemmsup_cv_zen4_asm_12x1m
|
||||
*/
|
||||
|
||||
// Assigning the type of alpha and beta scaling
|
||||
// In order to facilitate handling special cases seperately
|
||||
// In order to facilitate handling special cases separately
|
||||
char alpha_mul_type = BLIS_MUL_DEFAULT;
|
||||
char beta_mul_type = BLIS_MUL_DEFAULT;
|
||||
|
||||
|
||||
@@ -92,7 +92,7 @@ void bls_l3_thread_decorator
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
||||
// NOTE: This calls the same function used for the conventional/large
|
||||
// code path.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
@@ -164,7 +164,7 @@ void blx_gemm_thread
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
||||
//bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
|
||||
@@ -160,7 +160,7 @@ TEST_OPS := dotv axpyv \
|
||||
gemm hemm herk her2k trmm trsm \
|
||||
|
||||
# Include extension API's added by AMD in operations list
|
||||
# Keeping it seperate in case it needs to be guarded by a variable
|
||||
# Keeping it separate in case it needs to be guarded by a variable
|
||||
|
||||
TEST_OPS := $(TEST_OPS) axpbyv cabs1 copyv gemm3m gemm_batch \
|
||||
gemmt imatcopy omatadd omatcopy omatcopy2 \
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
|
||||
/* Format for FILE input
|
||||
* For each input set, first line contains 'storage scheme'
|
||||
* and 'group count' seperated by space.
|
||||
* and 'group count' separated by space.
|
||||
* Following 'group_count' number of lines contains all the parameters of
|
||||
* each group separated by space in each line in the following order:
|
||||
* tA tB m n k lda ldb ldc alpha_r alpha_i beta_r beta_i group_size
|
||||
|
||||
Reference in New Issue
Block a user