mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-08 07:20:12 +00:00
iqk_mul_mat: use block_q8_0_x4 also for AVX2
This commit is contained in:
@@ -948,7 +948,15 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
||||
}
|
||||
}
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
|
||||
int nb4 = 4*(nb/4);
|
||||
#ifdef __AVX2__
|
||||
const bool pack = true;
|
||||
#else
|
||||
const bool pack = false;
|
||||
#endif
|
||||
for (int i = 0; i < nb; i++) {
|
||||
int i4 = i/4, ir = i%4;
|
||||
// Load elements into 4 AVX vectors
|
||||
__m256 v0 = _mm256_loadu_ps( x );
|
||||
__m256 v1 = _mm256_loadu_ps( x + 8 );
|
||||
@@ -970,7 +978,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
||||
|
||||
// Quantize these floats
|
||||
const float d = maxScalar / 127.f;
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
if (pack && i < nb4) {
|
||||
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
|
||||
} else {
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
}
|
||||
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
||||
const __m256 mul = _mm256_set1_ps( id );
|
||||
|
||||
@@ -1005,7 +1017,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
||||
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
|
||||
i0 = _mm256_permutevar8x32_epi32( i0, perm );
|
||||
|
||||
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
|
||||
if (i < nb4) {
|
||||
_mm256_storeu_si256((__m256i *)y4[i4].qs + ir, i0);
|
||||
} else {
|
||||
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
|
||||
}
|
||||
#else
|
||||
// Since we don't have in AVX some necessary functions,
|
||||
// we split the registers in half and call AVX2 analogs from SSE
|
||||
|
||||
@@ -1749,16 +1749,39 @@ struct UnsignedDot {
|
||||
template <typename Q8, typename Dot> struct Sum4 {
|
||||
Dot dot;
|
||||
inline __m256i compute(const __m256i * qx, const Q8 * y) const {
|
||||
const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
|
||||
const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
|
||||
const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
|
||||
const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
|
||||
const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1
|
||||
const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3
|
||||
return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
|
||||
if constexpr (std::is_same_v<Q8, block_q8_0>) {
|
||||
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y;
|
||||
const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y4->qs+0));
|
||||
const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y4->qs+1));
|
||||
const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y4->qs+2));
|
||||
const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y4->qs+3));
|
||||
const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1
|
||||
const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3
|
||||
return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
|
||||
} else {
|
||||
const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
|
||||
const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
|
||||
const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
|
||||
const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
|
||||
const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1)); // 0,0, 1,1, 0,0, 1,1
|
||||
const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3)); // 2,2, 3,3, 2,2, 3,3
|
||||
return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23)); // 0,1,2,3, 0,1,2,3
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ScaleHelperQ8_0 {
|
||||
inline __m128 prepare4(const block_q8_0 * y) {
|
||||
const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y;
|
||||
return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)y4->d));
|
||||
}
|
||||
inline __m128 prepare4(__m128 other_scales, const block_q8_0 * y) {
|
||||
return _mm_mul_ps(other_scales, prepare4(y));
|
||||
}
|
||||
template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
|
||||
template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
|
||||
};
|
||||
|
||||
struct ScaleHelperQ_0 {
|
||||
ggml_half scales8[4];
|
||||
template <typename Q>
|
||||
@@ -1893,11 +1916,11 @@ void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info
|
||||
Q8<nrc_y, block_q8_0> q8(info);
|
||||
int nb = n/Unpacker::block_size();
|
||||
if (nb%4 == 0) {
|
||||
mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
|
||||
mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ8_0, block_q8_0, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
} else {
|
||||
mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
|
||||
mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ8_0, block_q8_0, nrc_y>(
|
||||
nb, vx, bx, info, q8.y, nrc_x
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user