New IQ2_KT, IQ3_KT and IQ4_KT, V2 (#529)

* New iq4_kt trellis

The new trellis generates int8_t values via
sum_as_int8_t[(ka * idx + kb) & 0x3f3f3f3f] - 126.
CUDA dequantize works.
AVX2 case Ny > 32 works, and we get 273 t/s for L3-8B.
PPL is on par or even slightly lower than original QTIP trellis.

* Something is not working with the AVX2 dot product

* New iq4_kt: CUDA MMVQ

* New iq4_kt: CUDA MMQ

* For now have only iq4_kt use the new trellis

* Fix iq2_kt that got broken along the way

* New iq4_kt: AVX2 dot product finally works

We get 13.6 t/s vs 8.4 t/s with the f16 trellis and f32 arithmetic.
Still somewhat slower than other quants, but no longer pathetic.

* New iq4_kt: fix vanilla AVX2

* New iq4_kt: NEON implementation

We get very respectable PP-512 = 120 t/s.
TG-128 is pathetic at 5.3 t/s, so 20+% slower than the f16 variant.

* New iq4_kt: slightly faster NEON

* New iq4_kt: slightly faster NEON

* New iq4_kt: faster NEON

We are now at 9.4 t/s, up from 6.6 t/s for the f16 trellis.

* Minor

* New iq4_kt trellis: not working Metal implementation

* Remove the extra 4 bytes of row meta data that is no longer used

* Cleanup

* Adding forgotten file

* Switching iq2_kt to new trellis - CUDA MMQ

* New iq2_kt: CUDA GEMV

* New iq2_kt: AVX2 dequantize

* New iq2_kt: AVX2 GEMM/GEMV

* Adding forgotten file

* New iq2_kt: NEON GEMM/GEMV

* New iq2_kt: slightly faster NEON GEMM

* New iq2_kt: Metal - very slow.

It seems Apple Silicon cannot quickly add 4 8-bit ints.
Or I don't know how to do it - but I didn't find anything
in the Metal Shading Language Specification.
So, performance is quite a bit worse than the original trellis.

* Add missing break

* Trying @louiehelm's multiplier

* CPU

* iq3_kt: use integer trellis + CUDA dequantize and MMVQ

* iq3_kt: MMQ

* iq3_kt: AVX2 GEMM

* iq3_kt: AVX2 GEMV

* The trellis quants now need super-blocks of 256, so we need a check

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-06-18 16:20:54 +03:00
committed by GitHub
parent c410cc72bb
commit d85c64428e
16 changed files with 1668 additions and 132 deletions

View File

@@ -7397,7 +7397,7 @@ void dequantize_row_ms_i2s(const void * vx, float * y, int64_t k) {
}
namespace {
template <int block_size, int group_size, int num_bits, bool is_abs = false>
template <int block_size, int group_size, int num_bits, bool is_abs = false, bool is_int = false>
class QuantizerIQKT {
static_assert(group_size == 8 || group_size == 4);
static_assert(block_size >= 8 && block_size%8 == 0);
@@ -7408,7 +7408,7 @@ public:
constexpr static int kNg = kBlockSize/kGroupSize;
constexpr static int kNblock = kSuperBlockSize/kBlockSize;
constexpr static int kNumVal = 1 << num_bits; // i.e, 16 bits per group of 8
constexpr static float kScale = 31.75f;
constexpr static float kScale = is_int ? 1.f : 31.75f;
constexpr static bool kVerbose = false;
QuantizerIQKT(int num_clusters, int num_neighbours, int offset = 4096);
@@ -7419,17 +7419,32 @@ public:
inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;
static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t kmask = 0x8fff8fff;
constexpr uint32_t km32 = 0x3b603b60;
uint32_t x = i + offset;
for (int k = 0; k < kGroupSize; ++k) {
x = ka*x + kb;
uint32_t s = (x & kmask) ^ km32;
float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
if constexpr (is_abs) result[k] = scale*std::abs(val);
else result[k] = scale*val;
if constexpr (is_int) {
constexpr uint32_t ka = 0xCBAC1FED;
uint32_t s;
auto i8 = (const int8_t *)&s;
for (int k = 0; k < kGroupSize; ++k) {
x = ka*x;
s = x & 0x3f3f3f3f;
if constexpr (is_abs) {
result[k] = scale*std::abs(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
} else {
result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
}
}
} else {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t kmask = 0x8fff8fff;
constexpr uint32_t km32 = 0x3b603b60;
for (int k = 0; k < kGroupSize; ++k) {
x = ka*x + kb;
uint32_t s = (x & kmask) ^ km32;
float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
if constexpr (is_abs) result[k] = scale*std::abs(val);
else result[k] = scale*val;
}
}
}
@@ -7478,14 +7493,15 @@ private:
float m_mid[4*kGroupSize];
};
template <int block_size, int group_size, int num_bits, bool is_abs>
QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
m_values.resize(kNumVal*kGroupSize);
float * data = m_values.data();
for (int i = 0; i < kNumVal; ++i) {
set_values(i, data, kScale, offset);
data += kGroupSize;
}
if (num_clusters == 0) return;
// Make 128 clusters.
// Note: we get a slightly better result by using 64 clusters
// at the expense of almost doubling the quantization time.
@@ -7494,8 +7510,8 @@ QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_c
m_in_cluster = finalize_clusters(num_neighbours, m_values, m_clusters, m_c_values);
}
template <int block_size, int group_size, int num_bits, bool is_abs>
std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_scale(
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_scale(
const float * xb, const float * weight, const int * best_idx) const {
float sumqx = 0, sumq2 = 0;
#ifdef __AVX2__
@@ -7527,8 +7543,8 @@ std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>:
return sumq2 > 0 ? std::make_pair(sumqx/sumq2, sumqx*sumqx/sumq2) : std::make_pair(0.f, 0.f);
}
template <int block_size, int group_size, int num_bits, bool is_abs>
float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse_scale(
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
float QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_inverse_scale(
const float * xb, const float * weight, const int * best_idx) const {
float sumqx = 0, sumx2 = 0;
#ifdef __AVX2__
@@ -7560,8 +7576,8 @@ float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse
return sumx2 > 0 ? sumqx/sumx2 : 0.f;
}
template <int block_size, int group_size, int num_bits, bool is_abs>
void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
void QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
if (!d) {
std::memset(best_idx, 0, kNg*sizeof(int));
return;
@@ -7739,8 +7755,8 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
#endif
}
template <int block_size, int group_size, int num_bits, bool is_abs>
std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::finalize_clusters(int num_neighbours,
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::finalize_clusters(int num_neighbours,
const std::vector<float>& values, const std::vector<float>& clusters, std::vector<std::vector<float>>& c_values) {
int ncluster = clusters.size()/kGroupSize;
std::vector<std::vector<int>> p_in_cluster(ncluster);
@@ -7826,8 +7842,8 @@ std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is
return p_in_cluster;
}
template <int block_size, int group_size, int num_bits, bool is_abs>
std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
constexpr int ndim = kGroupSize;
GGML_ASSERT(points.size() % ndim == 0);
int npoint = points.size() / ndim;
@@ -7995,7 +8011,7 @@ std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::clus
// ========================================== iq2_kt ====================================================
using QuantizerIQ2KT = QuantizerIQKT<32, 8, 16>;
using QuantizerIQ2KT = QuantizerIQKT<32, 8, 16, false, true>;
const QuantizerIQ2KT& iq2kt_quantizer() {
static std::mutex mutex;
@@ -8006,7 +8022,7 @@ const QuantizerIQ2KT& iq2kt_quantizer() {
}
void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights,
float * qtmp) {
int * all_idx) {
constexpr float kSigmaScale = 2.0f;
using Q = QuantizerIQ2KT;
@@ -8025,6 +8041,11 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
Q::set_weights(kSigmaScale, nblock, x, quant_weights, all_weights);
float amax_row = 0;
for (int j = 0; j < n_per_row; ++j) {
amax_row = std::max(amax_row, std::abs(x[j]));
}
float amax_scale = 0, max_scale = 0;
for (int ibl = 0; ibl < nblock; ++ibl) {
@@ -8042,9 +8063,10 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float ax = std::abs(xb[j]);
amax = std::max(amax, ax);
}
quantizer.find_best_match( amax/96.f, xb, weight, best_idx);
float scale_0 = std::max(90.f, 124.f*amax/amax_row);
quantizer.find_best_match( amax/scale_0, xb, weight, best_idx);
auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
quantizer.find_best_match(-amax/96.f, xb, weight, best_idx + Q::kNg);
quantizer.find_best_match(-amax/scale_0, xb, weight, best_idx + Q::kNg);
auto [dm, score_m] = quantizer.find_best_scale(xb, weight, best_idx + Q::kNg);
auto idx = best_idx;
@@ -8052,12 +8074,7 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
else {
scales[ib] = dm; idx += Q::kNg;
}
auto qt = qtmp + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
for (int ig = 0; ig < Q::kNg; ++ig) {
auto q = quantizer.values() + idx[ig]*Q::kGroupSize;
for (int j = 0; j < Q::kGroupSize; ++j) qt[j] = q[j];
qt += Q::kGroupSize;
}
for (int ig = 0; ig < Q::kNg; ++ig) all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig] = idx[ig];
float abs_scale = std::abs(scales[ib]);
if (abs_scale > amax_scale) {
@@ -8080,20 +8097,22 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float sumqx = 0, sumq2 = 0;
for (int ibl = 0; ibl < nblock; ++ibl) {
const float * xb = x + ibl*Q::kSuperBlockSize;
const float * qb = qtmp + ibl*Q::kSuperBlockSize;
const float * wb = all_weights + ibl*Q::kSuperBlockSize;
auto scales = all_scales + ibl*Q::kNblock;
for (int ib = 0; ib < Q::kNblock; ++ib) {
int ls = best_index_iq4nl(iq4k_values, id*scales[ib]);
float dl = iq4k_values[ls];
for (int j = 0; j < Q::kBlockSize; ++j) {
float q = dl*qb[j];
sumqx += wb[j]*xb[j]*q;
sumq2 += wb[j]*q*q;
for (int ig = 0; ig < Q::kNg; ++ig) {
auto qb = quantizer.values() + Q::kGroupSize*all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig];
for (int j = 0; j < Q::kGroupSize; ++j) {
int jj = ig*Q::kGroupSize + j;
float q = dl*qb[j];
sumqx += wb[jj]*xb[jj]*q;
sumq2 += wb[jj]*q*q;
}
}
xb += Q::kBlockSize;
wb += Q::kBlockSize;
qb += Q::kBlockSize;
}
}
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
@@ -8129,6 +8148,26 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float dl = d*ls;
quantizer.find_best_match(dl, xb, weight, best_idx);
auto prev_idx = all_idx + (ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize;
float mse1 = 0, mse2 = 0;
for (int ig = 0; ig < Q::kNg; ++ig) {
auto q1 = quantizer.values() + Q::kGroupSize*prev_idx[ig];
auto q2 = quantizer.values() + Q::kGroupSize*best_idx[ig];
for (int j = 0; j < Q::kGroupSize; ++j) {
int jj = ig*Q::kGroupSize + j;
float diff1 = xb[jj] - dl*q1[j];
float diff2 = xb[jj] - dl*q2[j];
mse1 += weight[jj]*diff1*diff1;
mse2 += weight[jj]*diff2*diff2;
}
}
if (mse1 < mse2) {
for (int ig = 0; ig < Q::kNg; ++ig) best_idx[ig] = prev_idx[ig];
} else {
for (int ig = 0; ig < Q::kNg; ++ig) prev_idx[ig] = best_idx[ig];
}
for (int j = 0; j < Q::kNg; ++j) {
qs[j] = best_idx[j];
auto xl = xb + Q::kGroupSize*j;
@@ -8196,10 +8235,10 @@ size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
auto row_size = ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row);
std::vector<float> scales(n_per_row/QuantizerIQ2KT::kBlockSize);
std::vector<float> weights(n_per_row);
std::vector<float> xtmp(n_per_row);
std::vector<int> idx(n_per_row/QuantizerIQ2KT::kGroupSize);
char * qrow = (char *)dst;
for (int64_t row = 0; row < nrows; ++row) {
quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix, scales.data(), weights.data(), xtmp.data());
quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix, scales.data(), weights.data(), idx.data());
src += n_per_row;
qrow += row_size;
}
@@ -8209,7 +8248,7 @@ size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq2_kt(const block_iq2_kt * x, float * y, int64_t k) {
assert(k % QuantizerIQ2KT::kSuperBlockSize == 0);
#ifdef __AVX2__
if (iqk_dequantize_ktquants(GGML_TYPE_IQ2_KT, k, x, 0, y, 0, 1)) return;
//if (iqk_dequantize_ktquants(GGML_TYPE_IQ2_KT, k, x, 0, y, 0, 1)) return;
#endif
const int nb = k / QuantizerIQ2KT::kSuperBlockSize;
const float * dptr = (const float *)x;
@@ -8254,7 +8293,7 @@ void vec_dot_iq2_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
namespace {
using QuantizerIQ3KT = QuantizerIQKT<32, 8, 16, true>;
using QuantizerIQ3KT = QuantizerIQKT<32, 8, 16, true, true>;
const QuantizerIQ3KT& iq3kt_quantizer() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
@@ -8465,7 +8504,7 @@ size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq3_kt(const block_iq3_kt * x, float * y, int64_t k) {
#ifdef __AVX2__
if (iqk_dequantize_ktquants(GGML_TYPE_IQ3_KT, k, x, 0, y, 0, 1)) return;
//if (iqk_dequantize_ktquants(GGML_TYPE_IQ3_KT, k, x, 0, y, 0, 1)) return;
#endif
using Q = QuantizerIQ3KT;
constexpr int kNumGroups = Q::kSuperBlockSize/Q::kGroupSize;
@@ -8521,7 +8560,7 @@ void vec_dot_iq3_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
namespace{
using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15>;
using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15, false, true>;
const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
static std::mutex mutex;
@@ -8536,6 +8575,14 @@ const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
return *quantizer1;
}
const QuantizerIQ4KT& iq4kt_dequantizer() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
static std::unique_ptr<QuantizerIQ4KT> dequantizer;
if (!dequantizer) dequantizer = std::make_unique<QuantizerIQ4KT>(0, 0, 4096);
return *dequantizer;
}
void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights) {
constexpr float kSigmaScale = 2.0f;
@@ -8546,7 +8593,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
float * dptr = (float *)vy;
block_iq4_kt * y = (block_iq4_kt *)(dptr + 2);
block_iq4_kt * y = (block_iq4_kt *)(dptr + 1);
auto& quantizer1 = iq4kt_quantizer();
auto& quantizer2 = iq4kt_quantizer(true);
@@ -8555,13 +8602,10 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
Q::set_weights(kSigmaScale, nblock, x, quant_weights, all_weights);
float amax_row = 0, row_av = 0;
float amax_row = 0;
for (int j = 0; j < n_per_row; ++j) {
row_av += x[j];
amax_row = std::max(amax_row, std::abs(x[j]));
}
row_av /= n_per_row;
dptr[1] = row_av;
if (!amax_row) {
dptr[0] = 0.f;
std::memset(y, 0, nblock*sizeof(block_iq4_kt));
@@ -8584,7 +8628,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
float amax = 0;
for (int j = 0; j < Q::kBlockSize; ++j) {
xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
xaux[j] = xbl[ib*Q::kBlockSize+j];
float ax = std::abs(xaux[j]);
amax = std::max(amax, ax);
}
@@ -8593,7 +8637,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
continue;
}
float best = 0;
float scale_0 = std::max(92.f, 127.f*amax/amax_row);
float scale_0 = std::max(90.f, 124.f*amax/amax_row);
for (int itry = -kNtry; itry <= kNtry; ++itry) {
quantizer1.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
auto [dp, score_p] = quantizer1.find_best_scale(xaux, weight, best_idx);
@@ -8664,7 +8708,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
for (int ib = 0; ib < Q::kNblock; ++ib) {
auto& quantizer = y[ibl].qs[ib] & 1 ? quantizer2 : quantizer1;
const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j];
int ls = nearest_int(id*scales[ib]);
ls = std::min(ls, 63);
*(uint8_t *)(shb + ib) = ((ls + 64) << 1) | (shb[ib] & 1);
@@ -8724,7 +8768,7 @@ size_t quantize_iq4_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
#ifdef __AVX2__
if (iqk_dequantize_ktquants(GGML_TYPE_IQ4_KT, k, x, 0, y, 0, 1)) return;
//if (iqk_dequantize_ktquants(GGML_TYPE_IQ4_KT, k, x, 0, y, 0, 1)) return;
#endif
using Q = QuantizerIQ4KT;
assert(k % Q::kSuperBlockSize == 0);
@@ -8732,23 +8776,20 @@ void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
const int nb = k / Q::kSuperBlockSize;
const float * dptr = (const float *)x;
const float d = dptr[0] * Q::kScale;
const float row_av = dptr[1];
x = (const block_iq4_kt *)(dptr + 2);
auto& deq = iq4kt_quantizer();
x = (const block_iq4_kt *)(dptr + 1);
auto& deq = iq4kt_dequantizer();
for (int ibl = 0; ibl < nb; ++ibl) {
auto shb = x[ibl].qs;
auto ql = (const uint8_t *)(shb + Q::kNblock);
auto qh = ql + kNumGroups;
for (int ib = 0; ib < Q::kNblock; ++ib) {
int offset = shb[ib] & 1 ? 32768 + 4096 : 4096;
//auto& deq = shb[ib] & 1 ? deq2 : deq1;
int ls = int((shb[ib] & 0xff) >> 1) - 64;
float sl = d * ls;
for (int ig = 0; ig < Q::kNg; ++ig) {
int jj = ib*Q::kNg+ig;
uint16_t idx = ql[jj] | ((qh[jj%(kNumGroups/2)] << (8 - 4*(jj/(kNumGroups/2)))) & 0xf00) | (((shb[ib] >> (8 + 3*ig)) & 7) << 12);
deq.set_values(idx, y, sl, offset);
for (int j = 0; j < Q::kGroupSize; ++j) y[j] += row_av;
y += Q::kGroupSize;
}
}