Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-03-05 19:40:19 +00:00
Merge remote-tracking branch 'origin/main' into andrewkchan/try_trellis
@@ -60,7 +60,7 @@ private:
     int m_last_call = 0;
     int m_last_layer = 9999;
     int m_last_ffn = -1;
-    std::vector<float> m_src1_data;
+    std::vector<char> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
     std::vector<float> m_last_input;
     std::vector<float> m_ffn_input;
@@ -189,11 +189,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

     if (!is_host) {
-        m_src1_data.resize(ggml_nelements(src1));
-        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
+        auto nbytes = ggml_nbytes(src1);
+        m_src1_data.resize(nbytes);
+        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, nbytes);
     }

-    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
+    const float * data = is_host ? (const float *) src1->data : (const float *)m_src1_data.data();

     if (m_collect_lsim) {
         if (wname.find(".ffn_") != std::string::npos) {
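Taken together, the two hunks above switch the staging buffer from element-counted floats to a byte-sized char buffer: resize() and ggml_backend_tensor_get() now agree by construction, which the old code only guaranteed for F32 tensors (where nelements * sizeof(float) == nbytes). A minimal sketch of the resulting pattern, using only the ggml backend calls that appear in the diff (the helper name fetch_f32 is hypothetical):

    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Hypothetical helper illustrating the staging pattern above: copy a
    // tensor out of (possibly non-host) backend memory by byte count, then
    // view the bytes as floats.
    static const float * fetch_f32(const struct ggml_tensor * t, std::vector<char> & staging) {
        if (ggml_backend_buffer_is_host(t->buffer)) {
            return (const float *) t->data;   // host memory: read in place
        }
        const size_t nbytes = ggml_nbytes(t); // bytes, not element count
        staging.resize(nbytes);
        ggml_backend_tensor_get(t, staging.data(), 0, nbytes);
        return (const float *) staging.data();
    }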
@@ -331,10 +332,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         auto & e = m_stats[wname];
         if (e.values.empty()) {
-            e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
+            if (src0->ne[3] > 1) {
+                fprintf(stderr, "Unsupported 4D tensor %s\n", wname.c_str());
+                exit(1);
+            }
+            // If we have a 3D tensor, as is the case for attn_k_b and attn_v_b in DeepSeek MLA models,
+            // then we need to compute the imatrix for each head, and not just one imatrix for all heads.
+            // Hence, the storage we need is src0->ne[0]*src0->ne[2].
+            e.values.resize(src0->ne[0]*src0->ne[2], 0);
+            e.counts.resize(src0->ne[0]*src0->ne[2], 0);
         }
-        else if (e.values.size() != (size_t)src1->ne[0]) {
+        else if (e.values.size() != (size_t)(src0->ne[0]*src0->ne[2])) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ABORT("fatal error");
         }
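The hunk above sizes the accumulators per attention head for 3D activations instead of sharing one row of statistics across all heads. With hypothetical shapes src0->ne[0] = 512 and src0->ne[2] = 16 heads, the arrays hold 512*16 = 8192 entries and head h owns the contiguous slice [h*512, (h+1)*512). A sketch of that layout, all numbers assumed for illustration:

    #include <vector>

    // Per-head accumulator layout implied by the diff (numbers hypothetical).
    const int ne0   = 512;                    // row size, src0->ne[0]
    const int heads = 16;                     // src0->ne[2] for a 3D tensor
    std::vector<float> values(ne0 * heads, 0.0f);
    // Accumulators for head h start at offset h * ne0:
    float * head3 = values.data() + 3 * ne0;  // e.g. the slice for head 3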
@@ -342,14 +350,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (m_params.verbosity > 1) {
             printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         }
-        for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
-            const float * x = data + row * src1->ne[0];
-            for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                e.values[j] += x[j]*x[j];
-                e.counts[j]++;
-                if (!std::isfinite(e.values[j])) {
-                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
-                    exit(1);
+        int rk2 = src1->ne[2]/src0->ne[2];
+        for (int i12 = 0; i12 < (int)src1->ne[2]; ++i12) { // i.e., loop over attention heads for MLA models
+            int i02 = i12/rk2;
+            auto values = e.values.data() + i02*src0->ne[0];
+            auto counts = e.counts.data() + i02*src0->ne[0];
+            for (int i11 = 0; i11 < (int)src1->ne[1]; ++i11) {
+                const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    values[j] += x[j]*x[j];
+                    counts[j]++;
+                    if (!std::isfinite(values[j])) {
+                        fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                        exit(1);
+                    }
                 }
             }
         }
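The rewritten loop replaces the flat row walk for two reasons: ggml_mul_mat broadcasts src0 across src1's third dimension, so rk2 = src1->ne[2]/src0->ne[2] maps activation slice i12 to accumulator slice i02 = i12/rk2, and rows are now addressed through the byte strides nb[1] and nb[2] instead of assuming contiguous data. (Note the diagnostic still prints e.values[j], which indexes from the start of the whole array; the value just accumulated lives in values[j].) A standalone sketch of the same accumulation on plain arrays, all shapes hypothetical and strides taken as contiguous for simplicity:

    // Head-broadcast accumulation in the style of the diff (hypothetical
    // shapes; real code must use the tensor's byte strides, as above).
    void accumulate(const float * data, float * values, long * counts,
                    int ne0, int ne1, int ne2_src1, int ne2_src0) {
        int rk2 = ne2_src1 / ne2_src0;                // broadcast factor
        for (int i12 = 0; i12 < ne2_src1; ++i12) {
            int i02 = i12 / rk2;                      // accumulator slice for this head
            float * v = values + i02 * ne0;
            long  * c = counts + i02 * ne0;
            for (int i11 = 0; i11 < ne1; ++i11) {
                const float * x = data + (i12 * (long)ne1 + i11) * ne0;
                for (int j = 0; j < ne0; ++j) {
                    v[j] += x[j] * x[j];              // sum of squared activations
                    c[j]++;
                }
            }
        }
    }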
@@ -69,7 +69,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", },
     { "IQ4_KS",   LLAMA_FTYPE_MOSTLY_IQ4_KS,   " 4.25 bpw non-linear quantization", },
     { "IQ4_KS_R4",LLAMA_FTYPE_MOSTLY_IQ4_KS_R4,"IQ4_KS repacked", },
+    { "IQ5_KS_R4",LLAMA_FTYPE_MOSTLY_IQ5_KS_R4,"IQ5_KS repacked", },
     { "IQ4_KSS",  LLAMA_FTYPE_MOSTLY_IQ4_KSS,  " 4.0 bpw non-linear quantization", },
+    { "IQ5_KS",   LLAMA_FTYPE_MOSTLY_IQ5_KS,   " 5.25 bpw non-linear quantization", },
     { "IQ2_K",    LLAMA_FTYPE_MOSTLY_IQ2_K,    " 2.375 bpw non-linear quantization",},
     { "IQ2_K_R4", LLAMA_FTYPE_MOSTLY_IQ2_K_R4, "IQ2_K repacked",},
     { "IQ2_KS",   LLAMA_FTYPE_MOSTLY_IQ2_KS,   " 2.1875 bpw non-linear quantization",},
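The two new entries expose the 5.25 bpw IQ5_KS type and its repacked variant IQ5_KS_R4 to the quantize tool. Usage would follow the usual llama.cpp-style invocation, sketched here with an illustrative binary name and file paths:

    # hypothetical paths; the tool and type name come from the table above
    ./llama-quantize model-f16.gguf model-iq5_ks.gguf IQ5_KS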
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

     // warm up
-    {
+    if (params.warmup) {
         llama_batch_add(batch, bos, 0, { 0 }, false);

         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@@ -115,6 +115,22 @@ int main(int argc, char ** argv) {
             return 1;
         }
     }
+    if (params.batch_warmup) {
+        // clean up KV cache after generation
+        llama_kv_cache_seq_rm(ctx, 0, params.n_ubatch, -1);
+
+        // prepare batch of pp size for prompt processing performance measurement
+        llama_batch_clear(batch);
+
+        for (unsigned int i = 0; i < params.n_ubatch; ++i) {
+            llama_batch_add(batch, std::rand() % n_vocab, i, { 0 }, false);
+        }
+
+        if (!decode_helper(ctx, batch, ctx_params.n_ubatch)) {
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+    }

     llama_batch_clear(batch);
     llama_kv_cache_clear(ctx);
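The single-token warmup is now gated on params.warmup, and the new params.batch_warmup pass decodes a full ubatch of random tokens so the first measured prompt-processing run does not pay one-time setup costs. A minimal sketch of that pattern, assuming the llama_batch_* helpers already used in the diff (the name warmup_pp is hypothetical):

    // Hypothetical helper mirroring the batch-warmup pass above: decode one
    // batch of n_ubatch random tokens at positions 0..n_ubatch-1 on seq 0.
    static bool warmup_pp(llama_context * ctx, llama_batch & batch,
                          int n_ubatch, int n_vocab) {
        llama_batch_clear(batch);
        for (int i = 0; i < n_ubatch; ++i) {
            llama_batch_add(batch, std::rand() % n_vocab, i, { 0 }, false);
        }
        return llama_decode(ctx, batch) == 0; // 0 means success
    }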