[fix]: fix precision

2026-04-20 14:29:22 +00:00 · 2025-11-07 14:56:05 +00:00
parent 32fab532c6
commit 7b88bb3d39
3 changed files with 30 additions and 51 deletions
--- a/kt-kernel/operators/amx/la/amx_buffers.hpp
+++ b/kt-kernel/operators/amx/la/amx_buffers.hpp
@@ -5,7 +5,9 @@
 #include <cstdint>
 #include <cstdio>
 #include <limits>
+#include <vector>

+#include "amx_config.hpp"
 #include "amx_utils.hpp"
 #include "llama.cpp/ggml-impl.h"
 #include "pack.hpp"
@@ -46,41 +48,16 @@ struct BufferAImpl {
    assert(ith == 0 && nth == 1);
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int i = 0; i < M_STEP && m_begin + i < m; i++) {
-        __m512 amax_v0 = _mm512_setzero_ps();
-        __m512 amax_v1 = _mm512_setzero_ps();
-        __m512 amax_v2 = _mm512_setzero_ps();
-        __m512 amax_v3 = _mm512_setzero_ps();
-        __m512 amax_v4 = _mm512_setzero_ps();
-        __m512 amax_v5 = _mm512_setzero_ps();
-        __m512 amax_v6 = _mm512_setzero_ps();
-        __m512 amax_v7 = _mm512_setzero_ps();
-        for (int j = 0; j < k; j += 128) {
-          __m512 f0, f1, f2, f3, f4, f5, f6, f7;
-          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 0), &f0, &f1);
-          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 32), &f2, &f3);
-          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 64), &f4, &f5);
-          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j + 96), &f6, &f7);
-          amax_v0 = vector_abs_max(amax_v0, f0);
-          amax_v1 = vector_abs_max(amax_v1, f1);
-          amax_v2 = vector_abs_max(amax_v2, f2);
-          amax_v3 = vector_abs_max(amax_v3, f3);
-          amax_v4 = vector_abs_max(amax_v4, f4);
-          amax_v5 = vector_abs_max(amax_v5, f5);
-          amax_v6 = vector_abs_max(amax_v6, f6);
-          amax_v7 = vector_abs_max(amax_v7, f7);
+        float amax = 0.0f;
+        for (int j = 0; j < k; j += 32) {
+          __m512 f0, f1;
+          avx512_32xbf16_to_32xfp32((__m512i*)(src + (m_begin + i) * k + j), &f0, &f1);
+          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f0)));
+          amax = MAX(amax, _mm512_reduce_max_ps(_mm512_abs_ps(f1)));
        }
-        amax_v0 = vector_abs_max(amax_v0, amax_v1);
-        amax_v2 = vector_abs_max(amax_v2, amax_v3);
-        amax_v4 = vector_abs_max(amax_v4, amax_v5);
-        amax_v6 = vector_abs_max(amax_v6, amax_v7);
-        amax_v0 = vector_abs_max(amax_v0, amax_v2);
-        amax_v4 = vector_abs_max(amax_v4, amax_v6);
-        amax_v0 = vector_abs_max(amax_v0, amax_v4);
-        float amax = _mm512_reduce_max_ps(amax_v0);
        d[m_begin + i] = amax / ((1 << 7) - 1);
      }
    }
-
    int m_block_size = (m + M_STEP - 1) / M_STEP * M_STEP;
    for (int m_begin = 0; m_begin < m; m_begin += M_STEP) {
      for (int k_block_begin = 0; k_block_begin < k; k_block_begin += K_BLOCK) {
--- a/kt-kernel/operators/amx/la/amx_kernels.hpp
+++ b/kt-kernel/operators/amx/la/amx_kernels.hpp
@@ -1180,9 +1180,9 @@ struct GemmKernel224Int4 {
  }

  static void load_a(dt* a, size_t lda) {
-#ifdef __AMX__
-    _tile_stream_loadd(0, a, lda);
-    _tile_stream_loadd(1, offset_pointer(a, lda * TILE_M), lda);
+#ifdef HAVE_AMX
+    _tile_loadd(0, a, lda);
+    _tile_loadd(1, offset_pointer(a, lda * TILE_M), lda);
 #else
    (void)a;
    (void)lda;
--- a/kt-kernel/operators/amx/moe.hpp
+++ b/kt-kernel/operators/amx/moe.hpp
@@ -29,7 +29,10 @@
 #include "../../cpu_backend/worker_pool.h"
 #include "../moe-tp.hpp"
 #include "la/amx.hpp"
+#include "llama.cpp/ggml-impl.h"
+#include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
+#include "llamafile/sgemm.h"

 template <class T>
 class AMX_MOE_TP {
@@ -261,6 +264,8 @@ class AMX_MOE_TP {
  ~AMX_MOE_TP() {
    // shared_mem_buffer_numa.dealloc(this);
  }
+  // pack and quant the weights
+  void pack_weights() {}
  void load_weights() {
    auto pool = config_.pool->get_subpool(tp_part_idx);
    const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
@@ -269,7 +274,7 @@ class AMX_MOE_TP {
          config_.expert_num, nullptr,
          [this, physical_to_logical_map](int expert_id) {
            // printf("Load layer %d [%d/%d]\n", config_.layer_idx, expert_id, config_.expert_num);
-            uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_id);
+            uint64_t logical_expert_id = expert_id;
            {
              size_t scale_size = config_.intermediate_size * sizeof(float);
              size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size) - scale_size;
@@ -307,7 +312,7 @@ class AMX_MOE_TP {
        std::cout << "Loading from " << prefix << std::endl;
        for (int task_id = 0; task_id < config_.expert_num * mat_type_all * mat_split; task_id++) {
          int64_t expert_idx = task_id / (mat_type_all * mat_split);
-          uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
+          uint64_t logical_expert_id = expert_idx;
          uint8_t mat_class = (task_id % (mat_type_all * mat_split)) / mat_split;
          uint8_t mat_split_idex = task_id % mat_split;
          if (mat_class == 0) {  // the up matrix
@@ -341,32 +346,30 @@ class AMX_MOE_TP {
        }
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
-            [this, nth, physical_to_logical_map](int task_id) {
+            [this, nth](int task_id) {
              int64_t expert_idx = task_id / nth;
-              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // gate part
-              gate_bb_[logical_expert_id]->from_mat(
-                  (ggml_bf16_t*)config_.gate_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
-                  ith, nth);
+              gate_bb_[expert_idx]->from_mat(
+                  (ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith,
+                  nth);
              // up part
-              up_bb_[logical_expert_id]->from_mat(
-                  (ggml_bf16_t*)config_.up_proj + logical_expert_id * config_.intermediate_size * config_.hidden_size,
-                  ith, nth);
+              up_bb_[expert_idx]->from_mat(
+                  (ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith,
+                  nth);
            },
            nullptr);

        nth = T::recommended_nth(config_.hidden_size);
        pool->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
-            [this, nth, physical_to_logical_map](int task_id) {
+            [this, nth](int task_id) {
              int64_t expert_idx = task_id / nth;
-              uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
              int ith = task_id % nth;
              // down part
-              down_bb_[logical_expert_id]->from_mat(
-                  (ggml_bf16_t*)config_.down_proj + logical_expert_id * config_.hidden_size * config_.intermediate_size,
-                  ith, nth);
+              down_bb_[expert_idx]->from_mat(
+                  (ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith,
+                  nth);
              // printf("load down, expert %ld, ith %d, total nth %d\n", expert_idx, ith, nth);
            },
            nullptr);
@@ -378,9 +381,8 @@ class AMX_MOE_TP {
      if (config_.save) {
        pool->do_work_stealing_job(
            config_.expert_num * mat_type_all, nullptr,
-            [this, physical_to_logical_map](int task_id) {
+            [this](int task_id) {
              int64_t expert_idx = task_id / mat_type_all;
-              expert_idx = expert_map(physical_to_logical_map, expert_idx);
              uint8_t mat_class = task_id % mat_type_all;
              if (mat_class == 0) {  // the up matrix
                size_t size = T::BufferB::required_size(config_.intermediate_size, config_.hidden_size);