Add Native Precision Tutorial, update worker strategy and README.md (#1807)

2026-04-20 14:29:22 +00:00 · 2026-01-23 18:00:13 +08:00
parent 8652346e69
commit bf4c8a690b
5 changed files with 308 additions and 2 deletions
--- a/kt-kernel/cpu_backend/worker_pool.cpp
+++ b/kt-kernel/cpu_backend/worker_pool.cpp
@@ -176,6 +176,7 @@ void InNumaPool::process_tasks(int thread_id) {
    }

    int block = (rem + worker_count - 1) / worker_count;
+    block = 1;
    int task_id = curr_.fetch_add(block, std::memory_order_acq_rel);
    if (task_id >= end_) {
      break;
--- a/kt-kernel/python/cli/commands/chat.py
+++ b/kt-kernel/python/cli/commands/chat.py
@@ -270,6 +270,7 @@ def _stream_response(
 ) -> str:
    """Generate streaming response and display in real-time."""
    response_content = ""
+    reasoning_content = ""

    try:
        stream = client.chat.completions.create(
@@ -281,8 +282,13 @@ def _stream_response(
        )

        for chunk in stream:
-            if chunk.choices[0].delta.content:
-                content = chunk.choices[0].delta.content
+            delta = chunk.choices[0].delta
+            reasoning_delta = getattr(delta, "reasoning_content", None)
+            if reasoning_delta:
+                reasoning_content += reasoning_delta
+                console.print(reasoning_delta, end="", style="dim")
+            if delta.content:
+                content = delta.content
                response_content += content
                console.print(content, end="")

--- a/kt-kernel/python/utils/amx.py
+++ b/kt-kernel/python/utils/amx.py
@@ -424,6 +424,10 @@ class NativeMoEWrapper(BaseMoEWrapper):
            if self.method == "RAWINT4":
                assert self.gate_scales[0].dtype == torch.bfloat16, "Expected bf16 scales for RAWINT4"
            elif self.method == "FP8":
+                if self.gate_scales[0].dtype != torch.float32:
+                    self.gate_scales = [t.to(torch.float32).contiguous() for t in weights["gate_scale"]]
+                    self.up_scales = [t.to(torch.float32).contiguous() for t in weights["up_scale"]]
+                    self.down_scales = [t.to(torch.float32).contiguous() for t in weights["down_scale"]]
                assert self.gate_scales[0].dtype == torch.float32, "Expected float32 scales for FP8"
            elif self.method == "FP8_PERCHANNEL":
                assert self.gate_scales[0].dtype == torch.float32, "Expected float32 scales for FP8_PERCHANNEL"