mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-20 14:29:22 +00:00
Add Native Precision Tutorial, update worker strategy and README.md (#1807)
This commit is contained in:
@@ -176,6 +176,7 @@ void InNumaPool::process_tasks(int thread_id) {
|
||||
}
|
||||
|
||||
int block = (rem + worker_count - 1) / worker_count;
|
||||
block = 1;
|
||||
int task_id = curr_.fetch_add(block, std::memory_order_acq_rel);
|
||||
if (task_id >= end_) {
|
||||
break;
|
||||
|
||||
@@ -270,6 +270,7 @@ def _stream_response(
|
||||
) -> str:
|
||||
"""Generate streaming response and display in real-time."""
|
||||
response_content = ""
|
||||
reasoning_content = ""
|
||||
|
||||
try:
|
||||
stream = client.chat.completions.create(
|
||||
@@ -281,8 +282,13 @@ def _stream_response(
|
||||
)
|
||||
|
||||
for chunk in stream:
|
||||
if chunk.choices[0].delta.content:
|
||||
content = chunk.choices[0].delta.content
|
||||
delta = chunk.choices[0].delta
|
||||
reasoning_delta = getattr(delta, "reasoning_content", None)
|
||||
if reasoning_delta:
|
||||
reasoning_content += reasoning_delta
|
||||
console.print(reasoning_delta, end="", style="dim")
|
||||
if delta.content:
|
||||
content = delta.content
|
||||
response_content += content
|
||||
console.print(content, end="")
|
||||
|
||||
|
||||
@@ -424,6 +424,10 @@ class NativeMoEWrapper(BaseMoEWrapper):
|
||||
if self.method == "RAWINT4":
|
||||
assert self.gate_scales[0].dtype == torch.bfloat16, "Expected bf16 scales for RAWINT4"
|
||||
elif self.method == "FP8":
|
||||
if self.gate_scales[0].dtype != torch.float32:
|
||||
self.gate_scales = [t.to(torch.float32).contiguous() for t in weights["gate_scale"]]
|
||||
self.up_scales = [t.to(torch.float32).contiguous() for t in weights["up_scale"]]
|
||||
self.down_scales = [t.to(torch.float32).contiguous() for t in weights["down_scale"]]
|
||||
assert self.gate_scales[0].dtype == torch.float32, "Expected float32 scales for FP8"
|
||||
elif self.method == "FP8_PERCHANNEL":
|
||||
assert self.gate_scales[0].dtype == torch.float32, "Expected float32 scales for FP8_PERCHANNEL"
|
||||
|
||||
Reference in New Issue
Block a user