From 12949c8acd9120a5c9d75a6b53ce667f586a82de Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 01:47:14 +0800
Subject: [PATCH 01/17] fix default options

---
 .github/workflows/install.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index c8a87a49..d8f199a0 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -6,7 +6,7 @@ on:
       job_to_run:
         description: "Which job to run?"
         required: true
-        default: "install&test"
+        default: "install-test"
         type: choice
         options:
           - create-install-test

From 0f1684c28d306aed40198dcc959fbc63977cef09 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:31:19 +0800
Subject: [PATCH 02/17] local chat for cicd test

---
 ktransformers/local_chat_test.py | 171 +++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 ktransformers/local_chat_test.py

diff --git a/ktransformers/local_chat_test.py b/ktransformers/local_chat_test.py
new file mode 100644
index 00000000..2927fe03
--- /dev/null
+++ b/ktransformers/local_chat_test.py
@@ -0,0 +1,171 @@
+"""
+Description  : Single-round, non-interactive variant of local chat, run by the CI workflow.
+Author       : Boxin Zhang, Azure-Tang
+Version      : 0.1.0
+Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
+"""
+
+import os
+import platform
+import sys
+
+project_dir = os.path.dirname(os.path.dirname(__file__))
+sys.path.insert(0, project_dir)
+import torch
+import logging
+from transformers import (
+    AutoTokenizer,
+    AutoConfig,
+    AutoModelForCausalLM,
+    GenerationConfig,
+    TextStreamer,
+)
+import json
+import fire
+from ktransformers.optimize.optimize import optimize_and_load_gguf
+from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
+from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
+from ktransformers.models.modeling_deepseek_v3 import DeepseekV3ForCausalLM
+from ktransformers.models.modeling_llama import LlamaForCausalLM
+from ktransformers.models.modeling_mixtral import MixtralForCausalLM
+from ktransformers.util.utils import prefill_and_generate, get_compute_capability
+from ktransformers.server.config.config import Config
+from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
+
+custom_models = {
+    "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
+    "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
+    "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
+    "LlamaForCausalLM": LlamaForCausalLM,
+    "MixtralForCausalLM": MixtralForCausalLM,
+}
+
+ktransformer_rules_dir = (
+    os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
+)
+default_optimize_rules = {
+    "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
+    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat.yaml",
+    "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
+    "LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
+    "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
+}
+
+
+def local_chat(
+    model_path: str | None = None,
+    optimize_config_path: str | None = None,
+    gguf_path: str | None = None,
+    max_new_tokens: int = 1000,
+    cpu_infer: int = Config().cpu_infer,
+    use_cuda_graph: bool = True,
+    prompt_file: str | None = None,
+    mode: str = "normal",
+    force_think: bool = False,
+    chunk_prefill_size: int = 8192,
+):
+    """Run a single non-interactive chat round (used as the CI smoke test).
+
+    Args:
+        model_path: Passed to the tokenizer/config/generation-config ``from_pretrained`` calls.
+        optimize_config_path: YAML rule file; if None, the per-architecture default is used, or input() asks.
+        gguf_path: Passed to ``optimize_and_load_gguf``; asked via input() if None.
+        max_new_tokens: Forwarded to ``prefill_and_generate``.
+        cpu_infer: Assigned to ``Config().cpu_infer`` before the model is built.
+        use_cuda_graph: Forwarded to ``prefill_and_generate``.
+        prompt_file: Text file used as the user message; a built-in prompt is used if None.
+        mode: "long_context" selects float16 and is restricted to LlamaForCausalLM.
+        force_think: Append the encoded ``<think>`` marker to the prompt tokens.
+        chunk_prefill_size: Forwarded to ``prefill_and_generate``.
+
+    Raises:
+        AssertionError: long_context precondition violated, or prompt_file does not exist.
+    """
+    torch.set_grad_enabled(False)
+    Config().cpu_infer = cpu_infer
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    arch = config.architectures[0]
+    if mode == 'long_context':
+        assert arch == "LlamaForCausalLM", "only LlamaForCausalLM support long_context mode"
+        torch.set_default_dtype(torch.float16)
+    else:
+        torch.set_default_dtype(config.torch_dtype)
+
+    # Instantiate on the meta device; optimize_and_load_gguf() is applied to the result afterwards.
+    with torch.device("meta"):
+        if arch in custom_models:
+            print("using custom modeling_xxx.py.")
+            if "Qwen2Moe" in arch:  # Qwen2Moe must use flash_attention_2 to avoid overflow.
+                config._attn_implementation = "flash_attention_2"
+            if "Llama" in arch:
+                config._attn_implementation = "eager"
+            if "Mixtral" in arch:
+                config._attn_implementation = "flash_attention_2"
+            model = custom_models[arch](config)
+        else:
+            model = AutoModelForCausalLM.from_config(
+                config, trust_remote_code=True, attn_implementation="flash_attention_2"
+            )
+
+    if optimize_config_path is None:
+        if arch in default_optimize_rules:
+            print("using default_optimize_rule for", arch)
+            optimize_config_path = default_optimize_rules[arch]
+        else:
+            optimize_config_path = input(
+                "please input the path of your rule file(yaml file containing optimize rules):"
+            )
+
+    if gguf_path is None:
+        gguf_path = input(
+            "please input the path of your gguf file(gguf file in the dir containing input gguf file must all belong to current model):"
+        )
+    optimize_and_load_gguf(model, optimize_config_path, gguf_path, config)
+
+    try:
+        model.generation_config = GenerationConfig.from_pretrained(model_path)
+    except Exception as e:
+        # Deliberate best effort: any load failure falls back to fixed sampling defaults.
+        print(f"generation config can't auto create, make default. Message: {e}")
+        model.generation_config = GenerationConfig(temperature=0.6, top_p=0.95, do_sample=True)
+    if model.generation_config.pad_token_id is None:
+        model.generation_config.pad_token_id = model.generation_config.eos_token_id
+    model.eval()
+    logging.basicConfig(level=logging.INFO)
+
+    system = platform.system()
+    os.system("cls" if system == "Windows" else "clear")
+
+    if prompt_file is not None:
+        assert os.path.isfile(prompt_file), "prompt file not exist"
+        print(f"prompt file is {prompt_file}")
+        with open(prompt_file, "r") as f:  # context manager so the handle is always closed
+            content = f.read()
+    else:
+        content = "Please write a piece of quicksort code in C++."
+
+    print('Start Testing...(1 round)')
+    print('Prompt:', content)
+
+    messages = [{"role": "user", "content": content}]
+    input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+    if force_think:
+        # NOTE(review): "\\n" encodes a literal backslash + "n", not a newline -- confirm this is intended.
+        token_thinks = torch.tensor([tokenizer.encode("<think>\\n",add_special_tokens=False)],device=input_tensor.device)
+        input_tensor = torch.cat([input_tensor, token_thinks], dim=1)
+    if mode == 'long_context':
+        assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
+            "please change max_seq_len in  ~/.ktransformers/config.yaml"
+
+    # Extra flashinfer MLA arguments are passed only for DeepSeek V2/V3, off Windows, with compute capability >= 8.
+    mla_kwargs = {}
+    if system != "Windows" and arch in ("DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
+        mla_kwargs = dict(use_flashinfer_mla=True, num_heads=config.num_attention_heads, head_dim_ckv=config.kv_lora_rank, head_dim_kpe=config.qk_rope_head_dim, q_head_dim=config.qk_rope_head_dim + config.qk_nope_head_dim)
+    prefill_and_generate(
+        model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode=mode, force_think=force_think, chunk_prefill_size=chunk_prefill_size, **mla_kwargs,
+    )
+
+if __name__ == "__main__":
+    fire.Fire(local_chat)

From 9812d57c1196856f9f6c526d0ad0191cad403f6b Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:31:49 +0800
Subject: [PATCH 03/17] fix typo, logging to file

---
 .github/workflows/install.yml | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index d8f199a0..c16dda83 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -60,6 +60,11 @@ jobs:
           export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
-          python ktransformers/local_chat.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 100 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/promptsbook.txt
-          DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 100 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt
+          echo "Running Local Chat 1"
+          python ktransformers/local_chat.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
+          sed -n '/Prompt:,$p' log1.txt
+          echo "Running Local Chat 2"
+          python ktransformers/local_chat.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
+          sed -n '/Prompt:,$p' log2.txt
+
       - run: echo "This job's status is ${{ job.status }}."

From 57cf449a97097045b560001d1dcb9ce5754be42f Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:35:56 +0800
Subject: [PATCH 04/17] fix command

---
 .github/workflows/install.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index c16dda83..cce443b2 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -61,10 +61,10 @@ jobs:
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
           echo "Running Local Chat 1"
-          python ktransformers/local_chat.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
+          python ktransformers/local_chat-test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
           sed -n '/Prompt:,$p' log1.txt
           echo "Running Local Chat 2"
-          python ktransformers/local_chat.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
+          python ktransformers/local_chat-test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
           sed -n '/Prompt:,$p' log2.txt
 
       - run: echo "This job's status is ${{ job.status }}."

From 129e013b41133e9bf236642fa43362e68623716a Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:36:37 +0800
Subject: [PATCH 05/17] rename cicd

---
 .github/workflows/install.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index cce443b2..fce549cb 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -1,5 +1,5 @@
-name: Install and Test KTransformers
-run-name: Install and Test KTransformers
+name: Install / Test KTransformers
+run-name: Install / Test KTransformers
 on:
   workflow_dispatch:
     inputs:

From a31e09969f8423fd9cd93130e00a0376eeffe024 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:37:08 +0800
Subject: [PATCH 06/17] fix typo

---
 .github/workflows/install.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index fce549cb..a58b426c 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -61,10 +61,10 @@ jobs:
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
           echo "Running Local Chat 1"
-          python ktransformers/local_chat-test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
           sed -n '/Prompt:,$p' log1.txt
           echo "Running Local Chat 2"
-          python ktransformers/local_chat-test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
           sed -n '/Prompt:,$p' log2.txt
 
       - run: echo "This job's status is ${{ job.status }}."

From 0be19c39e97fc81e3c5113dacfee7ae0832b0833 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:37:54 +0800
Subject: [PATCH 07/17] change cicd option default

---
 .github/workflows/install.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index a58b426c..924feb2f 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -6,7 +6,7 @@ on:
       job_to_run:
         description: "Which job to run?"
         required: true
-        default: "install-test"
+        default: "test"
         type: choice
         options:
           - create-install-test

From f21ea700f30979dcd9ffb3150264316e9085a65a Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:45:35 +0800
Subject: [PATCH 08/17] fix term

---
 .github/workflows/install.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 924feb2f..9b1522e2 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -53,6 +53,8 @@ jobs:
           git submodule update
           bash install.sh
       - name: Test Local Chat
+        env:
+          TERM: xterm-256color
         run: |
           source /home/qujing3/anaconda3/etc/profile.d/conda.sh
           conda activate ktransformers-dev

From 2ed4dff85d70a357b055446839eaa0ecbfc15c6c Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:51:03 +0800
Subject: [PATCH 09/17] fix command typo

---
 .github/workflows/install.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 9b1522e2..610368e6 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -64,9 +64,9 @@ jobs:
           cd ${{ github.workspace }}
           echo "Running Local Chat 1"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
-          sed -n '/Prompt:,$p' log1.txt
+          sed -n '/Prompt:/,$p' log1.txt
           echo "Running Local Chat 2"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
-          sed -n '/Prompt:,$p' log2.txt
+          sed -n '/Prompt:/,$p' log2.txt
 
       - run: echo "This job's status is ${{ job.status }}."

From 336b5dd59024ee5434fe8daabfa0762a68b63e60 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 02:55:36 +0800
Subject: [PATCH 10/17] fix sed command

---
 .github/workflows/install.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 610368e6..29caecab 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -62,11 +62,11 @@ jobs:
           export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
-          echo "Running Local Chat 1"
+          echo "Running Local Chat 1...(book.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
-          sed -n '/Prompt:/,$p' log1.txt
-          echo "Running Local Chat 2"
+          echo $(sed -n '/Prompt:/,$p' log1.txt)
+          echo "Running Local Chat 2...(chinese.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
-          sed -n '/Prompt:/,$p' log2.txt
+          echo $(sed -n '/Prompt:/,$p' log2.txt)
 
       - run: echo "This job's status is ${{ job.status }}."

From 9d19b7b4d4c03029fca2048638b37eea791fbc68 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 03:03:38 +0800
Subject: [PATCH 11/17] fix sed

---
 .github/workflows/install.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 29caecab..42714f3f 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -56,6 +56,7 @@ jobs:
         env:
           TERM: xterm-256color
         run: |
+          set -e
           source /home/qujing3/anaconda3/etc/profile.d/conda.sh
           conda activate ktransformers-dev
           export PATH=/usr/local/cuda-12.4/bin:$PATH
@@ -64,9 +65,11 @@ jobs:
           cd ${{ github.workspace }}
           echo "Running Local Chat 1...(book.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
-          echo $(sed -n '/Prompt:/,$p' log1.txt)
+          output=$(sed -n '/Prompt:/,$p' log1.txt)
+          echo "$output"
           echo "Running Local Chat 2...(chinese.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
-          echo $(sed -n '/Prompt:/,$p' log2.txt)
+          output=$(sed -n '/Prompt:/,$p' log2.txt)
+          echo "$output"
 
       - run: echo "This job's status is ${{ job.status }}."

From 6385308ff049b231b5c544993de38585af8c01d0 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 03:11:26 +0800
Subject: [PATCH 12/17] replace sed with awk

---
 .github/workflows/install.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 42714f3f..e0c89aea 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -65,11 +65,11 @@ jobs:
           cd ${{ github.workspace }}
           echo "Running Local Chat 1...(book.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
-          output=$(sed -n '/Prompt:/,$p' log1.txt)
+          output=$(awk '/Prompt:/ {found=1} found' log1.txt) || exit_code=$?
           echo "$output"
           echo "Running Local Chat 2...(chinese.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
-          output=$(sed -n '/Prompt:/,$p' log2.txt)
+          output=$(awk '/Prompt:/ {found=1} found' log2.txt) || exit_code=$?
           echo "$output"
 
       - run: echo "This job's status is ${{ job.status }}."

From 570c98c52db6ff0b25700e630f14c6f1e68e68c8 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 03:17:17 +0800
Subject: [PATCH 13/17] remove output test

---
 .github/workflows/install.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index e0c89aea..4595418d 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -65,9 +65,6 @@ jobs:
           cd ${{ github.workspace }}
           echo "Running Local Chat 1...(book.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
-          output=$(awk '/Prompt:/ {found=1} found' log1.txt) || exit_code=$?
-          echo "$output"
-          echo "Running Local Chat 2...(chinese.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
           output=$(awk '/Prompt:/ {found=1} found' log2.txt) || exit_code=$?
           echo "$output"

From 0899b7dde6ef371bd0e047c3b2e6630d3c7b41df Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 03:17:35 +0800
Subject: [PATCH 14/17] remove file output test

---
 .github/workflows/install.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 4595418d..6b839340 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -64,9 +64,7 @@ jobs:
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
           echo "Running Local Chat 1...(book.txt)"
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt > log2.txt
-          output=$(awk '/Prompt:/ {found=1} found' log2.txt) || exit_code=$?
-          echo "$output"
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt
 
       - run: echo "This job's status is ${{ job.status }}."

From 4e23a4c02452e7dc90adb83e2096b17ab17ebeb7 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 11:32:43 +0800
Subject: [PATCH 15/17] split two test

---
 .github/workflows/install.yml | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 6b839340..ae3fc951 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -52,9 +52,7 @@ jobs:
           git submodule init
           git submodule update
           bash install.sh
-      - name: Test Local Chat
-        env:
-          TERM: xterm-256color
+      - name: Test Local Chat 1
         run: |
           set -e
           source /home/qujing3/anaconda3/etc/profile.d/conda.sh
@@ -65,6 +63,16 @@ jobs:
           cd ${{ github.workspace }}
           echo "Running Local Chat 1...(book.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt
+      - name: Test Local Chat 2
+        run: |
+          set -e
+          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
+          conda activate ktransformers-dev
+          export PATH=/usr/local/cuda-12.4/bin:$PATH
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
+          export CUDA_HOME=/usr/local/cuda-12.4
+          cd ${{ github.workspace }}
+          echo "Running Local Chat 2...(chinese.txt)"
           python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt
 
       - run: echo "This job's status is ${{ job.status }}."

From a1891b845d6739fc27cbb964184767e83c6cf3c8 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 17:04:42 +0800
Subject: [PATCH 16/17] remove unsupported parameters, add force think

---
 .github/workflows/install.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index ae3fc951..4a0accce 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -62,7 +62,7 @@ jobs:
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
           echo "Running Local Chat 1...(book.txt)"
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt
       - name: Test Local Chat 2
         run: |
           set -e
@@ -72,7 +72,7 @@ jobs:
           export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
-          echo "Running Local Chat 2...(chinese.txt)"
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cache_len 1536 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt
+          echo "Running Local Chat 2 [force think]...(chinese.txt)"
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt -f
 
       - run: echo "This job's status is ${{ job.status }}."

From c66ca65778156b939b754f42b965ebc664a0d6f0 Mon Sep 17 00:00:00 2001
From: SkqLiao <skqliao@gmail.com>
Date: Sat, 15 Mar 2025 17:10:44 +0800
Subject: [PATCH 17/17] write to log

---
 .github/workflows/install.yml | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml
index 4a0accce..3c2b9cf3 100644
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -61,18 +61,11 @@ jobs:
           export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
           export CUDA_HOME=/usr/local/cuda-12.4
           cd ${{ github.workspace }}
-          echo "Running Local Chat 1...(book.txt)"
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt
-      - name: Test Local Chat 2
-        run: |
-          set -e
-          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
-          conda activate ktransformers-dev
-          export PATH=/usr/local/cuda-12.4/bin:$PATH
-          export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
-          export CUDA_HOME=/usr/local/cuda-12.4
-          cd ${{ github.workspace }}
-          echo "Running Local Chat 2 [force think]...(chinese.txt)"
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt -f
+          echo "Running Local Chat 1 (book.txt) ..."
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
+          sed -n '/Prompt:/,$p' log1.txt
+          echo "Running Local Chat 2 [force think] (chinese.txt) ..."
+          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt -f > log2.txt
+          sed -n '/Prompt:/,$p' log2.txt
 
       - run: echo "This job's status is ${{ job.status }}."