diff --git a/Makefile b/Makefile
index 1602eef..8349809 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,13 @@ dev_install:
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
echo "Installation completed successfully"
+clean:
+ rm -rf build
+ rm -rf *.egg-info
+ rm -rf ktransformers/ktransformers_ext/build
+ rm -rf ktransformers/ktransformers_ext/cuda/build
+ rm -rf ktransformers/ktransformers_ext/cuda/dist
+ rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
install_numa:
USE_NUMA=1 make dev_install
install_no_numa:
diff --git a/README.md b/README.md
index 7b539b4..76ad6eb 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,7 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
🔥 Updates
+* **Mar 5, 2025**: Support unsloth 1.58/2.51-bit weights and [IQ1_S/FP8 hybrid](./doc/en/fp8_kernel.md) weights. Support 139K [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 15, 2025**: Longer Context (from 4K to 8K for 24GB VRAM) & Slightly Faster Speed (+15%, up to 16 Tokens/s), update [docs](./doc/en/DeepseekR1_V3_tutorial.md) and [online books](https://kvcache-ai.github.io/ktransformers/).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. For detailed show case and reproduction tutorial, see [here](./doc/en/DeepseekR1_V3_tutorial.md).
diff --git a/doc/README.md b/doc/README.md
index f50e6b7..8bd94a0 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -22,6 +22,7 @@ Our vision for KTransformers is to serve as a flexible platform for experimentin
🔥 Updates
+* **Mar 5, 2025**: Support unsloth 1.58/2.51-bit weights and [IQ1_S/FP8 hybrid](./en/fp8_kernel.md) weights. Support 139K [Longer Context](./en/DeepseekR1_V3_tutorial.md#v022-longer-context) for DeepSeek-V3 and R1 in 24GB VRAM.
* **Feb 25, 2025**: Support [FP8 GPU kernel](./doc/en/fp8_kernel.md) for DeepSeek-V3 and R1; [Longer Context](./doc/en/DeepseekR1_V3_tutorial.md#v022-longer-context).
* **Feb 10, 2025**: Support Deepseek-R1 and V3 on single (24GB VRAM)/multi gpu and 382G DRAM, up to 3~28x speedup. The detailed tutorial is [here](./en/DeepseekR1_V3_tutorial.md).
* **Aug 28, 2024**: Support 1M context under the InternLM2.5-7B-Chat-1M model, utilizing 24GB of VRAM and 150GB of DRAM. The detailed tutorial is [here](./en/long_context_tutorial.md).
diff --git a/doc/en/DeepseekR1_V3_tutorial.md b/doc/en/DeepseekR1_V3_tutorial.md
index 9c031dc..082078d 100644
--- a/doc/en/DeepseekR1_V3_tutorial.md
+++ b/doc/en/DeepseekR1_V3_tutorial.md
@@ -16,7 +16,7 @@
- [Memory consumptions:](#memory-consumptions)
- [Benchmark results](#benchmark-results-2)
- [How to Run](#how-to-run)
- - [V0.2.2 longer context \& FP8 kernel](#v022-longer-context--fp8-kernel)
+ - [v0.2.2 \& v0.2.3 longer context \& FP8 kernel](#v022--v023-longer-context--fp8-kernel)
- [longer context](#longer-context)
- [FP8 kernel](#fp8-kernel)
- [V0.2 \& V0.2.1 Showcase](#v02--v021-showcase)
@@ -157,7 +157,7 @@ the output quality doesn't change. But the speed of decoding and prefill
is speed up which is inspiring. So our showcase makes use of this finding*
## How to Run
-### V0.2.2 longer context & FP8 kernel
+### v0.2.2 & v0.2.3 longer context & FP8 kernel
#### longer context
To use this feature, [install flashinfer](https://github.com/flashinfer-ai/flashinfer) first.
diff --git a/doc/en/FAQ.md b/doc/en/FAQ.md
index a8399bd..e001153 100644
--- a/doc/en/FAQ.md
+++ b/doc/en/FAQ.md
@@ -1,4 +1,18 @@
+
# FAQ
+- [Install](#install)
+ - [Q: ImportError: /lib/x86\_64-linux-gnu/libstdc++.so.6: version GLIBCXX\_3.4.32' not found](#q-importerror-libx86_64-linux-gnulibstdcso6-version-glibcxx_3432-not-found)
+ - [Q: DeepSeek-R1 not outputting initial token](#q-deepseek-r1-not-outputting-initial--token)
+- [Usage](#usage)
+ - [Q: If I got more VRAM than the model's requirement, how can I fully utilize it?](#q-if-i-got-more-vram-than-the-models-requirement-how-can-i-fully-utilize-it)
+ - [Q: If I don't have enough VRAM, but I have multiple GPUs, how can I utilize them?](#q-if-i-dont-have-enough-vram-but-i-have-multiple-gpus-how-can-i-utilize-them)
+ - [Q: How to get the best performance?](#q-how-to-get-the-best-performance)
+ - [Q: My DeepSeek-R1 model is not thinking.](#q-my-deepseek-r1-model-is-not-thinking)
+ - [Q: Loading gguf error](#q-loading-gguf-error)
+ - [Q: Version \`GLIBCXX\_3.4.30' not found](#q-version-glibcxx_3430-not-found)
+ - [Q: When running the bfloat16 moe model, the data shows NaN](#q-when-running-the-bfloat16-moe-model-the-data-shows-nan)
+ - [Q: Using fp8 prefill very slow.](#q-using-fp8-prefill-very-slow)
+ - [Q: Possible ways to run graphics cards using Volta and Turing architectures](#q-possible-ways-to-run-graphics-cards-using-volta-and-turing-architectures)
## Install
### Q: ImportError: /lib/x86_64-linux-gnu/libstdc++.so.6: version GLIBCXX_3.4.32' not found
```
@@ -96,4 +110,58 @@ RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
### Q: Using fp8 prefill very slow.
-The FP8 kernel is build by JIT, so the first run will be slow. The subsequent runs will be faster.
\ No newline at end of file
+The FP8 kernel is built by JIT, so the first run will be slow; subsequent runs will be faster.
+
+### Q: Possible ways to run graphics cards using Volta and Turing architectures
+
+From: https://github.com/kvcache-ai/ktransformers/issues/374
+
+1. First, download the latest source code using git.
+2. Then, modify DeepSeek-V3-Chat-multi-gpu-4.yaml and all related yaml files in the source tree, replacing every instance of KLinearMarlin with KLinearTorch (see the sketch after this list).
+3. Next, compile ktransformers from source until it builds successfully on your local machine.
+4. Then, install flash-attn. It won't actually be used, but skipping it will cause an error.
+5. Then, modify local_chat.py, replacing every instance of flash_attention_2 with eager (see the sketch after this list).
+6. Then, run local_chat.py. Be sure to follow the official tutorial's commands and adjust them to your local machine's parameters.
+7. While it is running, monitor memory usage with the `top` command. The memory attached to a single CPU must be larger than the full model size (with multiple CPUs, each holds its own copy).
+8. Finally, confirm that the model is fully loaded into memory and that the designated weight layers are loaded into GPU memory. Then enter a prompt in the chat interface and check for errors.
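+
+A minimal sketch of the replacements in steps 2 and 5, assuming the optimize rule yaml files live under `ktransformers/optimize/optimize_rules/` and the chat script is `ktransformers/local_chat.py` (adjust the paths to your checkout):
+
+```bash
+# Step 2: fall back from the Marlin GPU linear kernel to the plain torch kernel
+sed -i 's/KLinearMarlin/KLinearTorch/g' ktransformers/optimize/optimize_rules/*.yaml
+
+# Step 5: use the eager attention implementation instead of flash-attention 2
+sed -i 's/flash_attention_2/eager/g' ktransformers/local_chat.py
+```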
+
+Note: for better performance, you can check this [method](https://github.com/kvcache-ai/ktransformers/issues/374#issuecomment-2667520838) described in the issue
+>
+>https://github.com/kvcache-ai/ktransformers/blob/89f8218a2ab7ff82fa54dbfe30df741c574317fc/ktransformers/operators/attention.py#L274-L279
+>
+>```diff
+>+ original_dtype = query_states.dtype
+>+ target_dtype = torch.half
+>+ query_states = query_states.to(target_dtype)
+>+ compressed_kv_with_k_pe = compressed_kv_with_k_pe.to(target_dtype)
+>+ compressed_kv = compressed_kv.to(target_dtype)
+>+ attn_output = attn_output.to(target_dtype)
+>
+>decode_attention_fwd_grouped(query_states, compressed_kv_with_k_pe, compressed_kv, attn_output,
+> page_table,
+> position_ids.squeeze(0).to(torch.int32)+1, attn_logits,
+> 4, #num_kv_splits # follow vLLM, fix it TODO
+> self.softmax_scale,
+> past_key_value.page_size)
+>
+>+ attn_output = attn_output.to(original_dtype)
+>```
+>
+>https://github.com/kvcache-ai/ktransformers/blob/89f8218a2ab7ff82fa54dbfe30df741c574317fc/ktransformers/operators/attention.py#L320-L326
+>
+>```diff
+>- attn_output = flash_attn_func(
+>- query_states,
+>- key_states,
+>- value_states_padded,
+>- softmax_scale=self.softmax_scale,
+>- causal=True,
+>- )
+>+ attn_output = F.scaled_dot_product_attention(
+>+ query_states.transpose(1, 2),
+>+ key_states.transpose(1, 2),
+>+ value_states_padded.transpose(1, 2),
+>+ scale=self.softmax_scale,
+>+ is_causal=True
+>+ ).transpose(1, 2)
+>```
\ No newline at end of file
diff --git a/doc/en/benchmark.md b/doc/en/benchmark.md
index 599aef8..c498d4d 100644
--- a/doc/en/benchmark.md
+++ b/doc/en/benchmark.md
@@ -26,7 +26,7 @@ Given that we have only tested 1,000 cases, which provides only a preliminary ju
## The Result Table
-
+The results below use the DeepSeek-V3 model (rows marked DeepSeek-R1 use R1).
| | | | | | | | |
| ------------------------ | ----------------- | ---------- | ----------------- | ------- | ---------- | ------------------------------------------------------ | ------------ |
| DataSet | CPU Weight Format | CPU Kernel | GPU Weight Format | GEMM Kernel | MLA Kernel | [Siliconflow](https://cloud.siliconflow.cn/models) | Ktrans Point |
@@ -37,9 +37,11 @@ Given that we have only tested 1,000 cases, which provides only a preliminary ju
| 4 | q4km | cpuinfer | q4km->marlin 8 | marlin | triton | 81.6 | 81.1 |
| 5 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 81.6 | 81 |
| 6 | q4km | cpuinfer | fp8 | fp8gemm | triton | 81.6 | 81.5 |
-| MMLU-pro | | | | | | | |
+| 7 (DeepSeek-R1) | iq1 | cpuinfer | fp8 | fp8gemm | triton | 78.6 | 83.6 |
+| MMLU-pro (shuffle 1k) | | | | | | | |
| 1 | q4km | cpuinfer | fp8 | fp8gemm | triton | 57.7 | 57.6 |
| 2 | q4km | cpuinfer | q4km->marlin 4 | marlin | triton | 57.7 | 57.5 |
+| 3 (DeepSeek-R1) | iq1 | cpuinfer | fp8 | fp8gemm | triton | 71.9 | tbd |
| HumanEval | tbd | tbd | tbd | tbd | tbd | tbd | tbd |
| GSM8K | tbd | tbd | tbd | tbd | tbd | tbd | tbd |
@@ -54,6 +56,8 @@ By default, The MLA kernel uses triton in linux and torch in windows. But we nee
4. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). You don't need to change the source code as they both use q4km. But note the yaml file [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L29) and [here](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml#L18), below these lines you need to add `num_bits: 8` (in other words: add this kwargs to all that use `KLinearMarlin`). The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
5. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
6. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
+ 7. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
- MMLU-pro test
1. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
- 2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
\ No newline at end of file
+ 2. [v3-chat_yaml](https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml). No need to change yaml, just use the default. The weight file for q4km is [here](https://huggingface.co/unsloth/DeepSeek-V3-GGUF/tree/main/DeepSeek-V3-Q4_K_M)
+ 3. You should check the [doc](./fp8_kernel.md) to learn how to test this case. This is a mixture tensor case.
\ No newline at end of file
diff --git a/doc/en/install.md b/doc/en/install.md
index 559c2b0..8752fe4 100644
--- a/doc/en/install.md
+++ b/doc/en/install.md
@@ -1,5 +1,11 @@
-
+
# How to Run DeepSeek-R1
+- [Preparation](#preparation)
+- [Installation](#installation)
+ - [Attention](#attention)
+ - [Supported models include:](#supported-models-include)
+ - [Support quantize format:](#support-quantize-format)
+
In this document, we will show you how to install and run KTransformers on your local machine. There are two versions:
* V0.2 is the current main branch.
* V0.3 is a preview version only provides binary distribution for now.
@@ -56,6 +62,8 @@ Some preparation:
- At the same time, you should download and install the corresponding version of flash-attention from https://github.com/Dao-AILab/flash-attention/releases.
## Installation
+### Attention
+If you want to use NUMA support, you not only need to set `USE_NUMA=1` but also need to make sure libnuma-dev is installed (`sudo apt-get install libnuma-dev` may help). A minimal example follows.
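+
+Below is a minimal sketch of a NUMA-enabled build on a Debian/Ubuntu system, assuming a fresh checkout (the `install_numa` Makefile target wraps the same command):
+
+```bash
+# Install the NUMA development headers (package name may differ on other distributions)
+sudo apt-get install libnuma-dev
+
+# Build and install ktransformers with NUMA support enabled
+USE_NUMA=1 make dev_install
+```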