mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-03-14 18:37:23 +00:00
update kt-kernel
This commit is contained in:
54
kt-kernel/.githooks/commit-msg
Executable file
54
kt-kernel/.githooks/commit-msg
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/bin/sh
# commit-msg hook to enforce Conventional Commits (https://www.conventionalcommits.org/)
# Checks the commit message subject (first line) against the project's
# bracketed conventional-commit format.
# If the message does not conform, the hook exits non-zero to block the commit.

# Git passes the path of the commit-message file as $1.
if [ -z "$1" ]; then
    echo "commit-msg hook: no message file provided" >&2
    exit 0
fi

MSG_FILE="$1"
read -r FIRST_LINE < "$MSG_FILE" || FIRST_LINE=""

# Trim leading/trailing whitespace. printf (not echo) so that backslashes
# in the subject are not interpreted by the shell's echo builtin.
FIRST_LINE="$(printf '%s' "$FIRST_LINE" | sed -e 's/^[ \t]*//' -e 's/[ \t]*$//')"

# Allow empty subject (git itself rejects empty messages), and let merge /
# revert / autosquash subjects through -- including the "Merge branch ..."
# and 'Revert "..."' forms that git generates itself.
if [ -z "$FIRST_LINE" ]; then
    exit 0
fi
case "$FIRST_LINE" in
    Merge:*|merge:*|Revert:*|revert:*|"Merge "*|"Revert "*|fixup!*|squash!*)
        exit 0
        ;;
esac

# Conventional Commit regex (POSIX ERE)
# [type(scope)?]!?: subject   or   [type](scope)!?: subject
# types: feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip
# scope: any chars except )
# The scope may appear inside OR after the brackets so that both the
# "[fix(parser)]: ..." example printed below and "[fix](parser): ..."
# are accepted (the previous regex rejected its own example).
regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)(\([^)]+\))?\](\([^)]+\))?(!)?: .+'

if printf '%s' "$FIRST_LINE" | grep -E "$regex" >/dev/null 2>&1; then
    exit 0
fi

cat <<'EOF' >&2
ERROR: Commit message does not follow Conventional Commits.

Expected format:
[type](scope)?: subject

Examples:
[feat]: add new feature
[fix(parser)]: handle edge case
[docs]!: update API docs (breaking change)

Allowed types: feat, fix, docs, style, refactor, perf, test, build, ci, chore, revert, wip

You can bypass this hook locally by running:
git commit --no-verify
EOF

exit 1
|
||||
63
kt-kernel/.githooks/pre-commit
Executable file
63
kt-kernel/.githooks/pre-commit
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
# Pre-commit hook: run clang-format via CMake 'format' target and Black for Python before allowing commit.
# If formatting makes changes, stage them and abort so user can review.
set -euo pipefail

REPO_ROOT="$(git rev-parse --show-toplevel)"
BUILD_DIR="$REPO_ROOT/build"
FORMAT_TARGET="format"
CLANG_FORMAT_BIN="${CLANG_FORMAT_BIN:-clang-format}"
BLACK_BIN="${BLACK_BIN:-black}"

# clang-format optional: if missing, skip C/C++ formatting
if ! command -v "$CLANG_FORMAT_BIN" >/dev/null 2>&1; then
    echo "[pre-commit] clang-format not found (looked for $CLANG_FORMAT_BIN). Skipping C/C++ format." >&2
fi

# black optional: if missing, skip Python formatting
if ! command -v "$BLACK_BIN" >/dev/null 2>&1; then
    echo "[pre-commit] black not found (looked for $BLACK_BIN). Skipping Python format." >&2
fi

# Configure build directory if missing or not yet generated (quiet).
# The braces are required: '&&' and '||' have equal precedence and
# associate left-to-right, so without grouping this condition parsed as
# '(missing-dir OR no-Makefile) AND no-ninja', which skipped configuration
# for a freshly-deleted build dir that never had build.ninja.
if [ ! -d "$BUILD_DIR" ] || { [ ! -f "$BUILD_DIR/Makefile" ] && [ ! -f "$BUILD_DIR/build.ninja" ]; }; then
    echo "[pre-commit] configuring project (cmake) ..." >&2
    cmake -S "$REPO_ROOT" -B "$BUILD_DIR" >/dev/null
fi

# Run clang-format target when available and tool present (prefer ninja if present)
if command -v "$CLANG_FORMAT_BIN" >/dev/null 2>&1; then
    if [ -f "$BUILD_DIR/build.ninja" ]; then
        (cd "$BUILD_DIR" && ninja -k0 "$FORMAT_TARGET" >/dev/null)
    else
        (cd "$BUILD_DIR" && make "$FORMAT_TARGET")
    fi
fi

# Run black on staged python files (or entire repo if you prefer)
if command -v "$BLACK_BIN" >/dev/null 2>&1; then
    # Get staged python files; if none, skip
    PY_FILES=$(git diff --cached --name-only --diff-filter=ACM | grep -E '\.py$' || true)
    if [ -n "$PY_FILES" ]; then
        echo "[pre-commit] running black on staged python files..." >&2
        # Feed one path per line so filenames containing spaces are not
        # split into multiple arguments.
        while IFS= read -r py_file; do
            "$BLACK_BIN" "$py_file"
        done <<< "$PY_FILES"
    else
        # Optionally format all python files; comment out if not desired
        # $BLACK_BIN "$REPO_ROOT"
        :
    fi
fi

# Stage any formatting changes for tracked files
if ! git diff --quiet --exit-code; then
    echo "[pre-commit] Formatting applied; updating index." >&2
    # Add only modified tracked files (exclude untracked new files not staged yet unless user staged them)
    git add -u
    echo "[pre-commit] Re-run git commit to proceed after reviewing changes." >&2
    exit 1
fi

echo "[pre-commit] format OK." >&2
exit 0
|
||||
6
kt-kernel/.gitmodules
vendored
Normal file
6
kt-kernel/.gitmodules
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
[submodule "pybind11"]
|
||||
path = third_party/pybind11
|
||||
url = https://github.com/pybind/pybind11.git
|
||||
[submodule "llama.cpp"]
|
||||
path = third_party/llama.cpp
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
@@ -11,16 +11,34 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
|
||||
|
||||
## Installation
|
||||
|
||||
### Prerequisites
|
||||
|
||||
First, initialize git submodules:
|
||||
```bash
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
### Standard Installation
|
||||
```bash
|
||||
pip install .
|
||||
```
|
||||
|
||||
All dependencies (torch, safetensors, compressed-tensors, numpy) will be automatically installed from `pyproject.toml`.
|
||||
|
||||
### Editable Installation (Development)
|
||||
```bash
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
### Optional: Pre-install Dependencies
|
||||
|
||||
If you encounter network issues or prefer to install dependencies separately, you can optionally use:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
**Note**: This step is **optional**. If your environment already has torch and other required packages, you can skip this and directly run `pip install .`
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
|
||||
@@ -19,10 +19,13 @@ classifiers = [
|
||||
]
|
||||
requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
# Install black by default so git hook can use it
|
||||
# Core dependencies
|
||||
"torch>=2.0.0",
|
||||
"safetensors>=0.4.0",
|
||||
"compressed-tensors>=0.7.0",
|
||||
"numpy>=1.24.0",
|
||||
# Development dependencies
|
||||
"black>=25.9.0",
]
|
||||
|
||||
# No optional dev group needed for formatting; using custom git hooks instead of pre-commit
|
||||
|
||||
12
kt-kernel/requirements.txt
Normal file
12
kt-kernel/requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
# Optional: Install these if not already available in your environment
|
||||
# These dependencies will be automatically installed when running `pip install .`
|
||||
# You can skip this file if you already have these packages installed
|
||||
|
||||
# Core dependencies (minimum versions)
|
||||
torch>=2.0.0
|
||||
safetensors>=0.4.0
|
||||
compressed-tensors>=0.7.0
|
||||
numpy>=1.24.0
|
||||
|
||||
# Development dependencies
|
||||
black>=25.9.0
|
||||
1
kt-kernel/third_party/llamafile/README.md
vendored
Normal file
1
kt-kernel/third_party/llamafile/README.md
vendored
Normal file
@@ -0,0 +1 @@
|
||||
The code in this folder is copied from [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile). Special thanks to the Mozilla-Ocho team.
|
||||
25
kt-kernel/third_party/llamafile/bench.h
vendored
Normal file
25
kt-kernel/third_party/llamafile/bench.h
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/bench.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <stdio.h>

#include "micros.h"  // micros(): microsecond timestamps used for timing

// BENCH(x): time expression `x` and print its mean per-iteration cost.
//
// `x` runs once untimed as a warm-up, then ITERATIONS more times under the
// clock (ITERATIONS must be defined by the includer before using BENCH).
// The empty volatile asm statements are compiler memory barriers: they keep
// the optimizer from hoisting, sinking, or deleting `x` around the timing
// calls. The printed value is the ceiling of the mean wall time in
// microseconds, labeled with the stringized expression.
// NOTE: `x` is evaluated ITERATIONS + 1 times, so it must be repeatable
// and side-effect tolerant.
#define BENCH(x) \
    do { \
        x; \
        __asm__ volatile("" ::: "memory"); \
        long long start = micros(); \
        for (int i = 0; i < ITERATIONS; ++i) { \
            __asm__ volatile("" ::: "memory"); \
            x; \
            __asm__ volatile("" ::: "memory"); \
        } \
        printf("%9lld us %s\n", (micros() - start + ITERATIONS - 1) / ITERATIONS, #x); \
    } while (0)
|
||||
8
kt-kernel/third_party/llamafile/flags.cpp
vendored
Normal file
8
kt-kernel/third_party/llamafile/flags.cpp
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#include "flags.h"

// Definition of the process-wide flag declared in flags.h; defaults to off.
// NOTE(review): presumably selects a more numerically precise (slower)
// compute path in the kernels -- confirm against usage sites.
bool FLAG_precise = false;
||||
8
kt-kernel/third_party/llamafile/flags.h
vendored
Normal file
8
kt-kernel/third_party/llamafile/flags.h
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// NOTE(review): URL above points at flags.cpp; the matching upstream header
// is flags.h -- likely a copy/paste slip.
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

// Process-wide flag; defined (default false) in flags.cpp.
extern bool FLAG_precise;
|
||||
4869
kt-kernel/third_party/llamafile/iqk_mul_mat.inc
vendored
Normal file
4869
kt-kernel/third_party/llamafile/iqk_mul_mat.inc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
vendored
Normal file
8
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_avx2.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// x86-64 only: compile the shared iqk_mul_mat implementation with this
// translation unit's (AVX2) compiler flags; on other targets this TU is
// intentionally empty.
#if defined(__x86_64__) || defined(_M_X64)
#include "iqk_mul_mat.inc"
#endif // __x86_64__
|
||||
10
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_zen4.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// x86-64 only: rename the entry points with a _zen4 suffix so this TU's
// Zen4-flag build of the shared implementation can be linked alongside the
// builds produced by the sibling translation units.
#if defined(__x86_64__) || defined(_M_X64)
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#include "iqk_mul_mat.inc"
#endif // __x86_64__
|
||||
3063
kt-kernel/third_party/llamafile/iqk_mul_mat_arm.inc
vendored
Normal file
3063
kt-kernel/third_party/llamafile/iqk_mul_mat_arm.inc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
10
kt-kernel/third_party/llamafile/iqk_mul_mat_arm82.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/iqk_mul_mat_arm82.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_arm82.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// AArch64 only: rename the entry points with an _arm82 suffix so this TU's
// ARMv8.2-flag build of the shared ARM implementation gets distinct symbols.
#ifdef __aarch64__
#define iqk_mul_mat iqk_mul_mat_arm82
#define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
#include "iqk_mul_mat_arm.inc"
#endif // __aarch64__
|
||||
14
kt-kernel/third_party/llamafile/macros.h
vendored
Normal file
14
kt-kernel/third_party/llamafile/macros.h
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/macros.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

// NOTE: plain function-like macros -- arguments may be evaluated more than
// once, so never pass expressions with side effects (e.g. i++).

// Minimum / maximum of two values.
#define MIN(X, Y) ((Y) > (X) ? (X) : (Y))
#define MAX(X, Y) ((Y) < (X) ? (X) : (Y))
// Integer division of M by N, rounding up (intended for positive values).
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
// Round X up to the next multiple of K; the bitmask form requires K to be
// a power of two.
#define ROUNDUP(X, K) (((X) + (K) - 1) & -(K))
// Element count of a true array. The second divisor deliberately becomes a
// compile-time division by zero if sizeof(A) is not a multiple of the
// element size -- a cheap guard against passing a decayed pointer.
#define ARRAYLEN(A) ((sizeof(A) / sizeof(*(A))) / ((unsigned)!(sizeof(A) % sizeof(*(A)))))
||||
41
kt-kernel/third_party/llamafile/micros.h
vendored
Normal file
41
kt-kernel/third_party/llamafile/micros.h
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/micros.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <ctime>

#ifndef _WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif

#ifdef _WIN32
// Ticks-per-second of the Windows high-resolution performance counter.
static long long GetQueryPerformanceFrequency() {
    LARGE_INTEGER t;
    QueryPerformanceFrequency(&t);
    return t.QuadPart;
}
// Current raw value of the Windows high-resolution performance counter.
static long long GetQueryPerformanceCounter() {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return t.QuadPart;
}
#endif

// Returns a timestamp in microseconds, rounding fractional microseconds up.
// POSIX: wall-clock time since the epoch (CLOCK_REALTIME, so it can jump if
// the system clock is adjusted -- NOTE(review): CLOCK_MONOTONIC would be
// steadier for benchmarking; kept as-is to preserve behavior).
// Windows: time since the first call, via QueryPerformanceCounter.
static long long micros(void) {
#ifndef _WIN32
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    // Widen before multiplying: tv_sec is time_t, which may be 32 bits, and
    // tv_sec * 1000000 in time_t arithmetic would overflow (UB) long before
    // the result is converted to long long.
    return (long long)ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000;
#else
    static long long timer_freq = GetQueryPerformanceFrequency();
    static long long timer_start = GetQueryPerformanceCounter();
    return ((GetQueryPerformanceCounter() - timer_start) * 1000000) / timer_freq;
#endif
}
|
||||
59
kt-kernel/third_party/llamafile/numba.h
vendored
Normal file
59
kt-kernel/third_party/llamafile/numba.h
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/numba.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

// Deterministic pseudo-random generator: the top 32 bits of a 64-bit
// linear congruential generator (Knuth's MMIX constants). Uses static
// state, so the sequence is global and calls are not thread-safe.
inline int rand32(void) {
    static unsigned long long lcg = 1;
    lcg *= 6364136223846793005;
    lcg += 1442695040888963407;
    return lcg >> 32;
}

// Portable population count (number of set bits) via parallel bit summing.
inline int popcount(unsigned x) {
    x = x - ((x >> 1) & 0x55555555);
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = (x + (x >> 16));
    return (x + (x >> 8)) & 0x0000003F;
}

// Number of bit positions at which x and y differ.
inline int hamming(int x, int y) {
    return popcount(x ^ y);
}

// Map a 32-bit random word to a float strictly inside (0,1): the top 23
// bits plus 0.5, scaled by 2^-23.
inline float float01(unsigned x) { // (0,1)
    return 1.f / 8388608 * ((x >> 9) + .5f);
}

// Random float strictly inside (-1,1). (The upstream comment said
// (-10,10), but float01 yields (0,1), so 2*float01 - 1 is (-1,1).)
inline float numba(void) { // (-1,1)
    return float01(rand32()) * 2.f - 1.f;
}

// Fill vector A[0..n) with random values from numba().
template <typename T>
void randomize(T* A, int n) {
    for (int i = 0; i < n; ++i)
        A[i] = numba();
}

// Fill an m x n matrix stored column-major with leading dimension lda
// with random values from numba().
template <typename T>
void randomize(int m, int n, T* A, int lda) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = numba();
}

// Set every element of vector A[0..n) to x.
template <typename T, typename U>
void broadcast(T* A, int n, U x) {
    for (int i = 0; i < n; ++i)
        A[i] = x;
}

// Set every element of an m x n column-major matrix (leading dimension
// lda) to x.
template <typename T, typename U>
void broadcast(int m, int n, T* A, int lda, U x) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = x;
}
|
||||
204
kt-kernel/third_party/llamafile/sgemm.cpp
vendored
Normal file
204
kt-kernel/third_party/llamafile/sgemm.cpp
vendored
Normal file
@@ -0,0 +1,204 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "sgemm.h"
|
||||
// #include <cosmo.h>
|
||||
// #include <cpuid.h>
|
||||
// #include <libc/sysv/consts/hwcap.h>
|
||||
#include <stdio.h>
|
||||
// #include <sys/auxv.h>
|
||||
#include <cassert>
|
||||
// #include "llamafile.h"
|
||||
|
||||
// Function-pointer table bound once during static initialization to the
// best kernel build for the architecture this binary was compiled for.
// NOTE: upstream llamafile chose kernels at RUNTIME with X86_HAVE()/CPUID
// and getauxval(HWCAP) probes (that code is elided here); this port selects
// them at COMPILE time from the compiler's target-ISA macros, so a binary
// built for one microarchitecture will not adapt to another host.
static const struct GemmFuncs {
    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
    // typeof(llamafile_sgemm)* sgemm;
    // typeof(llamafile_mixmul)* mixmul;
    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
    GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
        // AMD Zen4+ (2023-)
        sgemm = llamafile_sgemm_amd_zen4;
        mixmul = llamafile_mixmul_amd_zen4;
        iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
        // Intel Xeon Skylake+ (2015-)
        sgemm = llamafile_sgemm_amd_avx512f;
        mixmul = llamafile_mixmul_amd_avx512f;
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
        // Intel Alderlake (2021-)
        sgemm = llamafile_sgemm_amd_avxvnni;
        mixmul = llamafile_mixmul_amd_avxvnni;
        iqk_mixmul = iqk_mul_mat_moe;
#else
        // Intel Haswell/Broadwell/Skylake (2013-2020)
        // AMD Excavator (2015-2022)
        sgemm = llamafile_sgemm_amd_avx2;
        mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // AMD Piledriver (2011-2014)
        sgemm = llamafile_sgemm_amd_fma;
        mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // Intel Sandybridge/Ivybridge (2010-2012)
        // AMD Bulldozer (2011)
        sgemm = llamafile_sgemm_amd_avx;
        mixmul = llamafile_mixmul_amd_avx;
#endif
#else
        // AMD K8/Barcelona (2003-2010)
        // Intel Core/Nehalem (2006-2009)
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif

#elif defined(__aarch64__)
        // Upstream checked HWCAP fp16/dotprod bits at runtime; this port
        // assumes an ARMv8.2 target (e.g. Apple M1, Raspberry Pi 5).
        sgemm = llamafile_sgemm_arm82;
        mixmul = llamafile_mixmul_arm82;
        iqk_mixmul = iqk_mul_mat_moe_arm82;
#else
        // No handwritten kernels for this architecture.
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif
    }
} funcs;
|
||||
|
||||
/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    // Forward to the architecture-specific kernel chosen by the file-level
    // GemmFuncs table during static initialization.
    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype,
                       precision);
}
|
||||
|
||||
/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 *
 * Forwards to the architecture-specific implementation selected by the
 * file-level GemmFuncs table and returns its boolean result.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    return funcs.mixmul(params, weights, thought, plan, result);
}
|
||||
|
||||
// Thin forwarder to the iqk mixture-of-experts matmul kernel selected by
// the file-level GemmFuncs table; returns its boolean result.
bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) {
    return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth);
}
|
||||
92
kt-kernel/third_party/llamafile/sgemm.h
vendored
Normal file
92
kt-kernel/third_party/llamafile/sgemm.h
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.h
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
#include <stdbool.h>
|
||||
#include <cstddef>
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_tensor;
|
||||
struct ggml_compute_params;
|
||||
#ifdef __aarch64__
|
||||
|
||||
bool iqk_mul_mat(long, long, long, int, const void*, const void*, float*, long, int, int);
|
||||
bool iqk_mul_mat_zen4(long, long, long, int, const void*, const void*, float*, long, int, int);
|
||||
bool iqk_mul_mat_arm82(long, long, long, int, const void*, const void*, float*, long, int, int);
|
||||
|
||||
bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
bool llamafile_sgemm(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_mixmul(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
size_t llamafile_mixmul_needs(const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*);
|
||||
|
||||
bool llamafile_sgemm_unsupported(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_fma(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx2(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avxvnni(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx512f(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_zen4(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm80(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm82(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
|
||||
bool llamafile_mixmul_unsupported(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_fma(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm80(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm82(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_iqk(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
#else
|
||||
|
||||
bool iqk_mul_mat(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
|
||||
bool iqk_mul_mat_zen4(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
|
||||
bool iqk_mul_mat_arm82(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
|
||||
|
||||
|
||||
bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
bool llamafile_sgemm(long m, long n, long k, const void* a, long lda, const void* b, long ldb, void* c, long ldc, int ith, int nth, int task_type, int a_type, int b_type, int c_type, int precision);
|
||||
bool llamafile_mixmul(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
size_t llamafile_mixmul_needs(const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*);
|
||||
|
||||
bool llamafile_sgemm_unsupported(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_fma(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx2(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avxvnni(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx512f(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_zen4(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm80(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm82(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
|
||||
bool llamafile_mixmul_unsupported(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_fma(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm80(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm82(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_iqk(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
1054
kt-kernel/third_party/llamafile/tinyblas_cpu.h
vendored
Normal file
1054
kt-kernel/third_party/llamafile/tinyblas_cpu.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
411
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul.inc
vendored
Normal file
411
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul.inc
vendored
Normal file
@@ -0,0 +1,411 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul.inc
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tinyblas_cpu.h"
|
||||
|
||||
//
|
||||
//
|
||||
// ██████╗ ██╗ █████╗ ██████╗
|
||||
// ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║ ██╔══██╗██╔═══╝
|
||||
// ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║ ███████║██████╗
|
||||
// ██║ ██║██▀███║╚███╔╝██╔══██╗██║ ██╔══██║╔═══██║
|
||||
// ██║ ██║██║ ██║ ███║ ██████╔╝████╗██║ ██║██████║
|
||||
// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝ ╚═╝╚═════╝
|
||||
//
|
||||
// MIXTURE OF EXPERTS TENSOR MULTIPLICATION
|
||||
//
|
||||
//
|
||||
// SHAPES
|
||||
//
|
||||
// - weights [cols, rows, experts]
|
||||
// - thought [cols, tasks, tokens] w/ tasks ≤ thinkers
|
||||
// - result [rows, thinkers, tokens] w/ thinkers ≤ experts
|
||||
// - plan [thinkers, tokens] w/ i32 < experts
|
||||
//
|
||||
// DEFINITION
|
||||
//
|
||||
// for thinker in range(thinkers):
|
||||
// for token in range(tokens):
|
||||
// for row in range(rows):
|
||||
// c = 0
|
||||
// for col in range(cols):
|
||||
// expert = plan[token][thinker]
|
||||
// a = weights[expert][row][col]
|
||||
// b = thought[token][thinker % tasks][col]
|
||||
// c += a * b
|
||||
// result[token][thinker][row] = c
|
||||
//
|
||||
// REGULARITIES
|
||||
//
|
||||
// - tokens can be odd
|
||||
// - thinkers is usually 2
|
||||
// - tasks is usually 1 or 2
|
||||
// - cols should be a multiple of 64
|
||||
// - rows should be a multiple of 64
|
||||
// - experts is usually 8 but could be 60
|
||||
// - tokens is always 1 for token generation
|
||||
// - tokens can be huge for prompt processing
|
||||
//
|
||||
// EXAMPLE
|
||||
//
|
||||
// mixtral 8x7b w/ 217 token prompt
|
||||
//
|
||||
// | ne*0 ne*1 ne*2 ne*3 | nb*0 nb*1 nb*2 nb*3 | type
|
||||
// =========================================================================
|
||||
// weights | 16384 6144 8 1 | 18 0x2400 0x3600000 0x1b000000 | q4_0
|
||||
// thought | 16384 2 217 1 | 4 0x10000 0x20000 0x1b20000 | f32
|
||||
// result | 6144 2 217 1 | 4 0x6000 0xc000 0xa2c000 | f32
|
||||
// plan | 2 217 1 1 | 4 0x20 0x1b20 0x1b20 | i32
|
||||
//
|
||||
|
||||
namespace {

// Orchestrates mixture-of-experts tensor multiplication on CPU.
// Shapes (see header comment above): weights [cols, rows, experts],
// thought [cols, tasks, tokens], plan [thinkers, tokens] (i32 expert ids),
// result [rows, thinkers, tokens]. The work buffer params->wdata is carved
// up via allocate() into a quantized copy of `thought` plus per-expert row
// pointer tables, so each expert's rows can be multiplied as one batch.
class MixMul {
  public:
    // Captures tensor geometry; allocates nothing (see allocate_shared_memory()).
    // ldq is the per-row byte stride of the quantized thought buffer, rounded
    // up to ROW_ALIGN; wdata_ is params->wdata rounded up to MAX_ALIGN.
    MixMul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result)
        : params(params),
          weights(weights),
          thought(thought),
          plan(plan),
          result(result),
          rows(weights->ne[1]),
          cols(weights->ne[0]),
          experts(weights->ne[2]),
          thinkers(plan->ne[0]),
          tasks(thought->ne[1]),
          tokens(thought->ne[2]),
          ldq((cols * 2 + ROW_ALIGN - 1) & -ROW_ALIGN),
          wdata_((char*)(((uintptr_t)params->wdata + MAX_ALIGN - 1) & -MAX_ALIGN)),
          allocated_(0) {
    }

    // Carves the shared work buffer into the four scratch arrays below.
    // Returns false if params->wsize is too small for any of them.
    bool allocate_shared_memory() {
        if (!(quantized_thought_ = allocate<char>(MATRIX_ALIGN, tokens * tasks * ldq)))
            return false;
        if (!(rowptr_result_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_thought_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_count_ = allocate<long>(sizeof(long), experts)))
            return false;
        return true;
    }

    // Total bytes consumed from params->wdata, including the initial
    // MAX_ALIGN rounding — used to size the work buffer up front.
    size_t get_allocated_bytes() {
        return (wdata_ - (char*)params->wdata) + allocated_;
    }

    // Validates tensor geometry/strides, then dispatches on the output type.
    // Returns false when this kernel cannot handle the operands (caller is
    // expected to fall back to the generic path).
    bool mixmul() {
        // invariants
        assert(tasks <= thinkers);
        assert(thinkers <= experts);
        assert(tokens == plan->ne[1]);
        assert(rows == result->ne[0]);
        assert(cols == thought->ne[0]);
        assert(tokens == result->ne[2]);
        assert(thinkers == result->ne[1]);

        // dimensionality
        assert(plan->ne[2] == 1);
        assert(plan->ne[3] == 1);
        assert(result->ne[3] == 1);
        assert(weights->ne[3] == 1);
        assert(thought->ne[3] == 1);

        // miscellaneous
        assert(params->nth > 0);
        assert(params->ith < params->nth);
        assert(plan->type == GGML_TYPE_I32);

        // check nb01 is convertible to lda
        if (weights->nb[1] % ggml_type_size(weights->type))
            return false;

        // no support for column strides
        if (result->nb[0] != ggml_type_size(result->type))
            return false;
        if (thought->nb[0] != ggml_type_size(thought->type))
            return false;
        if (weights->nb[0] != ggml_type_size(weights->type))
            return false;

        // supported output types
        switch (result->type) {
            case GGML_TYPE_F32:
                return mixmuler<float>();
            default:
                return false;
        }
    }

  private:
    // Selects a tinyBLAS kernel instantiation for the weight/activation type
    // pair under the compile-time ISA (AVX-512/AVX/SSE/NEON). KN is the SIMD
    // vector width, BS the block size (32 for Q4_0/Q8_0 blocks, 1 otherwise).
    template <typename TC>
    bool mixmuler() {
        switch (weights->type) {
            case GGML_TYPE_F32:
                if (thought->type != GGML_TYPE_F32)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1, tinyBLAS<NCB | NCC, 16, __m512, __m512, float, float, TC>, float,
                              float, TC>();
#elif defined(__AVX__) || defined(__AVX2__)
                return mixmat<8, 1, tinyBLAS<NCB | NCC, 8, __m256, __m256, float, float, TC>, float,
                              float, TC>();
#elif defined(__SSE__)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, __m128, __m128, float, float, TC>, float,
                              float, TC>();
#elif defined(__ARM_NEON)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, float, float, TC>,
                              float, float, TC>();
#else
                return false;
#endif

            case GGML_TYPE_BF16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_BF16)
                    return false;
#if defined(__AVX512BF16__)
                // FLAG_precise selects the f32-accumulate variant over the
                // faster __m512bh dot-product variant.
                if (!FLAG_precise) {
                    return mixmat<
                        32, 1, tinyBLAS<NCB | NCC, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC>,
                        ggml_bf16_t, ggml_bf16_t, TC>();
                } else {
                    return mixmat<16, 1,
                                  tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                                  ggml_bf16_t, ggml_bf16_t, TC>();
                }
#elif defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__AVX2__)
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_bf16_t, ggml_bf16_t, TC>,
                    ggml_bf16_t, ggml_bf16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_F16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_F16)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
                // NOTE(review): upstream gated this on a runtime X86_CHECK(F16C);
                // here the check is compile-time only (see commented code).
                // if (X86_CHECK(F16C)) {
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
                // } else {
                //     return false;
                // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
                if (result->op_params[0] == GGML_PREC_F32) {
                    return mixmat<
                        4, 1,
                        tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                } else {
                    return mixmat<
                        8, 1,
                        tinyBLAS<NCB | NCC, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                    ggml_fp16_t, ggml_fp16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q4_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q8_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#else
                return false;
#endif

            default:
                return false;
        }
    }

    // Two-phase driver: in the INIT task every thread cooperatively quantizes
    // `thought` (if needed) and builds the per-expert row pointer tables; in
    // the COMPUTE task each expert's gathered rows are multiplied with one
    // tinyBLAS call (row pointers passed in place of dense matrices, so lda
    // for B and C is 0 and the pointer tables do the indirection).
    template <int KN, int BS, typename BLAS, typename TA, typename TB, typename TC>
    bool mixmat() {
        if (cols % KN)
            return false;
        switch (params->type) {
            case GGML_TASK_TYPE_INIT:
                if (thought->type != ggml_type_trait<TB>::id)
                    quantize_thought(ggml_type_trait<TB>::id);
                build_row_pointers(ggml_type_trait<TB>::id);
                return true;
            case GGML_TASK_TYPE_COMPUTE:
                assert(!(cols % BS));
                assert(!(weights->nb[1] % sizeof(TA)));
                for (int expert = 0; expert < experts; ++expert) {
                    BLAS tb{cols / BS,
                            (const TA*)((const char*)weights->data + expert * weights->nb[2]),
                            (long)(weights->nb[1] / sizeof(TA)),
                            (const TB*)(rowptr_thought_ + expert * tokens * thinkers),
                            0,
                            (TC*)(rowptr_result_ + expert * tokens * thinkers),
                            0,
                            params->ith,
                            params->nth};
                    tb.matmul(rows, rowptr_count_[expert], GGML_TASK_TYPE_COMPUTE);
                }
                return true;
            default:
                return true;
        }
    }

    // For each expert (experts are striped across threads), scans the plan
    // and records, per matching (token, thinker) slot, the addresses of the
    // result row and of the corresponding thought row (original tensor when
    // already in vec_dot_type, otherwise the quantized scratch copy).
    // rowptr_count_[expert] receives the number of rows gathered.
    void build_row_pointers(ggml_type vec_dot_type) {
        for (int expert = params->ith; expert < experts; expert += params->nth) {
            long count = 0;
            for (long token = 0; token < tokens; ++token)
                for (int thinker = 0; thinker < thinkers; ++thinker)
                    if (expert == *(const int32_t*)((const char*)plan->data +
                                                    token * plan->nb[1] + thinker * plan->nb[0])) {
                        long row = count++;
                        long idx = expert * thinkers * tokens + row;
                        rowptr_result_[idx] =
                            (uintptr_t)((char*)result->data + token * result->nb[2] +
                                        thinker * result->nb[1]);
                        if (thought->type == vec_dot_type)
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)thought->data + token * thought->nb[2] +
                                            thinker % tasks * thought->nb[1]);
                        else
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)quantized_thought_ + token * tasks * ldq +
                                            thinker % tasks * ldq);
                    }
            rowptr_count_[expert] = count;
        }
    }

    // Quantizes every (token, task) row of `thought` (assumed f32 source)
    // into quantized_thought_, round-robining rows across threads.
    void quantize_thought(ggml_type vec_dot_type) {
        long chore = 0;
        for (long token = 0; token < tokens; ++token)
            for (int task = 0; task < tasks; ++task)
                if (chore++ % params->nth == params->ith)
                    quantize_row(quantized_thought_ + token * tasks * ldq + task * ldq,
                                 (const float*)((const char*)thought->data +
                                                token * thought->nb[2] + task * thought->nb[1]),
                                 vec_dot_type);
    }

    // Converts one row of f32 activations to the kernel's vec-dot type.
    // Only the types reachable from mixmuler() are handled.
    void quantize_row(void* dst, const float* src, ggml_type type) {
        assert((long)ggml_row_size(type, cols) <= ldq);
        switch (type) {
            case GGML_TYPE_F16:
                ggml_fp32_to_fp16_row(src, (ggml_fp16_t*)dst, cols);
                break;
            case GGML_TYPE_BF16:
                ggml_fp32_to_bf16_row(src, (ggml_bf16_t*)dst, cols);
                break;
            case GGML_TYPE_Q8_0:
                quantize_row_q8_0((const float*)src, (block_q8_0*)dst, cols);
                break;
            default:
                GGML_UNREACHABLE();
        }
    }

    // Bump allocator over wdata_: rounds the cursor up to `align`, reserves
    // sizeof(T)*elems bytes, and returns nullptr if that would exceed
    // params->wsize (the `toto >= allocated_` test also guards against
    // arithmetic wraparound of the cursor).
    template <typename T>
    T* allocate(size_t align, size_t elems) {
        T* res = nullptr;
        size_t need = sizeof(T) * elems;
        size_t base = allocated_;
        base += align - 1;
        base &= -align;
        size_t toto = base + need;
        if (toto >= allocated_ && toto <= params->wsize) {
            res = (T*)(wdata_ + base);
            allocated_ = toto;
        }
        return res;
    }

    // operands (borrowed; not owned)
    const ggml_compute_params* const params;
    const ggml_tensor* const weights;
    const ggml_tensor* const thought;
    const ggml_tensor* const plan;
    ggml_tensor* const result;
    // geometry captured from the tensors at construction
    const long rows;
    const long cols;
    const int experts;
    const int thinkers;
    const int tasks;
    const long tokens;
    const long ldq;

    // variables
    char* const wdata_;
    size_t allocated_;

    // shared memory
    long* rowptr_count_ /*[experts]*/;
    char* quantized_thought_ /*[tokens][tasks][cols][2]*/;
    uintptr_t* rowptr_result_ /*[experts][tokens*thinkers]*/;
    uintptr_t* rowptr_thought_ /*[experts][tokens*thinkers]*/;
};

}  // namespace
|
||||
|
||||
/**
|
||||
* Performs "mixture of experts" tensor multiplication on CPU.
|
||||
*/
|
||||
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
|
||||
MixMul mm{params, weights, thought, plan, result};
|
||||
return mm.allocate_shared_memory() && mm.mixmul();
|
||||
}
|
||||
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
vendored
Normal file
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avx
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
|
||||
/**
|
||||
* Returns number of shared memory bytes llamafile_mixmul() needs.
|
||||
*/
|
||||
/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 *
 * Uses a dummy ggml_compute_params with a huge wsize and a fake aligned
 * wdata base so MixMul's bump allocator can run a dry pass; returns 0 if
 * even that oversized buffer cannot satisfy the request.
 *
 * Fix: restores `&params` — the source had been mangled into `¶ms`
 * (the "&para" prefix was swallowed as an HTML entity), which does not
 * compile.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    // result tensor is not needed for sizing, so pass null
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}
|
||||
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avx2
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avx512f
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avxvnni
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_fma
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_zen4
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp
vendored
Normal file
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm80.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#ifdef __aarch64__
|
||||
#define llamafile_mixmul llamafile_mixmul_arm80
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
|
||||
/**
|
||||
* Returns number of shared memory bytes llamafile_mixmul() needs.
|
||||
*/
|
||||
/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 *
 * Runs MixMul's bump allocator in a dry pass against a dummy
 * ggml_compute_params (huge wsize, fake aligned wdata base) and reports
 * how many bytes it consumed; returns 0 on failure.
 *
 * Fix: restores `&params` — the source had been mangled into `¶ms`
 * (the "&para" prefix was swallowed as an HTML entity), which does not
 * compile.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    // result tensor is not needed for sizing, so pass null
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}
|
||||
|
||||
#endif // __aarch64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm82.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#ifdef __aarch64__
|
||||
#define llamafile_mixmul llamafile_mixmul_arm82
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __aarch64__
|
||||
361
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm.inc
vendored
Normal file
361
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm.inc
vendored
Normal file
@@ -0,0 +1,361 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tinyblas_cpu.h"
|
||||
|
||||
//
|
||||
//
|
||||
// ██████╗ ██╗ █████╗ ██████╗
|
||||
// ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║ ██╔══██╗██╔═══╝
|
||||
// ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║ ███████║██████╗
|
||||
// ██║ ██║██▀███║╚███╔╝██╔══██╗██║ ██╔══██║╔═══██║
|
||||
// ██║ ██║██║ ██║ ███║ ██████╔╝████╗██║ ██║██████║
|
||||
// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝ ╚═╝╚═════╝
|
||||
//
|
||||
// BASIC LINEAR ALGEBRA SUBPROGRAMS
|
||||
//
|
||||
//
|
||||
// This file implements multithreaded CPU matrix multiplication for the
|
||||
// common contiguous use case C = Aᵀ * B. These kernels are designed to
|
||||
// have excellent performance[1] for matrices that fit in the CPU cache
|
||||
// without imposing any overhead such as cache filling or malloc calls.
|
||||
//
|
||||
// This implementation does not guarantee any upper bound with rounding
|
||||
// errors, which grow along with k. Our goal's to maximally exploit the
|
||||
// hardware for performance, and then use whatever resources remain for
|
||||
// improving numerical accuracy.
|
||||
//
|
||||
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
|
||||
// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
|
||||
|
||||
namespace {

// Dispatches C = Aᵀ * B onto a tinyBLAS kernel chosen by the A/B element
// types (Atype/Btype) and the ISA selected at compile time. Return values:
// true on success, NOT_SUPPORTED when no kernel matches (caller falls back),
// WANT_QUANTIZATION when the caller should first quantize B from f32 into
// the kernel's vec-dot type and call again. `task` is forwarded to
// tinyBLAS::matmul; `precision` selects the f32-accumulate fp16 path on ARM.
// Ctype is accepted for signature parity but unused here (C's element type
// comes from the TC template parameter).
template <typename TC>
bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    switch (Atype) {
        case GGML_TYPE_F32: {
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
#if defined(__AVX512F__)
            // k must be a multiple of the SIMD width for each variant below
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX__) || defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON)
            if (k % 4)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
            if (k % 32)
                return NOT_SUPPORTED;
            // matvec (n < 2): mixed bf16*f32 kernel avoids quantizing B
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_BF16)
                return NOT_SUPPORTED;
            // FLAG_precise selects f32 accumulation over __m512bh dot products
            if (!FLAG_precise) {
                tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_F16: {
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            // matvec (n < 2): mixed fp16*f32 kernel avoids quantizing B
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
            // NOTE(review): upstream gated this on a runtime X86_CHECK(F16C);
            // here the check is compile-time only (see commented code).
            // if (X86_CHECK(F16C)) {
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
            // } else {
            //     return NOT_SUPPORTED;
            // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (precision == GGML_PREC_F32) {
                // f32-accumulate variant: fp16 weights, f32 activations
                if (k % 4)
                    return NOT_SUPPORTED;
                if (Btype != GGML_TYPE_F32)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                if (k % 8)
                    return NOT_SUPPORTED;
                if (Btype == GGML_TYPE_F32)
                    return WANT_QUANTIZATION;
                if (Btype != GGML_TYPE_F16)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q8_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q4_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        default:
            return NOT_SUPPORTED;
    }

    // Unreachable after the exhaustive switch above; these casts exist only
    // to suppress unused-parameter warnings on configurations where some
    // branches are preprocessed away.
    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)Atype;
    (void)Btype;
    (void)precision;
}

}  // namespace
|
||||
|
||||
/**
|
||||
* Performs optimized matrix multiplication on CPU.
|
||||
*
|
||||
* This subroutine may compute C = Aᵀ * B with column major ordering.
|
||||
* Despite its name, this isn't a generalized implementation. Work is
|
||||
* only performed when a handwritten kernel is written and available.
|
||||
* Otherwise the caller should fall back to a general matmul routine.
|
||||
*
|
||||
* For example, for single-threaded single-precision GEMM you can say
|
||||
*
|
||||
* llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1,
|
||||
* GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32,
|
||||
* GGML_PREC_DEFAULT);
|
||||
*
|
||||
* @param m is rows in `A` and `C`
|
||||
* @param n is cols in `B` and `C`
|
||||
* @param k is cols in `A` and rows in `B`
|
||||
* @param A is first input matrix (always transposed)
|
||||
* @param lda is row stride of `A`
|
||||
* @param B is second input matrix (never transposed)
|
||||
* @param ldb is row stride of `B`
|
||||
* @param C is input/output array of output matrices
|
||||
* @param ldc is row stride of `C`
|
||||
* @param ith is thread id (must be less than `nth`)
|
||||
* @param nth is number of threads (must be greater than zero)
|
||||
* @param Atype is GGML data type of `A`
|
||||
* @param Btype is GGML data type of `B`
|
||||
* @param Ctype is GGML data type of `C`
|
||||
* @param precision may be used to control the internal compute type
|
||||
* @return true if this function was able to service the matmul request
|
||||
*/
|
||||
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    // Sanity-check dimensions, row strides, and the thread id / thread count
    // supplied by the caller before dispatching to any kernel.
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);

#if QK_K == 256
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
    /* moonll: accept more Btype combinations by trying the iqk fast path first */

    // Fast path: hand the request to iqk_mul_mat whenever the output is F32.
    // On success we are done; otherwise fall through to the generic dispatch
    // at the bottom of this function.
    if (Ctype == GGML_TYPE_F32){
        if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A,lda,Btype, B,ldb, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }

#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
    // NOTE(review): these aarch64 calls pass fewer arguments than the x86
    // iqk_mul_mat call above (no lda/Btype/ldb) — confirm they match the
    // iqk_mul_mat prototype actually compiled for aarch64 builds.
    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
        if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
        // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
        // All of these block formats pack 32 elements per block, so k*QK8_0
        // is the element count regardless of which of them B actually uses.
        assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
        if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
#endif
#endif

    // Generic path: only F32 output has a tinyBLAS implementation; any other
    // Ctype is reported as unsupported so the caller can fall back.
    switch (Ctype) {
        case GGML_TYPE_F32:
            return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype,
                                        Btype, Ctype, precision);
        default:
            return NOT_SUPPORTED;
    }
}
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avx
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avx2
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avx512f
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avxvnni
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_fma
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_zen4
|
||||
#define iqk_mul_mat iqk_mul_mat_zen4
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// // Adapted from
|
||||
// // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm80.cpp
|
||||
// // Copyright 2024 Mozilla Foundation.
|
||||
// // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// #ifdef __aarch64__
|
||||
// #define llamafile_sgemm llamafile_sgemm_arm80
|
||||
// #include "tinyblas_cpu_sgemm.inc"
|
||||
// #endif // __aarch64__
|
||||
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm82.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#ifdef __aarch64__
|
||||
#define llamafile_sgemm llamafile_sgemm_arm82
|
||||
#define iqk_mul_mat iqk_mul_mat_arm82
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __aarch64__
|
||||
39
kt-kernel/third_party/llamafile/tinyblas_cpu_unsupported.cpp
vendored
Normal file
39
kt-kernel/third_party/llamafile/tinyblas_cpu_unsupported.cpp
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_unsupported.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "sgemm.h"
|
||||
|
||||
// Fallback for builds targeting a CPU with none of the SIMD feature sets this
// library provides kernels for. It never services a request: returning false
// tells the caller to use a generic matmul implementation instead.
bool llamafile_sgemm_unsupported(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    // Explicitly discard every parameter — mirrors the (void) idiom used by
    // the real implementation and keeps -Wunused-parameter builds quiet.
    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)task;
    (void)Atype;
    (void)Btype;
    (void)Ctype;
    (void)precision;
    return false;
}
|
||||
|
||||
// Fallback mixture-of-experts matmul for unsupported CPUs: never services the
// request, so the caller falls back to the generic ggml implementation.
bool llamafile_mixmul_unsupported(const struct ggml_compute_params* params,
                                  const struct ggml_tensor* weights,
                                  const struct ggml_tensor* thought,
                                  const struct ggml_tensor* plan,
                                  struct ggml_tensor* result) {
    // All inputs are intentionally ignored; discard them explicitly to match
    // the (void) idiom used elsewhere in this library.
    (void)params;
    (void)weights;
    (void)thought;
    (void)plan;
    (void)result;
    return false;
}
|
||||
|
||||
// Stub counterpart of the iqk MoE matmul for CPUs without supported SIMD
// kernels. Parameters are deliberately unnamed (valid in C++, and it avoids
// unused-parameter warnings); it always returns false so callers take a
// generic fallback path.
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int) {
    return false;
}
|
||||
Reference in New Issue
Block a user